diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 68aff793ae6a..77ee313687fc 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -5,11 +5,11 @@ import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB -# Note that we have 400 MiB quota, please use it wisely. -# See https://github.com/pypi/support/issues/3792 . +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB +# Note that we have 800 MiB quota, please use it wisely. +# See https://github.com/pypi/support/issues/6326 . # Please also sync the value with the one in Dockerfile. -VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400)) +VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500)) def print_top_10_largest_files(zip_file): diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 7045d8810493..bbed80ebe847 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -8,7 +8,8 @@

    <h1>Links for vLLM</h1>

-    <a href="/{wheel_html_escaped}">{wheel}</a><br/>
+    <a href="/{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+    <a href="/{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
""" @@ -21,7 +22,25 @@ with open("index.html", "w") as f: print(f"Generated index.html for {args.wheel}") + # sync the abi tag with .buildkite/scripts/upload-wheels.sh + if "x86_64" in filename: + x86_wheel = filename + arm_wheel = filename.replace("x86_64", "aarch64").replace( + "manylinux1", "manylinux2014" + ) + elif "aarch64" in filename: + x86_wheel = filename.replace("aarch64", "x86_64").replace( + "manylinux2014", "manylinux1" + ) + arm_wheel = filename + else: + raise ValueError(f"Unsupported wheel: {filename}") # cloudfront requires escaping the '+' character f.write( - template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) + template.format( + x86_wheel=x86_wheel, + x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"), + arm_wheel=arm_wheel, + arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"), + ) ) diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml new file mode 100644 index 000000000000..ccb4f84201b7 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml @@ -0,0 +1,12 @@ +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8 +model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +backend: "vllm-vlm" +tasks: +- name: "chartqa" + metrics: + - name: "relaxed_accuracy,none" + # TODO(zhewenl): model card is 0.90, but the actual score is 0.80. + value: 0.80 +limit: 100 +num_fewshot: 0 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml new file mode 100644 index 000000000000..46f1a9fbf6ff --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml @@ -0,0 +1,10 @@ +# For hf script, without -t option (tensor parallel size). 
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5 +model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +tasks: +- name: "mmlu_pro" + metrics: + - name: "exact_match,custom-extract" + value: 0.80 +limit: 250 # will run on 250 * 14 subjects = 3500 samples +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml index a2f235f48581..aa4fb9fa03d6 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml @@ -1,4 +1,5 @@ -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 +# For vllm script, with -t option (tensor parallel size) +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" tasks: - name: "gsm8k" diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml new file mode 100644 index 000000000000..5f3c31743e75 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1 + +model_name: "Qwen/Qwen2.5-VL-7B-Instruct" +backend: "vllm-vlm" +tasks: +- name: "chartqa" + metrics: + - name: "relaxed_accuracy,none" + value: 0.855 +limit: 2500 +num_fewshot: 0 diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt new file mode 100644 index 000000000000..4fb0b84bc4d8 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt index 27a1a9a82bd3..37eeac85c933 100644 --- a/.buildkite/lm-eval-harness/configs/models-large.txt +++ b/.buildkite/lm-eval-harness/configs/models-large.txt @@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml Mixtral-8x7B-Instruct-v0.1.yaml Qwen2-57B-A14-Instruct.yaml DeepSeek-V2-Lite-Chat.yaml -Meta-Llama-3-8B-QQQ.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt new file mode 100644 index 000000000000..91e22b6459c1 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-mm-small.txt b/.buildkite/lm-eval-harness/configs/models-mm-small.txt new file mode 100644 index 000000000000..1097d220245f --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt @@ -0,0 +1 @@ +Qwen2.5-VL-7B-Instruct.yaml \ No newline at end of file diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh new file mode 100755 index 000000000000..c8db951381b0 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -0,0 +1,44 @@ 
+#!/bin/bash +# We can use this script to compute baseline accuracy on chartqa for vllm. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.9 + +usage() { + echo`` + echo "Runs lm eval harness on ChartQA using multimodal vllm." + echo "This pathway is intended to be used to create baselines for " + echo "our correctness tests in vllm's CI." + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -l - limit number of samples to run" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:l:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm-vlm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \ + --tasks chartqa \ + --batch_size auto \ + --apply_chat_template \ + --limit $LIMIT diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh old mode 100644 new mode 100755 index a67fc89d54e6..897f84d1e360 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on GSM for transformers. # # Make sure you have lm-eval-harness installed: -# pip install lm-eval==0.4.4 +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index b98d42aa7b82..792f355c47a5 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install lm-eval==0.4.4 +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh new file mode 100644 index 000000000000..d85a1721db9a --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on MMLUPRO for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] + +usage() { + echo`` + echo "Runs lm eval harness on MMLU Pro using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ + --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size auto diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index ceea01166b7f..f10de82b1d8e 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -19,21 +19,27 @@ def launch_lm_eval(eval_config, tp_size): trust_remote_code = eval_config.get("trust_remote_code", False) max_model_len = eval_config.get("max_model_len", 4096) + batch_size = eval_config.get("batch_size", "auto") + backend = eval_config.get("backend", "vllm") model_args = ( f"pretrained={eval_config['model_name']}," f"tensor_parallel_size={tp_size}," f"enforce_eager=true," f"add_bos_token=true," f"trust_remote_code={trust_remote_code}," - f"max_model_len={max_model_len}" + f"max_model_len={max_model_len}," ) results = lm_eval.simple_evaluate( - model="vllm", + model=backend, model_args=model_args, tasks=[task["name"] for task in eval_config["tasks"]], num_fewshot=eval_config["num_fewshot"], limit=eval_config["limit"], - batch_size="auto", + # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help + # text models. however, this is regressing measured strict-match for + # existing text models in CI, so only apply it for mm. + apply_chat_template=backend == "vllm-vlm", + batch_size=batch_size, ) return results diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index cdf6a645147e..e6f5c8b60f45 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm. - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. -See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. +See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. ## Performance benchmark quick overview @@ -28,6 +28,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc ## Trigger the benchmark Performance benchmark will be triggered when: + - A PR being merged into vllm. - Every commit for those PRs with `perf-benchmarks` label AND `ready` label. @@ -38,6 +39,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh ``` Runtime environment variables: + - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). 
@@ -46,12 +48,14 @@ Runtime environment variables: - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. Nightly benchmark will be triggered when: + - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. ## Performance benchmark details See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. +> ### Latency test Here is an example of one test inside `latency-tests.json`: @@ -74,7 +78,7 @@ Here is an example of one test inside `latency-tests.json`: In this example: - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. -- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` +- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. @@ -82,13 +86,13 @@ WARNING: The benchmarking script will save json results by itself, so please do ### Throughput test -The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. +The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`. The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. ### Serving test -We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: +We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. 
The corresponding parameters are in `serving-tests.json`, and here is an example: ```json [ @@ -100,7 +104,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -118,8 +121,8 @@ Inside this example: - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. - The `server-parameters` includes the command line arguments for vLLM server. -- The `client-parameters` includes the command line arguments for `benchmark_serving.py`. -- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py` +- The `client-parameters` includes the command line arguments for `vllm bench serve`. +- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve` The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly. @@ -135,27 +138,20 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`. When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`. -`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. +`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. +If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead. -Here is an example using the script to compare result_a and result_b without detail test name. -`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name` +Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps. +`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` -| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | -|----|----------------------------------------|----------------------------------------|----------| -| 0 | 142.633982 | 156.526018 | 1.097396 | -| 1 | 241.620334 | 294.018783 | 1.216863 | -| 2 | 218.298905 | 262.664916 | 1.203235 | -| 3 | 242.743860 | 299.816190 | 1.235113 | +| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | +|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------| +| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 | +| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 | -Here is an example using the script to compare result_a and result_b with detail test name. 
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` -| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio | -|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------| -| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 | -| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 | -| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 | -| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 | -| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 | +A comparison diagram will be generated below the table. +Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3 +image ## Nightly test details @@ -164,9 +160,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript ### Workflow - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. -- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. -- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. -- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. +- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container. +- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`. +- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. ### Nightly tests @@ -176,6 +172,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. -WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. +WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`. WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). 
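As context for the `compare-json-results.py` workflow described in the README diff above, here is a minimal standalone sketch of the same idea: align two `benchmark_results.json` files on the test name and report a throughput ratio. This is not the CI script itself; the result paths are placeholders, and only the column names (`Test name`, `Output Tput (tok/s)`) are taken from the converter's output.

```python
# Illustrative sketch only: compare the serving throughput recorded in two
# benchmark_results.json files (the result paths below are placeholders).
import pandas as pd

metric = "Output Tput (tok/s)"

baseline = pd.read_json("results_a/benchmark_results.json", orient="records")
candidate = pd.read_json("results_b/benchmark_results.json", orient="records")

# Keep only rows that report the chosen metric (serving results), then align by test name.
a = baseline.dropna(subset=[metric])[["Test name", metric]]
b = candidate.dropna(subset=[metric])[["Test name", metric]]
merged = a.merge(b, on="Test name", suffixes=("_a", "_b"))
merged["perf_ratio"] = merged[f"{metric}_b"] / merged[f"{metric}_a"]
print(merged.to_string(index=False))
```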
diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md index ef11c040057c..466def07b6f1 100644 --- a/.buildkite/nightly-benchmarks/nightly-annotation.md +++ b/.buildkite/nightly-benchmarks/nightly-annotation.md @@ -1,3 +1,4 @@ +# Nightly benchmark annotation ## Description @@ -13,15 +14,15 @@ Please download the visualization scripts in the post - Find the docker we use in `benchmarking pipeline` - Deploy the docker, and inside the docker: - - Download `nightly-benchmarks.zip`. - - In the same folder, run the following code: - - ```bash - export HF_TOKEN= - apt update - apt install -y git - unzip nightly-benchmarks.zip - VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - ``` + - Download `nightly-benchmarks.zip`. + - In the same folder, run the following code: + + ```bash + export HF_TOKEN= + apt update + apt install -y git + unzip nightly-benchmarks.zip + VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh + ``` And the results will be inside `./benchmarks/results`. diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 5f003f42f07c..2ef36089b6af 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -8,30 +8,30 @@ This benchmark aims to: Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end. -Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176) +Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176) ## Setup - Docker images: - - vLLM: `vllm/vllm-openai:v0.6.2` - - SGLang: `lmsysorg/sglang:v0.3.2-cu121` - - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` - - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` - - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.* - - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. + - vLLM: `vllm/vllm-openai:v0.6.2` + - SGLang: `lmsysorg/sglang:v0.3.2-cu121` + - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` + - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` + - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.* + - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. - Hardware - - 8x Nvidia A100 GPUs + - 8x Nvidia A100 GPUs - Workload: - - Dataset - - ShareGPT dataset - - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output) - - Decode-heavy dataset (in average 462 input tokens, 256 output tokens) - - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. - - Models: llama-3 8B, llama-3 70B. - - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). - - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. - - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. - - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). 
+ - Dataset + - ShareGPT dataset + - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output) + - Decode-heavy dataset (in average 462 input tokens, 256 output tokens) + - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. + - Models: llama-3 8B, llama-3 70B. + - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). + - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. + - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. + - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). ## Known issues diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md index a1f8441ccdac..8bb16bd3cf37 100644 --- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md +++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md @@ -1,3 +1,4 @@ +# Performance benchmarks descriptions ## Latency tests diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 20c106234935..5ea5a50a258a 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -1,33 +1,202 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse +import json +import os +from importlib import util import pandas as pd +plotly_found = util.find_spec("plotly.express") is not None + def compare_data_columns( - files, name_column, data_column, drop_column, ignore_test_name=False + files, name_column, data_column, info_cols, drop_column, debug=False ): - print("\ncompare_data_column: " + data_column) + """ + Align concatenation by keys derived from info_cols instead of row order. + - Pick one canonical key list: subset of info_cols present in ALL files. + - For each file: set index to those keys, aggregate duplicates + - (mean for metric, first for names). + - Concat along axis=1 (indexes align), then reset_index so callers can + - group by columns. + - If --debug, add a _name column per file. + """ + print("\ncompare_data_column:", data_column) + frames = [] + raw_data_cols = [] compare_frames = [] + + # 1) choose a canonical key list from info_cols that exists in ALL files + cols_per_file = [] + for f in files: + try: + df_tmp = pd.read_json(f, orient="records") + except Exception as err: + raise ValueError(f"Failed to read {f}") from err + cols_per_file.append(set(df_tmp.columns)) + + key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] + if not key_cols: + # soft fallback: use any info_cols present in the first file + key_cols = [c for c in info_cols if c in list(cols_per_file[0])] + if not key_cols: + raise ValueError( + "No common key columns found from info_cols across the input files." 
+ ) + + # 2) build a single "meta" block (keys as columns) once, aligned by the key index + meta_added = False + for file in files: - data_df = pd.read_json(file) - serving_df = data_df.dropna(subset=[drop_column], ignore_index=True) - if ignore_test_name is False: - serving_df = serving_df.rename(columns={name_column: file + "_name"}) - frames.append(serving_df[file + "_name"]) - serving_df = serving_df.rename(columns={data_column: file}) - frames.append(serving_df[file]) - compare_frames.append(serving_df[file]) + df = pd.read_json(file, orient="records") + + # Keep rows that actually have the compared metric (same as original behavior) + if drop_column in df.columns: + df = df.dropna(subset=[drop_column], ignore_index=True) + + # Stabilize numeric key columns (harmless if missing) + for c in ( + "Input Len", + "Output Len", + "TP Size", + "PP Size", + "# of max concurrency.", + "qps", + ): + if c in df.columns: + df[c] = pd.to_numeric(df[c], errors="coerce") + + # Ensure all key columns exist + for c in key_cols: + if c not in df.columns: + df[c] = pd.NA + + # Set index = key_cols and aggregate duplicates → unique MultiIndex + df_idx = df.set_index(key_cols, drop=False) + + # meta (key columns), unique per key + meta = df_idx[key_cols] + if not meta.index.is_unique: + meta = meta.groupby(level=key_cols, dropna=False).first() + + # metric series for this file, aggregated to one row per key + file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) + s = df_idx[data_column] + if not s.index.is_unique: + s = s.groupby(level=key_cols, dropna=False).mean() + s.name = file_label # column label like original + + # add meta once (from first file) so keys are the leftmost columns + if not meta_added: + frames.append(meta) + meta_added = True + + # (NEW) debug: aligned test-name column per file + if debug and name_column in df_idx.columns: + name_s = df_idx[name_column] + if not name_s.index.is_unique: + name_s = name_s.groupby(level=key_cols, dropna=False).first() + name_s.name = f"{file_label}_name" + frames.append(name_s) + + frames.append(s) + raw_data_cols.append(file_label) + compare_frames.append(s) + + # Generalize ratio: for any file N>=2, add ratio (fileN / file1) if len(compare_frames) >= 2: - # Compare numbers among two files - ratio_df = compare_frames[1] / compare_frames[0] - frames.append(ratio_df) - compare_frames.pop(1) + base = compare_frames[0] + current = compare_frames[-1] + ratio = current / base + ratio = ratio.mask(base == 0) # avoid inf when baseline is 0 + ratio.name = f"Ratio 1 vs {len(compare_frames)}" + frames.append(ratio) + # 4) concat on columns with aligned MultiIndex; + # then reset_index to return keys as columns concat_df = pd.concat(frames, axis=1) - return concat_df + concat_df = concat_df.reset_index(drop=True).reset_index() + if "index" in concat_df.columns: + concat_df = concat_df.drop(columns=["index"]) + + # Ensure key/info columns appear first (in your info_cols order) + front = [c for c in info_cols if c in concat_df.columns] + rest = [c for c in concat_df.columns if c not in front] + concat_df = concat_df[front + rest] + + print(raw_data_cols) + return concat_df, raw_data_cols + + +def split_json_by_tp_pp( + input_file: str = "benchmark_results.json", output_root: str = "." +) -> list[str]: + """ + Split a benchmark JSON into separate folders by (TP Size, PP Size). + + Creates: /tp{TP}_pp{PP}/benchmark_results.json + Returns: list of file paths written. 
+ """ + # Load JSON data into DataFrame + with open(input_file, encoding="utf-8") as f: + data = json.load(f) + + # If the JSON is a dict with a list under common keys, use that list + if isinstance(data, dict): + for key in ("results", "serving_results", "benchmarks", "data"): + if isinstance(data.get(key), list): + data = data[key] + break + + df = pd.DataFrame(data) + + # Keep only "serving" tests + name_col = next( + (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None + ) + if name_col: + df = df[ + df[name_col].astype(str).str.contains(r"serving", case=False, na=False) + ].copy() + + # Handle alias column names + rename_map = { + "tp_size": "TP Size", + "tensor_parallel_size": "TP Size", + "pp_size": "PP Size", + "pipeline_parallel_size": "PP Size", + } + df.rename( + columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True + ) + + # Ensure TP/PP columns exist (default to 1 if missing) + if "TP Size" not in df.columns: + df["TP Size"] = 1 + if "PP Size" not in df.columns: + df["PP Size"] = 1 + + # make sure TP/PP are numeric ints with no NaN + df["TP Size"] = ( + pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int) + ) + df["PP Size"] = ( + pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int) + ) + + # Split into separate folders + saved_paths: list[str] = [] + for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False): + folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}") + os.makedirs(folder_name, exist_ok=True) + filepath = os.path.join(folder_name, "benchmark_results.json") + group_df.to_json(filepath, orient="records", indent=2, force_ascii=False) + print(f"Saved: {filepath}") + saved_paths.append(filepath) + + return saved_paths if __name__ == "__main__": @@ -36,31 +205,103 @@ def compare_data_columns( "-f", "--file", action="append", type=str, help="input file name" ) parser.add_argument( - "--ignore_test_name", action="store_true", help="ignore_test_name or not" + "--debug", action="store_true", help="show all information for debugging" + ) + parser.add_argument( + "--plot", + action=argparse.BooleanOptionalAction, + default=True, + help="plot perf diagrams or not --no-plot --plot", + ) + parser.add_argument( + "-x", + "--xaxis", + type=str, + default="# of max concurrency.", + help="column name to use as X Axis in comparison graph", ) args = parser.parse_args() - files = args.file - print("comparing : " + ", ".join(files)) drop_column = "P99" name_column = "Test name" + info_cols = [ + "Model", + "Dataset Name", + "Input Len", + "Output Len", + "TP Size", + "PP Size", + "# of max concurrency.", + "qps", + ] data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] html_msgs_for_data_cols = [ "Compare Output Tokens /n", "Median TTFT /n", "Median TPOT /n", ] - ignore_test_name = args.ignore_test_name + + if len(args.file) == 1: + files = split_json_by_tp_pp(args.file[0], output_root="splits") + info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")] + else: + files = args.file + print("comparing : " + ", ".join(files)) + debug = args.debug + plot = args.plot + # For Plot feature, assign y axis from one of info_cols + y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6 with open("perf_comparison.html", "w") as text_file: for i in range(len(data_cols_to_compare)): - output_df = compare_data_columns( + output_df, raw_data_cols = compare_data_columns( files, name_column, data_cols_to_compare[i], + 
info_cols, drop_column, - ignore_test_name=ignore_test_name, + debug=debug, ) - print(output_df) - html = output_df.to_html() - text_file.write(html_msgs_for_data_cols[i]) - text_file.write(html) + + # For Plot feature, insert y axis from one of info_cols + raw_data_cols.insert(0, info_cols[y_axis_index]) + + filtered_info_cols = info_cols[:-2] + existing_group_cols = [ + c for c in filtered_info_cols if c in output_df.columns + ] + if not existing_group_cols: + raise ValueError( + f"No valid group-by columns " + f"Expected subset: {filtered_info_cols}, " + f"but DataFrame has: {list(output_df.columns)}" + ) + output_df_sorted = output_df.sort_values(by=existing_group_cols) + output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) + for name, group in output_groups: + html = group.to_html() + text_file.write(html_msgs_for_data_cols[i]) + text_file.write(html) + + if plot and plotly_found: + import plotly.express as px + + df = group[raw_data_cols] + df_sorted = df.sort_values(by=info_cols[y_axis_index]) + # Melt DataFrame for plotting + df_melted = df_sorted.melt( + id_vars=info_cols[y_axis_index], + var_name="Configuration", + value_name=data_cols_to_compare[i], + ) + title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] + # Create Plotly line chart + fig = px.line( + df_melted, + x=info_cols[y_axis_index], + y=data_cols_to_compare[i], + color="Configuration", + title=title, + markers=True, + ) + # Export to HTML + text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 724b53056ca8..a655a650cb32 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,17 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse import json import os +import shlex from importlib import util from pathlib import Path +from typing import Any import pandas as pd import psutil +import regex as re from tabulate import tabulate -results_folder = Path("results/") - # latency results and the keys that will be printed into markdown latency_results = [] latency_column_mapping = { @@ -42,13 +44,22 @@ serving_results = [] serving_column_mapping = { "test_name": "Test name", + "model_id": "Model", + "dataset_name": "Dataset Name", + "input_len": "Input Len", + "output_len": "Output Len", + "tp_size": "TP Size", + "pp_size": "PP Size", + "dtype": "dtype", "gpu_type": "GPU", "completed": "# of req.", + "qps": "qps", + "max_concurrency": "# of max concurrency.", "request_throughput": "Tput (req/s)", "total_token_throughput": "Total Token Tput (tok/s)", "output_throughput": "Output Tput (tok/s)", - "total_input_tokens": "Total input tokens", - "total_output_tokens": "Total output tokens", + # "total_input_tokens": "Total input tokens", + # "total_output_tokens": "Total output tokens", "mean_ttft_ms": "Mean TTFT (ms)", "median_ttft_ms": "Median TTFT (ms)", "p99_ttft_ms": "P99 TTFT (ms)", @@ -93,15 +104,111 @@ def get_size_with_unit(bytes, suffix="B"): bytes /= factor +def _coerce(val: str) -> Any: + """Best-effort type coercion from string to Python types.""" + low = val.lower() + if low == "null": + return None + if low == "true": + return True + if low == "false": + return False + # integers + if re.fullmatch(r"[+-]?\d+", 
val): + try: + return int(val) + except ValueError: + pass + # floats (keep 'inf'/'-inf'/'nan' as strings) + if re.fullmatch(r"[+-]?\d*\.\d+", val): + try: + return float(val) + except ValueError: + pass + return val + + +def parse_client_command(cmd: str) -> dict[str, Any]: + """Parse the client_command shell string into {executable, script, args}.""" + toks = shlex.split(cmd) + if len(toks) < 2: + raise ValueError("client_command must include an executable and a script") + executable, script = toks[0], toks[1] + args: dict[str, Any] = {} + + i = 2 + while i < len(toks): + t = toks[i] + if t.startswith("--"): + # --key=value or --key (value) or boolean flag + if "=" in t: + key, val = t.split("=", 1) + if key == "--metadata": + md = {} + if val: + if "=" in val: + k, v = val.split("=", 1) + md[k] = _coerce(v) + else: + md[val] = True + args[key] = md + else: + args[key] = _coerce(val) + i += 1 + continue + + key = t + + # Special: consume metadata k=v pairs until next --flag + if key == "--metadata": + i += 1 + md = {} + while i < len(toks) and not toks[i].startswith("--"): + pair = toks[i] + if "=" in pair: + k, v = pair.split("=", 1) + md[k] = _coerce(v) + else: + md[pair] = True + i += 1 + args[key] = md + continue + + # Standard: check if next token is a value (not a flag) + if i + 1 < len(toks) and not toks[i + 1].startswith("--"): + args[key] = _coerce(toks[i + 1]) + i += 2 + else: + # lone flag -> True + args[key] = True + i += 1 + else: + # unexpected positional; skip + i += 1 + + return {"executable": executable, "script": script, "args": args} + + if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-r", + "--result", + type=str, + default="results", + help="Folder name for benchmark output results.", + ) + args = parser.parse_args() + results_folder = Path(args.result) + if not results_folder.exists(): + raise FileNotFoundError(f"results folder does not exist: {results_folder}") # collect results for test_file in results_folder.glob("*.json"): with open(test_file) as f: raw_result = json.loads(f.read()) if "serving" in str(test_file): - # this result is generated via `benchmark_serving.py` - + # this result is generated via `vllm bench serve` command # attach the benchmarking command to raw_result try: with open(test_file.with_suffix(".commands")) as f: @@ -109,18 +216,50 @@ def get_size_with_unit(bytes, suffix="B"): except OSError as e: print(e) continue - + # Parse Server Command Arg + out: dict[str, Any] = { + "server_command": parse_client_command(command["server_command"]) + } + parse_args = [ + "--tensor-parallel-size", + "--pipeline-parallel-size", + "--dtype", + ] + col_mapping = ["tp_size", "pp_size", "dtype"] + for index, arg in enumerate(parse_args): + if arg in out["server_command"]["args"]: + raw_result.update( + {col_mapping[index]: out["server_command"]["args"][arg]} + ) + + # Parse Client Command Arg + out: dict[str, Any] = { + "client_command": parse_client_command(command["client_command"]) + } + parse_args = [ + "--dataset-name", + "--random-input-len", + "--random-output-len", + "--request-rate", + ] + col_mapping = ["dataset_name", "input_len", "output_len", "qps"] + + for index, arg in enumerate(parse_args): + if arg in out["client_command"]["args"]: + raw_result.update( + {col_mapping[index]: out["client_command"]["args"][arg]} + ) + # Add Server, Client command raw_result.update(command) # update the test name of this result raw_result.update({"test_name": test_file.stem}) - # add the result to raw_result 
serving_results.append(raw_result) continue elif "latency" in f.name: - # this result is generated via `benchmark_latency.py` + # this result is generated via `vllm bench latency` command # attach the benchmarking command to raw_result try: @@ -148,7 +287,7 @@ def get_size_with_unit(bytes, suffix="B"): continue elif "throughput" in f.name: - # this result is generated via `benchmark_throughput.py` + # this result is generated via `vllm bench throughput` command # attach the benchmarking command to raw_result try: @@ -204,7 +343,10 @@ def get_size_with_unit(bytes, suffix="B"): columns=latency_column_mapping ) if not serving_results.empty: - serving_results = serving_results[list(serving_column_mapping.keys())].rename( + valid_columns = [ + col for col in serving_column_mapping if col in serving_results.columns + ] + serving_results = serving_results[valid_columns].rename( columns=serving_column_mapping ) if not throughput_results.empty: @@ -226,7 +368,7 @@ def get_size_with_unit(bytes, suffix="B"): # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", # we want to turn it into "8xGPUTYPE" df["GPU"] = df["GPU"].apply( - lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}" + lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}" ) # get markdown tables @@ -244,7 +386,9 @@ def get_size_with_unit(bytes, suffix="B"): ) # document the result - with open(results_folder / "benchmark_results.md", "w") as f: + md_file = "benchmark_results.md" + json_file = "benchmark_results.json" + with open(results_folder / md_file, "w") as f: results = read_markdown( "../.buildkite/nightly-benchmarks/" + "performance-benchmarks-descriptions.md" @@ -259,7 +403,7 @@ def get_size_with_unit(bytes, suffix="B"): f.write(results) # document benchmarking results in json - with open(results_folder / "benchmark_results.json", "w") as f: + with open(results_folder / json_file, "w") as f: results = ( latency_results.to_dict(orient="records") + throughput_results.to_dict(orient="records") diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh index fb5063db8694..ebacdcbd6821 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh +++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh @@ -181,18 +181,14 @@ launch_vllm_server() { if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') - server_command="python3 \ - -m vllm.entrypoints.openai.api_server \ + server_command="vllm serve $model \ -tp $tp \ - --model $model \ --port $port \ $server_args" else echo "Key 'fp8' does not exist in common params." 
- server_command="python3 \ - -m vllm.entrypoints.openai.api_server \ + server_command="vllm serve $model \ -tp $tp \ - --model $model \ --port $port \ $server_args" fi diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 4d01a314adc4..a00de940cbbb 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -73,7 +73,7 @@ get_current_llm_serving_engine() { echo "Container: vllm" # move to a completely irrelevant directory, to avoid import vllm from current folder export CURRENT_LLM_SERVING_ENGINE=vllm - + return fi } @@ -95,12 +95,14 @@ json2args() { } kill_gpu_processes() { - pkill -f python - pkill -f python3 - pkill -f tritonserver - pkill -f pt_main_thread - pkill -f text-generation - pkill -f lmdeploy + pkill -f '[p]ython' + pkill -f '[p]ython3' + pkill -f '[t]ritonserver' + pkill -f '[p]t_main_thread' + pkill -f '[t]ext-generation' + pkill -f '[l]mdeploy' + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pkill -f '[V]LLM' while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 @@ -125,7 +127,7 @@ ensure_installed() { } run_serving_tests() { - # run serving tests using `benchmark_serving.py` + # run serving tests using `vllm bench serve` command # $1: a json file specifying serving test cases local serving_test_file @@ -225,7 +227,7 @@ run_serving_tests() { if [[ "$dataset_name" = "sharegpt" ]]; then - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --backend $backend \ --tokenizer /tokenizer_cache \ --model $model \ @@ -246,7 +248,7 @@ run_serving_tests() { sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') - client_command="python3 benchmark_serving.py \ + client_command="vllm bench serve \ --backend $backend \ --tokenizer /tokenizer_cache \ --model $model \ @@ -265,13 +267,13 @@ run_serving_tests() { $client_args" else - + echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." exit 1 fi - + echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" @@ -302,7 +304,7 @@ run_serving_tests() { } run_genai_perf_tests() { - # run genai-perf tests + # run genai-perf tests # $1: a json file specifying genai-perf test cases local genai_perf_test_file @@ -311,14 +313,14 @@ run_genai_perf_tests() { # Iterate over genai-perf tests jq -c '.[]' "$genai_perf_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - + test_name=$(echo "$params" | jq -r '.test_name') + # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." 
continue fi - + # prepend the current serving engine to the test name test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} @@ -369,10 +371,10 @@ run_genai_perf_tests() { qps=$num_prompts echo "now qps is $qps" fi - + new_test_name=$test_name"_qps_"$qps backend=$CURRENT_LLM_SERVING_ENGINE - + if [[ "$backend" == *"vllm"* ]]; then backend="vllm" fi @@ -380,7 +382,7 @@ run_genai_perf_tests() { client_command="genai-perf profile \ -m $model \ --service-kind openai \ - --backend vllm \ + --backend "$backend" \ --endpoint-type chat \ --streaming \ --url localhost:$port \ @@ -413,7 +415,7 @@ prepare_dataset() { do cat sonnet.txt >> sonnet_4x.txt done - + } main() { diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index f05040618981..c64e5638029e 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -33,7 +33,7 @@ check_gpus() { check_cpus() { # check the number of CPUs and NUMA Node and GPU type. - declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)") + declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') if [[ $numa_count -gt 0 ]]; then echo "NUMA found." echo $numa_count @@ -126,7 +126,8 @@ kill_gpu_processes() { ps -aux lsof -t -i:8000 | xargs -r kill -9 pgrep python3 | xargs -r kill -9 - + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 # wait until GPU memory usage smaller than 1GB if command -v nvidia-smi; then @@ -164,7 +165,7 @@ upload_to_buildkite() { } run_latency_tests() { - # run latency tests using `benchmark_latency.py` + # run latency tests using `vllm bench latency` command # $1: a json file specifying latency test cases local latency_test_file @@ -193,9 +194,11 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ];then - if [[ $numa_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + if [ "$ON_CPU" == "1" ]; then + pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size') + world_size=$(($tp*$pp)) + if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then + echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." continue fi else @@ -205,7 +208,7 @@ run_latency_tests() { fi fi - latency_command=" $latency_envs python3 benchmark_latency.py \ + latency_command=" $latency_envs vllm bench latency \ --output-json $RESULTS_FOLDER/${test_name}.json \ $latency_args" @@ -231,7 +234,7 @@ run_latency_tests() { } run_throughput_tests() { - # run throughput tests using `benchmark_throughput.py` + # run throughput tests using `vllm bench throughput` # $1: a json file specifying throughput test cases local throughput_test_file @@ -260,9 +263,11 @@ run_throughput_tests() { # check if there is enough GPU to run the test tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ];then - if [[ $numa_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." 
+ if [ "$ON_CPU" == "1" ]; then + pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size') + world_size=$(($tp*$pp)) + if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then + echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." continue fi else @@ -272,7 +277,7 @@ run_throughput_tests() { fi fi - throughput_command=" $throughput_envs python3 benchmark_throughput.py \ + throughput_command=" $throughput_envs vllm bench throughput \ --output-json $RESULTS_FOLDER/${test_name}.json \ $throughput_args" @@ -297,7 +302,7 @@ run_throughput_tests() { } run_serving_tests() { - # run serving tests using `benchmark_serving.py` + # run serving tests using `vllm bench serve` command # $1: a json file specifying serving test cases local serving_test_file @@ -328,12 +333,21 @@ run_serving_tests() { qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') echo "Running over qps list $qps_list" + max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') + if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then + num_prompts=$(echo "$client_params" | jq -r '.num_prompts') + max_concurrency_list="[$num_prompts]" + fi + max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') + echo "Running over max concurrency list $max_concurrency_list" # check if there is enough resources to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ];then - if [[ $numa_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + if [ "$ON_CPU" == "1" ]; then + pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size') + world_size=$(($tp*$pp)) + if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then + echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." 
continue fi else @@ -351,8 +365,7 @@ run_serving_tests() { continue fi - server_command="$server_envs python3 \ - -m vllm.entrypoints.openai.api_server \ + server_command="$server_envs vllm serve \ $server_args" # run the server @@ -389,35 +402,39 @@ run_serving_tests() { echo "now qps is $qps" fi - new_test_name=$test_name"_qps_"$qps - - # pass the tensor parallel size to the client so that it can be displayed - # on the benchmark dashboard - client_command="python3 benchmark_serving.py \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --metadata "tensor_parallel_size=$tp" \ - $client_args $client_remote_args " - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - bash -c "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - + # iterate over different max_concurrency + for max_concurrency in $max_concurrency_list; do + new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency + echo " new test name $new_test_name" + # pass the tensor parallel size to the client so that it can be displayed + # on the benchmark dashboard + client_command="vllm bench serve \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --max-concurrency $max_concurrency \ + --metadata "tensor_parallel_size=$tp" \ + $client_args $client_remote_args " + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + bash -c "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done done # clean up @@ -437,17 +454,12 @@ main() { fi check_hf_token - # Set to v1 to run v1 benchmark - if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then - export VLLM_USE_V1=1 - fi - # dependencies (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) (which lsof) || (apt-get update && apt-get install -y lsof) - # get the current IP address, required by benchmark_serving.py + # get the current IP address, required by `vllm bench serve` command export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') # turn of the reporting of the status of each request, to clean up the terminal output export VLLM_LOGGING_LEVEL="WARNING" diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json index edbe9f2df0ce..afb844880f9f 100644 --- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -11,9 +11,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json index da93fdd1dbac..569117aae852 100644 --- 
a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json @@ -6,7 +6,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "num_iters_warmup": 5, @@ -20,7 +20,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "num_iters_warmup": 5, diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index fda1a7a3ec53..423a3bfe1267 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -35,9 +35,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -90,9 +88,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -145,9 +141,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -197,9 +191,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -251,9 +243,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -305,9 +295,7 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json new file mode 100644 index 000000000000..f758097e098e --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -0,0 +1,610 @@ +[ + { + "test_name": "serving_llama8B_bf16_tp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_bf16_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + 
"VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_bf16_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_bf16_tp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_bf16_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": 
"", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_bf16_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + 
"max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + 
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", 
+ "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json new file mode 100644 index 000000000000..ce396d6e54f2 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -0,0 +1,820 @@ +[ + { + "test_name": "serving_llama8B_bf16_pp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_bf16_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 
64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_bf16_pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_bf16_pp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + 
"random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_bf16_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_bf16_pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_pp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + 
"block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int8_pp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 
1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int8_tp2pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, 
+ "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_pp1_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_tp2pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + 
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_int4_pp1_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": 
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, + { + "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 2, + "pipeline_parallel_size": 3, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index 22f71c993ff3..e21c8df0a9fe 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -2,104 +2,112 @@ { "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp2_sharegpt", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": 
"./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp4_sharegpt", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp4_random_1024_128", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -107,32 +115,34 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 1024, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 100, "num_prompts": 100 } }, { "test_name": "serving_llama8B_pp6_random_1024_128", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 6, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -140,18 +150,18 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 1024, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 100, "num_prompts": 100 } } diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json index 13fd5aa8db97..a6d4141d5c2d 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json 
@@ -7,7 +7,6 @@ "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -26,7 +25,6 @@ "tensor_parallel_size": 4, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -45,7 +43,6 @@ "tensor_parallel_size": 2, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -60,8 +57,7 @@ "test_name": "serving_llama70B_tp4_sharegpt_specdecode", "qps_list": [2], "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "disable_log_requests": "", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, "swap_space": 16, "speculative_config": { diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json index f159c30637d3..48c015aa8403 100644 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json @@ -6,7 +6,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -21,7 +21,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml deleted file mode 100644 index d5cad1c73c6f..000000000000 --- a/.buildkite/pyproject.toml +++ /dev/null @@ -1,46 +0,0 @@ -# This local pyproject file is part of the migration from yapf to ruff format. -# It uses the same core rules as the main pyproject.toml file, but with the -# following differences: -# - ruff line length is overridden to 88 -# - deprecated typing ignores (UP006, UP035) have been removed - -[tool.ruff] -line-length = 88 - -[tool.ruff.lint.per-file-ignores] -"vllm/third_party/**" = ["ALL"] -"vllm/version.py" = ["F401"] -"vllm/_version.py" = ["ALL"] - -[tool.ruff.lint] -select = [ - # pycodestyle - "E", - # Pyflakes - "F", - # pyupgrade - "UP", - # flake8-bugbear - "B", - # flake8-simplify - "SIM", - # isort - "I", - # flake8-logging-format - "G", -] -ignore = [ - # star imports - "F405", "F403", - # lambda expression assignment - "E731", - # Loop control variable not used within loop body - "B007", - # f-string format - "UP032", - # Can remove once 3.10+ is the minimum Python version - "UP007", -] - -[tool.ruff.format] -docstring-code-format = true diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 6314afd65234..5bc59c151565 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,5 +1,36 @@ steps: + # aarch64 + CUDA builds. 
PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 + - label: "Build arm64 wheel - CUDA 12.9" + depends_on: ~ + id: build-wheel-arm64-cuda-12-9 + agents: + queue: arm64_cpu_queue_postmerge + commands: + # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: + # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + + # aarch64 build. + - label: "Build arm64 CPU wheel" + depends_on: ~ + id: build-wheel-arm64-cpu + agents: + queue: arm64_cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + - label: "Build wheel - CUDA 12.8" + depends_on: ~ id: build-wheel-cuda-12-8 agents: queue: cpu_queue_postmerge @@ -12,6 +43,7 @@ steps: DOCKER_BUILDKIT: "1" - label: "Build wheel - CUDA 12.6" + depends_on: ~ id: build-wheel-cuda-12-6 agents: queue: cpu_queue_postmerge @@ -23,44 +55,61 @@ steps: env: DOCKER_BUILDKIT: "1" - # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. - # However, this block can be uncommented to save some compute hours. - # - block: "Build CUDA 11.8 wheel" - # key: block-build-cu118-wheel - - - label: "Build wheel - CUDA 11.8" - # depends_on: block-build-cu118-wheel - id: build-wheel-cuda-11-8 + # x86 + CUDA builds + - label: "Build wheel - CUDA 12.9" + depends_on: ~ + id: build-wheel-cuda-12-9 agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
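      # (The CUDA 12.9 x86_64 wheel built above appears to take the place of the
      # CUDA 11.8 wheel whose build block is removed in this hunk; the remaining
      # steps of the job are unchanged.)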
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - - block: "Build release image" + - label: "Build release image (x86)" depends_on: ~ - key: block-release-image-build - - - label: "Build release image" - depends_on: block-release-image-build - id: build-release-image + id: build-release-image-x86 agents: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" + # re-tag to default image tag and push, just in case arm64 build fails + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 + - label: "Build release image (arm64)" + depends_on: ~ + id: build-release-image-arm64 + agents: + queue: arm64_cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." 
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" + + # Add job to create multi-arch manifest + - label: "Create multi-arch manifest" + depends_on: + - build-release-image-x86 + - build-release-image-arm64 + id: create-multi-arch-manifest + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend" + - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - label: "Annotate release workflow" depends_on: - - build-release-image + - create-multi-arch-manifest - build-wheel-cuda-12-8 - - build-wheel-cuda-12-6 - - build-wheel-cuda-11-8 id: annotate-release-workflow agents: queue: cpu_queue_postmerge @@ -107,18 +156,46 @@ steps: env: DOCKER_BUILDKIT: "1" - - block: "Build Neuron release image" - key: block-neuron-release-image-build + - block: "Build arm64 CPU release image" + key: block-arm64-cpu-release-image-build depends_on: ~ - - label: "Build and publish Neuron release image" - depends_on: block-neuron-release-image-build + - label: "Build and publish arm64 CPU release image" + depends_on: block-arm64-cpu-release-image-build agents: - queue: neuron-postmerge + queue: arm64_cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest" - - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." 
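# [Editor's note] `buildkite-agent meta-data get release-version` reads the release
# version stored in the build's meta-data, so the arm64 CPU image built above is pushed
# twice below: once as the moving :latest tag and once as the pinned :<release-version>
# tag, mirroring the retired Neuron image publishing flow it replaces.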
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest" + - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)" + env: + DOCKER_BUILDKIT: "1" + + - label: "Build and publish nightly multi-arch image to DockerHub" + depends_on: + - create-multi-arch-manifest + if: build.env("NIGHTLY") == "1" + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64" + - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64" + - "docker push vllm/vllm-openai:nightly-x86_64" + - "docker push vllm/vllm-openai:nightly-aarch64" + - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend" + - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend" + - "docker manifest push vllm/vllm-openai:nightly" + - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" + # Clean up old nightly builds (keep only last 14) + - "bash .buildkite/scripts/cleanup-nightly-builds.sh" + plugins: + - docker-login#v3.0.0: + username: vllmbot + password-env: DOCKERHUB_TOKEN env: DOCKER_BUILDKIT: "1" + DOCKERHUB_USERNAME: "vllmbot" diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 94e0ac2398f3..fde48603ad3c 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF To download the wheel: \`\`\` aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . + aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . -aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . 
\`\`\` To download and upload the image: \`\`\` -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai -docker tag vllm/vllm-openai vllm/vllm-openai:latest -docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION} -docker push vllm/vllm-openai:latest -docker push vllm/vllm-openai:v${RELEASE_VERSION} +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 +docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 +docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 +docker push vllm/vllm-openai:latest-x86_64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 +docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 +docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +docker push vllm/vllm-openai:latest-aarch64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 + +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend +docker manifest push vllm/vllm-openai:latest +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} \`\`\` EOF \ No newline at end of file diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh new file mode 100755 index 000000000000..f02a128c6772 --- /dev/null +++ b/.buildkite/scripts/cleanup-nightly-builds.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +set -ex + +# Clean up old nightly builds from DockerHub, keeping only the last 14 builds +# This script uses DockerHub API to list and delete old tags with "nightly-" prefix + +# DockerHub API endpoint for vllm/vllm-openai repository +REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" + +# Get DockerHub credentials from environment +if [ -z "$DOCKERHUB_TOKEN" ]; then + echo "Error: DOCKERHUB_TOKEN environment variable is not set" + exit 1 +fi + +if [ -z "$DOCKERHUB_USERNAME" ]; then + echo "Error: DOCKERHUB_USERNAME environment variable is not set" + exit 1 +fi + +# Get DockerHub bearer token +echo "Getting DockerHub bearer token..." 
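# [Editor's note, illustrative only] POSTing {"username", "password"} to
# https://hub.docker.com/v2/users/login returns JSON of the form {"token": "<jwt>"};
# the jq -r '.token' below extracts that short-lived bearer token, and the set +x / set -x
# pair around the call keeps credentials out of the xtrace output. A read-only sanity
# check of a freshly obtained token:
#   curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
#     "https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags?page_size=1" | jq '.count'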
+set +x +BEARER_TOKEN=$(curl -s -X POST \ + -H "Content-Type: application/json" \ + -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \ + "https://hub.docker.com/v2/users/login" | jq -r '.token') +set -x + +if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then + echo "Error: Failed to get DockerHub bearer token" + exit 1 +fi + +# Function to get all tags from DockerHub +get_all_tags() { + local page=1 + local all_tags="" + + while true; do + set +x + local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \ + "$REPO_API_URL?page=$page&page_size=100") + set -x + + # Get both last_updated timestamp and tag name, separated by | + local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"') + + if [ -z "$tags" ]; then + break + fi + + all_tags="$all_tags$tags"$'\n' + page=$((page + 1)) + done + + # Sort by timestamp (newest first) and extract just the tag names + echo "$all_tags" | sort -r | cut -d'|' -f2 +} + +delete_tag() { + local tag_name="$1" + echo "Deleting tag: $tag_name" + + local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name" + set +x + local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url") + set -x + + if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then + echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')" + else + echo "Successfully deleted tag: $tag_name" + fi +} + +# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first) +echo "Fetching all tags from DockerHub..." +all_tags=$(get_all_tags) + +if [ -z "$all_tags" ]; then + echo "No tags found to clean up" + exit 0 +fi + +# Count total tags +total_tags=$(echo "$all_tags" | wc -l) +echo "Found $total_tags tags" + +# Keep only the last 14 builds (including the current one) +tags_to_keep=14 +tags_to_delete=$((total_tags - tags_to_keep)) + +if [ $tags_to_delete -le 0 ]; then + echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)" + exit 0 +fi + +echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep" + +# Get tags to delete (skip the first $tags_to_keep tags) +tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1))) + +if [ -z "$tags_to_delete_list" ]; then + echo "No tags to delete" + exit 0 +fi + +# Delete old tags +echo "Deleting old tags..." 
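# [Editor's note, illustrative only] Because get_all_tags sorts newest-first,
# `tail -n +$((tags_to_keep + 1))` above starts printing at line 15, skipping the 14
# newest nightly tags and selecting only the older ones for deletion. Worked example
# with 20 tags:
#   total_tags=20; tags_to_keep=14
#   tags_to_delete=$((total_tags - tags_to_keep))        # 6
#   echo "$all_tags" | tail -n +$((tags_to_keep + 1))    # lines 15..20, the 6 oldest tags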
+while IFS= read -r tag; do + if [ -n "$tag" ]; then + delete_tag "$tag" + # Add a small delay to avoid rate limiting + sleep 1 + fi +done <<< "$tags_to_delete_list" + +echo "Cleanup completed successfully" diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 5e5a532cb57d..aa4cc7b35a54 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} fi -if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then - commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"} -fi - if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} fi @@ -121,7 +117,6 @@ fi if [[ $commands == *" kernels/quantization"* ]]; then commands="${commands} \ --ignore=kernels/quantization/test_int8_quant.py \ - --ignore=kernels/quantization/test_aqlm.py \ --ignore=kernels/quantization/test_machete_mm.py \ --ignore=kernels/quantization/test_block_fp8.py \ --ignore=kernels/quantization/test_block_int8.py \ @@ -165,16 +160,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then --ignore=entrypoints/llm/test_chat.py \ --ignore=entrypoints/llm/test_accuracy.py \ --ignore=entrypoints/llm/test_init.py \ - --ignore=entrypoints/llm/test_generate_multiple_loras.py \ --ignore=entrypoints/llm/test_prompt_validation.py "} fi -#Obsolete currently -##ignore certain Entrypoints/llm tests -#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then -# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} -#fi - # --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_oot_registration.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 36bcb015d308..39ea18017308 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -25,25 +25,28 @@ function cpu_tests() { # offline inference podman exec -it "$container_id" bash -c " - set -e - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + set -xve + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log # Run basic model test podman exec -it "$container_id" bash -c " - set -e + set -evx pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib pip install sentence-transformers datamodel_code_generator - pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model + + # Note: disable Bart until supports V1 + # pytest -v -s tests/models/language/generation/test_bart.py -m 
cpu_model pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] - pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" + # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. + # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log } # All of CPU tests are expected to be finished less than 40 mins. export container_id export -f cpu_tests -timeout 40m bash -c cpu_tests +timeout 120m bash -c cpu_tests diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 90cc9c844622..7927aef19e4e 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1} export CMAKE_BUILD_PARALLEL_LEVEL=32 # Setup cleanup -remove_docker_container() { - set -e; - docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; } trap remove_docker_container EXIT remove_docker_container @@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. 
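# [Editor's note] VLLM_CPU_KVCACHE_SPACE is the CPU KV-cache budget in GiB, raised below
# from 4 to 16 GiB per test container; --cpuset-cpus/--cpuset-mems pin both containers
# (default and AVX2-only) to the core range and NUMA node chosen for this CI slot.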
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e @@ -46,57 +46,74 @@ function cpu_tests() { set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + # Run kernel tests + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -x -v -s tests/kernels/test_onednn.py" + # Run basic model test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e # Note: disable until supports V1 - # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model - # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model - - # Note: disable Bart until supports V1 - pytest -v -s tests/models/language/generation -m cpu_model \ - --ignore=tests/models/language/generation/test_bart.py - VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \ - --ignore=tests/models/language/generation/test_bart.py - - pytest -v -s tests/models/language/pooling -m cpu_model - pytest -v -s tests/models/multimodal/generation \ - --ignore=tests/models/multimodal/generation/test_mllama.py \ + # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model + # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model + + pytest -x -v -s tests/models/language/generation -m cpu_model + VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model + + pytest -x -v -s tests/models/language/pooling -m cpu_model + pytest -x -v -s tests/models/multimodal/generation \ --ignore=tests/models/multimodal/generation/test_pixtral.py \ -m cpu_model" # Run compressed-tensor test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -s -v \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" + pytest -x -s -v \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" # Note: disable it until supports V1 # Run AWQ test # docker exec cpu-test-"$NUMA_NODE" bash -c " # set -e - # VLLM_USE_V1=0 pytest -s -v \ + # VLLM_USE_V1=0 pytest -x -s -v \ # tests/quantization/test_ipex_quant.py" - # online serving + # Run multi-lora tests + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -x -s -v \ + 
tests/lora/test_qwen2vl.py" + + # online serving: tp+pp docker exec cpu-test-"$NUMA_NODE" bash -c ' set -e VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & + server_pid=$! timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 - python3 benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --dataset-name random \ --model meta-llama/Llama-3.2-3B-Instruct \ --num-prompts 20 \ - --endpoint /v1/completions' + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' - # Run multi-lora tests - docker exec cpu-test-"$NUMA_NODE" bash -c " + # online serving: tp+dp + docker exec cpu-test-"$NUMA_NODE" bash -c ' set -e - pytest -s -v \ - tests/lora/test_qwen2vl.py" + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 & + server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' } # All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/.buildkite/scripts/hardware_ci/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh index 8c64e14606d3..f69e4b06680f 100644 --- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh +++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -16,8 +16,7 @@ DOCKER_BUILDKIT=1 docker build . \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ --build-arg RUN_WHEEL_CHECK=false \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" + --build-arg torch_cuda_arch_list="9.0+PTX" # Setup cleanup remove_docker_container() { docker rm -f gh200-test || true; } diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh deleted file mode 100644 index a397457c8326..000000000000 --- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -# This script build the Neuron docker image and run the API server inside the container. -# It serves a sanity check for compilation and basic model usage. -set -e -set -v - -image_name="neuron/vllm-ci" -container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" - -HF_CACHE="$(realpath ~)/huggingface" -mkdir -p "${HF_CACHE}" -HF_MOUNT="/root/.cache/huggingface" -HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN) - -NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" -mkdir -p "${NEURON_COMPILE_CACHE_URL}" -NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" - -# Try building the docker image -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws - -# prune old image and containers to save disk space, and only once a day -# by using a timestamp file in tmp. 
-if [ -f /tmp/neuron-docker-build-timestamp ]; then - last_build=$(cat /tmp/neuron-docker-build-timestamp) - current_time=$(date +%s) - if [ $((current_time - last_build)) -gt 86400 ]; then - # Remove dangling images (those that are not tagged and not used by any container) - docker image prune -f - # Remove unused volumes / force the system prune for old images as well. - docker volume prune -f && docker system prune -f - echo "$current_time" > /tmp/neuron-docker-build-timestamp - fi -else - date "+%s" > /tmp/neuron-docker-build-timestamp -fi - -docker build -t "${image_name}" -f docker/Dockerfile.neuron . - -# Setup cleanup -remove_docker_container() { - docker image rm -f "${image_name}" || true; -} -trap remove_docker_container EXIT - -# Run the image -docker run --rm -it --device=/dev/neuron0 --network bridge \ - -v "${HF_CACHE}:${HF_MOUNT}" \ - -e "HF_HOME=${HF_MOUNT}" \ - -e "HF_TOKEN=${HF_TOKEN}" \ - -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ - -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ - --name "${container_name}" \ - ${image_name} \ - /bin/bash -c " - set -e; # Exit on first error - python3 /workspace/vllm/examples/offline_inference/neuron.py; - python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys; - for f in /workspace/vllm/tests/neuron/2_core/*.py; do - echo \"Running test file: \$f\"; - python3 -m pytest \$f -v --capture=tee-sys; - done - " \ No newline at end of file diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh new file mode 100644 index 000000000000..29c8f5ed5a91 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +# This script build the Ascend NPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Base ubuntu image with basic ascend development libraries and python installed +VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git" +CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg" +TEST_RUN_CONFIG_FILE="vllm_test.cfg" +VLLM_ASCEND_TMP_DIR= +# Get the test run configuration file from the vllm-ascend repository +fetch_vllm_test_cfg() { + VLLM_ASCEND_TMP_DIR=$(mktemp -d) + # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval + cleanup() { + rm -rf "${VLLM_ASCEND_TMP_DIR}" + } + trap cleanup EXIT + + GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}" + if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then + echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2 + exit 1 + fi + + # If the file already exists locally, just overwrite it + cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}" + echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}" + + # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources + # when the trap is abnormal has been completed, so the temporary resources are manually deleted here. + rm -rf "${VLLM_ASCEND_TMP_DIR}" + trap - EXIT +} + +# Downloads test run configuration file from a remote URL. +# Loads the configuration into the current script environment. +get_config() { + if [ ! 
-f "${TEST_RUN_CONFIG_FILE}" ]; then + echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2 + exit 1 + fi + source "${TEST_RUN_CONFIG_FILE}" + echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}" + return 0 +} + +# get test running configuration. +fetch_vllm_test_cfg +get_config +# Check if the function call was successful. If not, exit the script. +if [ $? -ne 0 ]; then + exit 1 +fi + +image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}" +container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" + +# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards +agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}') +echo "agent_idx: ${agent_idx}" +builder_name="cachebuilder${agent_idx}" +builder_cache_dir="/mnt/docker-cache${agent_idx}" +mkdir -p ${builder_cache_dir} + +# Try building the docker image +cat <=6.0 modelscope + +WORKDIR /workspace/vllm + +# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid. +COPY requirements/common.txt /workspace/vllm/requirements/common.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -r requirements/common.txt + +COPY . . + +# Install vLLM +RUN --mount=type=cache,target=/root/.cache/pip \ + VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton + +# Install vllm-ascend +WORKDIR /workspace +ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git +ARG VLLM_ASCEND_TAG=main +RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \ + git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend + +# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid. +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -r /workspace/vllm-ascend/requirements.txt + +RUN --mount=type=cache,target=/root/.cache/pip \ + export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ + +ENV VLLM_WORKER_MULTIPROC_METHOD=spawn +ENV VLLM_USE_MODELSCOPE=True + +WORKDIR /workspace/vllm-ascend + +CMD ["/bin/bash"] + +EOF + +# Setup cleanup +remove_docker_container() { + docker rm -f "${container_name}" || true; + docker image rm -f "${image_name}" || true; + docker system prune -f || true; +} +trap remove_docker_container EXIT + +# Generate corresponding --device args based on BUILDKITE_AGENT_NAME +# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1. +# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards. 
+# returns --device /dev/davinci0 --device /dev/davinci1 +parse_and_gen_devices() { + local input="$1" + local index cards_num + if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then + index="${BASH_REMATCH[1]}" + cards_num="${BASH_REMATCH[2]}" + else + echo "parse error" >&2 + return 1 + fi + + local devices="" + local i=0 + while (( i < cards_num )); do + local dev_idx=$(((index - 1)*cards_num + i )) + devices="$devices --device /dev/davinci${dev_idx}" + ((i++)) + done + + # trim leading space + devices="${devices#"${devices%%[![:space:]]*}"}" + # Output devices: assigned to the caller variable + printf '%s' "$devices" +} + +devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1 + +# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware. +# This test checks whether the OOT platform interface is functioning properly in conjunction with +# the hardware plugin vllm-ascend. +model_cache_dir=/mnt/modelscope${agent_idx} +mkdir -p ${model_cache_dir} +docker run \ + ${devices} \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -v ${model_cache_dir}:/root/.cache/modelscope \ + --entrypoint="" \ + --name "${container_name}" \ + "${image_name}" \ + bash -c ' + set -e + pytest -v -s tests/e2e/vllm_interface/ +' diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh new file mode 100755 index 000000000000..cbb2527a4ff0 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +set -xu + + +remove_docker_container() { + docker rm -f tpu-test || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# Build the docker image. +docker build -f docker/Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." + fi +} +cleanup_docker + +# For HF_TOKEN. +source /etc/environment + +docker run --privileged --net host --shm-size=16G -it \ + -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ + vllm-tpu /bin/bash -c ' +set -e # Exit immediately if a command exits with a non-zero status. +set -u # Treat unset variables as an error. 
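# [Editor's note] Everything from here to the closing single quote of `bash -c '...'`
# runs inside the freshly built vllm-tpu container; set -e / set -u abort early on a
# failed install or an unset variable, while the test runner further below deliberately
# switches to `set +e` so that one failing test cannot stop the remaining ones.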
+ +echo "--- Starting script inside Docker container ---" + +# Create results directory +RESULTS_DIR=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $RESULTS_DIR" + +# Install dependencies +echo "--- Installing Python dependencies ---" +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ + && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ + && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 +echo "--- Python dependencies installed ---" + +export VLLM_XLA_CHECK_RECOMPILATION=1 +export VLLM_XLA_CACHE_PATH= + +echo "--- Hardware Information ---" +# tpu-info +echo "--- Starting Tests ---" +set +e +overall_script_exit_code=0 + +# --- Test Definitions --- +# If a test fails, this function will print logs and will not cause the main script to exit. +run_test() { + local test_num=$1 + local test_name=$2 + local test_command=$3 + local log_file="$RESULTS_DIR/test_${test_num}.log" + local actual_exit_code + + echo "--- TEST_$test_num: Running $test_name ---" + + # Execute the test command. + eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) + actual_exit_code=$? + + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log + + if [ "$actual_exit_code" -ne 0 ]; then + echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 + echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 + if [ -f "$log_file" ]; then + cat "$log_file" >&2 + else + echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 + fi + echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 + return "$actual_exit_code" # Return the failure code + else + echo "TEST_$test_num ($test_name) PASSED." + return 0 # Return success + fi +} + +# Helper function to call run_test and update the overall script exit code +run_and_track_test() { + local test_num_arg="$1" + local test_name_arg="$2" + local test_command_arg="$3" + + # Run the test + run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" + local test_specific_exit_code=$? + + # If the test failed, set the overall script exit code to 1 + if [ "$test_specific_exit_code" -ne 0 ]; then + # No need for extra echo here, run_test already logged the failure. 
+ overall_script_exit_code=1 + fi +} + +# --- Actual Test Execution --- +run_and_track_test 1 "test_struct_output_generate.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" +run_and_track_test 2 "test_moe_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" +run_and_track_test 3 "test_lora.py" \ + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" +run_and_track_test 4 "test_tpu_qkv_linear.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" +run_and_track_test 5 "test_spmd_model_weight_loading.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" +run_and_track_test 6 "test_kv_cache_update_kernel.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" +run_and_track_test 7 "test_tpu_int8.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py" + +# After all tests have been attempted, exit with the overall status. +if [ "$overall_script_exit_code" -ne 0 ]; then + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" +else + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" +fi +exit "$overall_script_exit_code" +' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. + +# Capture the exit code of the docker run command +DOCKER_RUN_EXIT_CODE=$? + +# The trap will run for cleanup. +# Exit the main script with the Docker run command's exit code. +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." + exit "$DOCKER_RUN_EXIT_CODE" +else + echo "Docker run command completed successfully." 
+ exit 0 +fi +# TODO: This test fails because it uses RANDOM_SEED sampling +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 60f0d174bd6c..f022fa3672ee 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -5,7 +5,6 @@ set -xu remove_docker_container() { docker rm -f tpu-test || true; - docker rm -f vllm-tpu || true; } trap remove_docker_container EXIT @@ -62,12 +61,12 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 + && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ + && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" -export VLLM_USE_V1=1 + export VLLM_XLA_CHECK_RECOMPILATION=1 export VLLM_XLA_CACHE_PATH= -echo "Using VLLM V1" echo "--- Hardware Information ---" # tpu-info @@ -149,18 +148,6 @@ run_and_track_test 9 "test_multimodal.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" run_and_track_test 10 "test_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" -run_and_track_test 11 "test_struct_output_generate.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" -run_and_track_test 12 "test_moe_pallas.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" -run_and_track_test 13 "test_lora.py" \ - "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" -run_and_track_test 14 "test_tpu_qkv_linear.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" -run_and_track_test 15 "test_spmd_model_weight_loading.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" -run_and_track_test 16 "test_kv_cache_update_kernel.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" # After all tests have been attempted, exit with the overall status. 
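# [Editor's note, illustrative only] This script and the new part-2 script aggregate
# results the same way: each test logs its own exit code and merely flips a single
# overall flag, so one failure never stops the remaining tests. In minimal form:
#   overall=0
#   run_case() { "$@" || overall=1; }
#   run_case python3 -m pytest -s -v test_a.py    # test names here are hypothetical
#   run_case python3 -m pytest -s -v test_b.py
#   exit "$overall"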
if [ "$overall_script_exit_code" -ne 0 ]; then diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 7589b48b584d..250a64fdd071 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -23,12 +23,26 @@ docker run \ --device /dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ --entrypoint="" \ + -e "HF_TOKEN=${HF_TOKEN}" \ + -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \ --name "${container_name}" \ "${image_name}" \ - sh -c ' - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + bash -c ' + set -e + echo $ZE_AFFINITY_MASK + pip install tblib==3.1.0 + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager cd tests pytest -v -s v1/core + pytest -v -s v1/engine + pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py + pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py + pytest -v -s v1/structured_output + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py + pytest -v -s v1/test_serial_utils.py ' diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh index 195a8063fd74..51536b36b808 100644 --- a/.buildkite/scripts/run-benchmarks.sh +++ b/.buildkite/scripts/run-benchmarks.sh @@ -11,20 +11,20 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.." (which wget && which curl) || (apt-get update && apt-get install -y wget curl) # run python-based benchmarks and upload the result to buildkite -python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt +vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt bench_latency_exit_code=$? -python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt +vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt bench_throughput_exit_code=$? 
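# [Editor's note] `vllm bench latency|throughput|serve` are the CLI front-ends that
# replace the standalone benchmarks/benchmark_*.py scripts throughout this diff; the
# flags used here (--input-len, --output-len, --output-json, --backend, --dataset-name,
# --num-prompts, --endpoint) are passed through unchanged, so only the entry point differs.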
# run server-based benchmarks and upload the result to buildkite -python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf & +vllm serve meta-llama/Llama-2-7b-chat-hf & server_pid=$! wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wait for server to start, timeout after 600 seconds timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 -python3 benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --dataset-name sharegpt \ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh new file mode 100755 index 000000000000..5b25c358fc4a --- /dev/null +++ b/.buildkite/scripts/run-prime-rl-test.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Setup script for Prime-RL integration tests +# This script prepares the environment for running Prime-RL tests with nightly vLLM + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" +PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git" +PRIME_RL_DIR="${REPO_ROOT}/prime-rl" + +echo "Setting up Prime-RL integration test environment..." + +# Clean up any existing Prime-RL directory +if [ -d "${PRIME_RL_DIR}" ]; then + echo "Removing existing Prime-RL directory..." + rm -rf "${PRIME_RL_DIR}" +fi + +# Install UV if not available +if ! command -v uv &> /dev/null; then + echo "Installing UV package manager..." + curl -LsSf https://astral.sh/uv/install.sh | sh + source $HOME/.local/bin/env +fi + +# Clone Prime-RL repository at specific branch for reproducible tests +PRIME_RL_BRANCH="integ-vllm-main" +echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..." +git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}" +cd "${PRIME_RL_DIR}" + +echo "Setting up UV project environment..." +export UV_PROJECT_ENVIRONMENT=/usr/local +ln -s /usr/bin/python3 /usr/local/bin/python + +# Remove vllm pin from pyproject.toml +echo "Removing vllm pin from pyproject.toml..." +sed -i '/vllm==/d' pyproject.toml + +# Sync Prime-RL dependencies +echo "Installing Prime-RL dependencies..." +uv sync --inexact && uv sync --inexact --all-extras + +# Verify installation +echo "Verifying installations..." +uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" +uv run python -c "import prime_rl; print('Prime-RL imported successfully')" + +echo "Prime-RL integration test environment setup complete!" + +echo "Running Prime-RL integration tests..." +export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY +uv run pytest -vs tests/integration/test_rl.py -m gpu + +echo "Prime-RL integration tests completed!" diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh index 209d9c4341cd..740d81fb39bb 100755 --- a/.buildkite/scripts/tpu/cleanup_docker.sh +++ b/.buildkite/scripts/tpu/cleanup_docker.sh @@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f # Remove unused volumes / force the system prune for old images as well. 
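# [Editor's note] The `--filter "until=24h"` below restricts `docker system prune` to
# objects created more than 24 hours ago (previously 72h), so images from the current
# run are not reclaimed even when the disk-usage threshold is exceeded.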
- docker volume prune -f && docker system prune --force --filter "until=72h" --all + docker volume prune -f && docker system prune --force --filter "until=24h" --all echo "Docker images and volumes cleanup completed." else echo "Disk usage is below $threshold%. No cleanup needed." diff --git a/.buildkite/scripts/tpu/config_v6e_1.env b/.buildkite/scripts/tpu/config_v6e_1.env index 03ec116f698d..c9e3c26571e7 100644 --- a/.buildkite/scripts/tpu/config_v6e_1.env +++ b/.buildkite/scripts/tpu/config_v6e_1.env @@ -1,6 +1,6 @@ # Environment config TEST_NAME=llama8b -CONTAINER_NAME=vllm-tpu +CONTAINER_NAME=tpu-test # vllm config MODEL=meta-llama/Llama-3.1-8B-Instruct diff --git a/.buildkite/scripts/tpu/docker_run_bm.sh b/.buildkite/scripts/tpu/docker_run_bm.sh index 8959877a3c05..08e36611809d 100755 --- a/.buildkite/scripts/tpu/docker_run_bm.sh +++ b/.buildkite/scripts/tpu/docker_run_bm.sh @@ -12,8 +12,6 @@ source /etc/environment source $ENV_FILE remove_docker_container() { - docker rm -f tpu-test || true; - docker rm -f vllm-tpu || true; docker rm -f $CONTAINER_NAME || true; } diff --git a/.buildkite/scripts/tpu/quantized_v6e_1.env b/.buildkite/scripts/tpu/quantized_v6e_1.env index bab34b3be3b9..ecb98d4516bd 100644 --- a/.buildkite/scripts/tpu/quantized_v6e_1.env +++ b/.buildkite/scripts/tpu/quantized_v6e_1.env @@ -1,6 +1,6 @@ # Environment config TEST_NAME=llama8bw8a8 -CONTAINER_NAME=vllm-tpu +CONTAINER_NAME=tpu-test # vllm config MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 @@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024 TENSOR_PARALLEL_SIZE=1 MAX_MODEL_LEN=2048 DOWNLOAD_DIR=/mnt/disks/persist -EXPECTED_THROUGHPUT=10.0 +EXPECTED_THROUGHPUT=8.7 INPUT_LEN=1800 OUTPUT_LEN=128 diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh index 877669cd956a..3364fce8e1fd 100755 --- a/.buildkite/scripts/tpu/run_bm.sh +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -42,9 +42,8 @@ echo "lanching vllm..." echo "logging to $VLLM_LOG" echo -VLLM_USE_V1=1 vllm serve $MODEL \ +vllm serve $MODEL \ --seed 42 \ - --disable-log-requests \ --max-num-seqs $MAX_NUM_SEQS \ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ @@ -77,7 +76,7 @@ done echo "run benchmark test..." 
echo "logging to $BM_LOG" echo -python benchmarks/benchmark_serving.py \ +vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name sonnet \ diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 037897e53dbe..43aa8c47be29 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -14,8 +14,19 @@ fi # Get the single wheel file wheel="${wheel_files[0]}" -# Rename 'linux' to 'manylinux1' in the wheel filename -new_wheel="${wheel/linux/manylinux1}" +# Detect architecture and rename 'linux' to appropriate manylinux version +arch=$(uname -m) +if [[ $arch == "x86_64" ]]; then + manylinux_version="manylinux1" +elif [[ $arch == "aarch64" ]]; then + manylinux_version="manylinux2014" +else + echo "Warning: Unknown architecture $arch, using manylinux1 as default" + manylinux_version="manylinux1" +fi + +# Rename 'linux' to the appropriate manylinux version in the wheel filename +new_wheel="${wheel/linux/$manylinux_version}" mv -- "$wheel" "$new_wheel" wheel="$new_wheel" @@ -47,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel" aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" -if [[ $normal_wheel == *"cu118"* ]]; then - # if $normal_wheel matches cu118, do not upload the index.html - echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu126"* ]]; then +if [[ $normal_wheel == *"cu126"* ]]; then # if $normal_wheel matches cu126, do not upload the index.html echo "Skipping index files for cu126 wheels" +elif [[ $normal_wheel == *"cu128"* ]]; then + # if $normal_wheel matches cu128, do not upload the index.html + echo "Skipping index files for cu128 wheels" else - # only upload index.html for cu128 wheels (default wheels) + # only upload index.html for cu129 wheels (default wheels) as it + # is available on both x86 and arm64 aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" fi @@ -63,14 +75,15 @@ fi aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" -if [[ $normal_wheel == *"cu118"* ]]; then - # if $normal_wheel matches cu118, do not upload the index.html - echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu126"* ]]; then +if [[ $normal_wheel == *"cu126"* ]]; then # if $normal_wheel matches cu126, do not upload the index.html echo "Skipping index files for cu126 wheels" +elif [[ $normal_wheel == *"cu128"* ]]; then + # if $normal_wheel matches cu128, do not upload the index.html + echo "Skipping index files for cu128 wheels" else - # only upload index.html for cu128 wheels (default wheels) + # only upload index.html for cu129 wheels (default wheels) as it + # is available on both x86 and arm64 aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" fi diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml new file mode 100644 index 000000000000..50b2b61124af --- /dev/null +++ b/.buildkite/test-amd.yaml @@ -0,0 +1,1267 @@ +# In this file, you can add more tests to run either by adding a new step or +# adding a new command to an existing step. See different options here for examples. + +# This script will be feed into Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file. 
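# [Editor's sketch, illustrative only, not a real step] A minimal entry using the fields
# documented below looks roughly like this (label, paths and command are hypothetical):
#   - label: Example Test  # 5min
#     timeout_in_minutes: 10
#     mirror_hardwares: [amdexperimental]
#     source_file_dependencies:
#       - vllm/
#       - tests/example
#     commands:
#       - pytest -v -s example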
+ +# Documentation +# label(str): the name of the test. emojis allowed. +# fast_check(bool): whether to run this on each commit on the fastcheck pipeline. +# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline. +# fast_check_only(bool): run this test on the fastcheck pipeline only +# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run. +# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests). +# command(str): the single command to run for tests. incompatible with commands. +# commands(list): the list of commands to run for the test. incompatible with command. +# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental] +# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200 +# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4. +# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host, +# in this case, commands must be specified. the first command runs on the first host, the second +# command runs on the second host. +# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout. +# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB +# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables. +# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests +# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run. + +# When adding a test +# - If the test belongs to an existing group, add it there +# - If the test is short, add to any existing step +# - If the test takes more than 10min, then it is okay to create a new step. +# Note that all steps execute in parallel. + +steps: +##### fast check tests ##### + +- label: Pytorch Nightly Dependency Override Check # 2min + # if this test fails, it means the nightly torch version is not compatible with some + # of the dependencies. 
Please check the error message and add the package to whitelist + # in /vllm/tools/generate_nightly_torch_test.py + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + +- label: Async Engine, Inputs, Utils, Worker Test # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/multimodal + - tests/standalone_tests/lazy_imports.py + - tests/transformers_utils + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + +- label: Python-only Installation Test # 10min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Basic Correctness Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + +- label: Entrypoints Unit Tests # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + fast_check: true + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration Test (LLM) # 30min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Integration Test (API Server) # 100min + timeout_in_minutes: 130 + mirror_hardwares: [amdexperimental] + agent_pool: 
mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + +- label: Entrypoints Integration Test (Pooling) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + +- label: Distributed Tests (4 GPUs) # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/test_basic_correctness + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + # test with torchrun tp=2 and external_dp=2 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - pushd ../examples/offline_inference + - 
VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + +- label: EPLB Algorithm Test # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution Test # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + +- label: Metrics, Tracing Test # 12min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +##### fast check tests ##### +##### 1 GPU test ##### + +- label: Regression Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + grade: Blocking + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Engine Test # 25min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + #grade: Blocking + source_file_dependencies: + - vllm/ + - tests/engine + - tests/tokenization + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + # OOM in the CI unless we run this separately + - pytest -v -s tokenization + +- label: V1 Test e2e + engine # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + +- label: V1 Test entrypoints # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: V1 Test others # 42min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + # Integration test for streaming correctness (requires special branch). 
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Test others (CPU) # 5 mins + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + +- label: Examples Test # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + +- label: Platform Tests (CUDA) # 4min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + +- label: Samplers Test # 56min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + +- label: LoRA Test %N # 20min each + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ 
+ --ignore=lora/test_llm_with_multi_loras.py + parallelism: 4 + +- label: PyTorch Compilation Unit Tests # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_pass_manager.py + - pytest -v -s compile/test_fusion.py + - pytest -v -s compile/test_fusion_attn.py + - pytest -v -s compile/test_functionalization.py + - pytest -v -s compile/test_silu_mul_quant_fusion.py + - pytest -v -s compile/test_sequence_parallelism.py + - pytest -v -s compile/test_async_tp.py + - pytest -v -s compile/test_fusion_all_reduce.py + - pytest -v -s compile/test_decorator.py + - pytest -v -s compile/test_noop_elimination.py + - pytest -v -s compile/test_aot_compile.py + +- label: PyTorch Fullgraph Smoke Test # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/piecewise/ + +- label: PyTorch Fullgraph Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_full_graph.py + +- label: Kernels Core Operation Test # 48min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - tests/kernels/core + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + +- label: Kernels Attention Test %N # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N # 64min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test %N # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Mamba Test # 31min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Model Executor Test # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: 
+ - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + +- label: Benchmarks # 11min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ + +- label: Quantization Test # 70min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ + +- label: LM Eval Small Models # 53min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: OpenAI API correctness # 22min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ + +- label: OpenAI-Compatible Tool Use # 23 min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + fast_check: false + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s -m 'not cpu_test' tool_use + +- label: OpenAI-Compatible Tool Use (CPU) # 5 mins + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/tool_use + no_gpu: true + commands: + - pytest -v -s -m 'cpu_test' tool_use + +##### models test ##### + +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 
+ # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. 
Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. + - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + +- label: Multi-Modal Processor Test # 44min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + +- label: Multi-Modal Models Test (Standard) # 60min + timeout_in_minutes: 80 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Models Test (Extended) 1 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models Test (Extended) 2 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models Test (Extended) 3 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +- label: Quantized Models Test # 45 min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models Test + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. 
pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* + +- label: Transformers Nightly Models Test + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + working_dir: "/vllm-workspace/" + optional: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper + +- label: Blackwell Test # 38 min + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/fusion.py + - vllm/compilation/fusion_attn.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + # Fusion + - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + +- label: Blackwell GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: Blackwell Quantized MoE Test + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - 
vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + +##### 1 GPU test ##### +##### multi gpus test ##### + +- label: Distributed Comm Ops Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- label: 2 Node Tests (4 GPUs in total) # 16min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + +- label: Distributed Tests (2 GPUs) # 68min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - 
vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + commands: + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Model Tests (2 GPUs) # 37min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' + +- label: Plugin Tests (2 GPUs) # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins + +- label: Pipeline + Context Parallelism Test # 45min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: 
mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + +- label: LoRA TP Test (Distributed) # 17 min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + + +- label: Weight Loading Multiple GPU Test # 33min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU Test - Large Models # optional + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + + +##### multi gpus test ##### +##### A100 test ##### + +- label: Distributed Tests (A100) # optional + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # optional + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +##### H200 test ##### +- label: Distrubted Tests (H200) # optional + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + +##### B200 test ##### +- label: Distributed Tests (B200) # optional + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + 
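+# Illustrative template (not a real step): a minimal sketch of how the keys
+# documented at the top of this file combine. The label, source paths, and
+# pytest target below are placeholders chosen for illustration only.
+#
+# - label: Hypothetical Feature Test %N  # 10min each
+#   timeout_in_minutes: 20
+#   mirror_hardwares: [amdexperimental]
+#   agent_pool: mi325_1
+#   # grade: Blocking
+#   working_dir: "/vllm-workspace/tests"
+#   source_file_dependencies:
+#   - vllm/hypothetical_feature/
+#   - tests/hypothetical_feature/
+#   commands:
+#   - pytest -v -s hypothetical_feature --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+#   parallelism: 2
+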
+##### RL Integration Tests ##### +- label: Prime-RL Integration Test # 15min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c476f71c6637..e037f88f0d31 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -6,24 +6,28 @@ # to generate the final pipeline yaml file. # Documentation -# label(str): the name of the test. emoji allowed. -# fast_check(bool): whether to run this on each commit on fastcheck pipeline. -# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline. -# fast_check_only(bool): run this test on fastcheck pipeline only -# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. +# label(str): the name of the test. emojis allowed. +# fast_check(bool): whether to run this on each commit on the fastcheck pipeline. +# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline. +# fast_check_only(bool): run this test on the fastcheck pipeline only +# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run. +# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests). # command(str): the single command to run for tests. incompatible with commands. -# commands(list): the list of commands to run for test. incompatbile with command. -# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd] -# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100 -# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4. -# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host, -# in this case, commands must be specified. the first command runs on first host, the second +# commands(list): the list of commands to run for the test. incompatible with command. +# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental] +# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200 +# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4. +# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host, +# in this case, commands must be specified. the first command runs on the first host, the second # command runs on the second host. -# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests -# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run. +# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout. +# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB +# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables. 
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests +# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run. # When adding a test -# - If the test belong to an existing group, add it there +# - If the test belongs to an existing group, add it there # - If the test is short, add to any existing step # - If the test takes more than 10min, then it is okay to create a new step. # Note that all steps execute in parallel. @@ -31,16 +35,6 @@ steps: ##### fast check tests ##### -- label: Documentation Build # 2min - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/test_docs" - fast_check: true - no_gpu: True - commands: - - pip install -r ../requirements/docs.txt - # TODO: add `--strict` once warnings in docstrings are fixed - - mkdocs build - - label: Pytorch Nightly Dependency Override Check # 2min # if this test fails, it means the nightly torch version is not compatible with some # of the dependencies. Please check the error message and add the package to whitelist @@ -51,29 +45,36 @@ steps: commands: - bash standalone_tests/pytorch_nightly_dependency.sh -- label: Async Engine, Inputs, Utils, Worker Test # 24min +- label: Async Engine, Inputs, Utils, Worker Test # 36min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - - tests/mq_llm_engine - - tests/async_engine - - tests/test_inputs - tests/multimodal - - tests/test_utils - - tests/worker + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/multimodal - tests/standalone_tests/lazy_imports.py + - tests/transformers_utils + no_gpu: true commands: - python3 standalone_tests/lazy_imports.py - - pytest -v -s mq_llm_engine # MQLLMEngine - - pytest -v -s async_engine # AsyncLLMEngine - - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - - pytest -v -s multimodal - - pytest -v -s test_utils.py # Utils - - pytest -v -s worker # Worker + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils -- label: Python-only Installation Test +- label: Python-only Installation Test # 10min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - tests/standalone_tests/python_only_compile.sh @@ -81,43 +82,35 @@ steps: commands: - bash standalone_tests/python_only_compile.sh -- label: Basic Correctness Test # 30min - mirror_hardwares: [amdexperimental, amdproduction] +- label: Basic Correctness Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] fast_check: true torch_nightly: true source_file_dependencies: - vllm/ - tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_preemption - tests/basic_correctness/test_cumem.py commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py - - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - -- label: Chunked Prefill Test - mirror_hardwares: [amdexperimental, amdproduction] - 
source_file_dependencies: - - vllm/ - - tests/basic_correctness/test_chunked_prefill - commands: - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py -- label: Core Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] +- label: Entrypoints Unit Tests # 5min + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" fast_check: true source_file_dependencies: - - vllm/core - - vllm/distributed - - tests/core + - vllm/entrypoints + - tests/entrypoints/ commands: - - pytest -v -s core + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling -- label: Entrypoints Test (LLM) # 40min +- label: Entrypoints Integration Test (LLM) # 30min + timeout_in_minutes: 40 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" fast_check: true @@ -128,14 +121,12 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Test (API Server) # 40min +- label: Entrypoints Integration Test (API Server) # 100min + timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" fast_check: true @@ -146,16 +137,30 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py -- label: Distributed Tests (4 GPUs) # 10min +- label: Entrypoints Integration Test (Pooling) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - 
tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + +- label: Distributed Tests (4 GPUs) # 35min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: - vllm/distributed/ - - vllm/core/ - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events @@ -163,24 +168,34 @@ steps: - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py - - tests/v1/test_async_llm_dp.py - - tests/v1/test_external_lb_dp.py + - tests/v1/distributed - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py commands: - # test with tp=2 and external_dp=2 - - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and external_dp=2 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with tp=2 and pp=2 + # test with torchrun tp=2 and pp=2 - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - pushd ../examples/offline_inference @@ -188,7 +203,8 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd -- label: EPLB Algorithm Test +- label: EPLB Algorithm Test # 5min + timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/distributed/eplb @@ -197,6 +213,7 @@ steps: - pytest -v -s distributed/test_eplb_algo.py - label: EPLB Execution Test # 5min + timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -205,26 +222,26 @@ steps: commands: - pytest -v -s distributed/test_eplb_execute.py -- label: Metrics, Tracing Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] +- label: Metrics, Tracing Test # 12min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] num_gpus: 2 source_file_dependencies: - vllm/ - - tests/metrics - - 
tests/tracing + - tests/v1/tracing commands: - - pytest -v -s metrics - "pip install \ 'opentelemetry-sdk>=1.26.0' \ 'opentelemetry-api>=1.26.0' \ 'opentelemetry-exporter-otlp>=1.26.0' \ 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s tracing + - pytest -v -s v1/tracing ##### fast check tests ##### ##### 1 GPU test ##### -- label: Regression Test # 5min +- label: Regression Test # 7min + timeout_in_minutes: 20 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -234,7 +251,8 @@ steps: - pytest -v -s test_regression.py working_dir: "/vllm-workspace/tests" # optional -- label: Engine Test # 10min +- label: Engine Test # 25min + timeout_in_minutes: 40 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -249,34 +267,66 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization -- label: V1 Test +- label: V1 Test e2e + engine # 30min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/v1 commands: - # split the test to avoid interference - - pytest -v -s v1/core + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e - pytest -v -s v1/engine + +- label: V1 Test entrypoints # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: - pytest -v -s v1/entrypoints + +- label: V1 Test others # 42min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - - pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode - - pytest -v -s v1/kv_connector/unit - - pytest -v -s v1/metrics - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s v1/test_utils.py + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_metrics_reader.py - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - pytest -v -s v1/e2e + - pytest -v -s v1/test_request.py # Integration test for streaming correctness (requires special branch). 
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine -- label: Examples Test # 25min +- label: V1 Test others (CPU) # 5 mins + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + +- label: Examples Test # 30min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/examples" source_file_dependencies: @@ -293,24 +343,16 @@ steps: - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py - - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - -- label: Prefix Caching Test # 9min - mirror_hardwares: [amdexperimental, amdproduction] - source_file_dependencies: - - vllm/ - - tests/prefix_caching - commands: - - pytest -v -s prefix_caching - + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 -- label: Platform Tests (CUDA) +- label: Platform Tests (CUDA) # 4min + timeout_in_minutes: 15 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ @@ -318,7 +360,8 @@ steps: commands: - pytest -v -s cuda/test_cuda_context.py -- label: Samplers Test # 36min +- label: Samplers Test # 56min + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor/layers @@ -329,15 +372,28 @@ steps: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers -- label: LoRA Test %N # 15min each - mirror_hardwares: [amdexperimental, amdproduction] +- label: LoRA Test %N # 20min each + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py + commands: + - pytest -v 
-s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + --ignore=lora/test_llm_with_multi_loras.py \ + --ignore=lora/test_olmoe_tp.py \ + --ignore=lora/test_deepseekv2_tp.py \ + --ignore=lora/test_gptoss.py \ + --ignore=lora/test_qwen3moe_tp.py + parallelism: 4 -- label: PyTorch Compilation Unit Tests +- label: PyTorch Compilation Unit Tests # 15min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -347,42 +403,48 @@ steps: - pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_fusion.py - pytest -v -s compile/test_fusion_attn.py + - pytest -v -s compile/test_functionalization.py - pytest -v -s compile/test_silu_mul_quant_fusion.py - - pytest -v -s compile/test_sequence_parallelism.py - - pytest -v -s compile/test_async_tp.py + - pytest -v -s compile/test_fusion_all_reduce.py + - pytest -v -s compile/test_decorator.py + - pytest -v -s compile/test_noop_elimination.py + - pytest -v -s compile/test_aot_compile.py -- label: PyTorch Fullgraph Smoke Test # 9min - mirror_hardwares: [amdexperimental, amdproduction] +- label: PyTorch Fullgraph Smoke Test # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ - tests/compile commands: - pytest -v -s compile/test_basic_correctness.py - # these tests need to be separated, cannot combine - - pytest -v -s compile/piecewise/test_simple.py - - pytest -v -s compile/piecewise/test_toy_llama.py - - pytest -v -s compile/piecewise/test_full_cudagraph.py + - pytest -v -s compile/piecewise/ -- label: PyTorch Fullgraph Test # 18min - mirror_hardwares: [amdexperimental, amdproduction] +- label: PyTorch Fullgraph Test # 22min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ - tests/compile commands: - pytest -v -s compile/test_full_graph.py + - pytest -v -s compile/test_fusions_e2e.py -- label: Kernels Core Operation Test - mirror_hardwares: [amdexperimental, amdproduction] +- label: Kernels Core Operation Test # 48min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ - tests/kernels/core + - tests/kernels/test_top_k_per_row.py commands: - - pytest -v -s kernels/core + - pytest -v -s kernels/core kernels/test_top_k_per_row.py -- label: Kernels Attention Test %N - mirror_hardwares: [amdexperimental, amdproduction] +- label: Kernels Attention Test %N # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/attention/ - vllm/attention @@ -392,74 +454,73 @@ steps: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: Kernels Quantization Test %N - mirror_hardwares: [amdexperimental, amdproduction] +- label: Kernels Quantization Test %N # 64min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/quantization/ - vllm/model_executor/layers/quantization - tests/kernels/quantization commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 -- label: Kernels MoE Test +- label: Kernels MoE Test %N # 40min + timeout_in_minutes: 60 
mirror_hardwares: [amdexperimental] source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ - csrc/moe/ - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ commands: - - pytest -v -s kernels/moe + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 -- label: Kernels Mamba Test +- label: Kernels Mamba Test # 31min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/mamba/ - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops commands: - pytest -v -s kernels/mamba -- label: Tensorizer Test # 11min +- label: Model Executor Test # 23min + timeout_in_minutes: 35 mirror_hardwares: [amdexperimental] - soft_fail: true - source_file_dependencies: - - vllm/model_executor/model_loader - - tests/tensorizer_loader - - tests/entrypoints/openai/test_tensorizer_entrypoint.py - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s tensorizer_loader - - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - -- label: Model Executor Test - mirror_hardwares: [amdexperimental, amdproduction] - soft_fail: true source_file_dependencies: - vllm/model_executor - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py commands: - apt-get update && apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py -- label: Benchmarks # 9min - mirror_hardwares: [amdexperimental, amdproduction] +- label: Benchmarks # 11min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: - benchmarks/ commands: - bash scripts/run-benchmarks.sh -- label: Benchmarks CLI Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] +- label: Benchmarks CLI Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/benchmarks/ commands: - pytest -v -s benchmarks/ -- label: Quantization Test +- label: Quantization Test # 70min + timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ @@ -467,21 +528,26 @@ steps: - tests/quantization commands: # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release - - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - label: LM Eval Small Models # 53min + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt 
--tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 -- label: OpenAI API correctness +- label: OpenAI API correctness # 22min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/ @@ -490,64 +556,128 @@ steps: commands: # LMEval+Transcription WER check - pytest -s entrypoints/openai/correctness/ -- label: Encoder Decoder tests # 5min +- label: OpenAI-Compatible Tool Use # 23 min + timeout_in_minutes: 35 mirror_hardwares: [amdexperimental] + fast_check: false source_file_dependencies: - - vllm/ - - tests/encoder_decoder + - vllm/ + - tests/tool_use commands: - - pytest -v -s encoder_decoder + - pytest -v -s -m 'not cpu_test' tool_use -- label: OpenAI-Compatible Tool Use # 20 min - mirror_hardwares: [amdexperimental] - fast_check: false +- label: OpenAI-Compatible Tool Use (CPU) # 5 mins + timeout_in_minutes: 10 source_file_dependencies: - vllm/ - tests/tool_use - - tests/mistral_tool_use + no_gpu: true commands: - - pytest -v -s tool_use - - pytest -v -s mistral_tool_use + - pytest -v -s -m 'cpu_test' tool_use ##### models test ##### -- label: Basic Models Test # 24min +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ - - tests/models + - tests/models/test_initialization.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py commands: - - pytest -v -s models/test_transformers.py - - pytest -v -s models/test_registry.py - - pytest -v -s models/test_utils.py - - pytest -v -s models/test_vision.py - - pytest -v -s models/test_initialization.py + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 -- label: Language Models Test (Standard) +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + timeout_in_minutes: 10 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ - tests/models/language commands: - # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. 
- - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. Only run when model + # source is modified, or when specified test files are modified - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m core_model + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 -- label: Language Models Test (Hybrid) # 35 min +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ - tests/models/language/generation commands: - # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - - pytest -v -s models/language/generation -m hybrid_model + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 -- label: Language Models Test (Extended Generation) # 1hr20min +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: @@ -558,7 +688,18 @@ steps: - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + - label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: @@ -567,7 +708,27 @@ steps: commands: - pytest -v -s models/language/pooling -m 'not core_model' -- label: Multi-Modal Models Test (Standard) +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + +- label: Multi-Modal Processor Test # 44min + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + +- label: Multi-Modal 
Models Test (Standard) # 60min + timeout_in_minutes: 80 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: @@ -576,9 +737,18 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal/processing - - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model - - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - label: Multi-Modal Models Test (Extended) 1 mirror_hardwares: [amdexperimental] @@ -588,7 +758,7 @@ steps: - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - label: Multi-Modal Models Test (Extended) 2 mirror_hardwares: [amdexperimental] @@ -610,8 +780,9 @@ steps: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' -- label: Quantized Models Test - mirror_hardwares: [amdexperimental, amdproduction] +- label: Quantized Models Test # 45 min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/model_executor/layers/quantization - tests/models/quantization @@ -620,7 +791,7 @@ steps: # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental] optional: true commands: - echo 'Testing custom models...' 
@@ -634,17 +805,118 @@ steps: commands: - pip install --upgrade git+https://github.com/huggingface/transformers - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/audio_language.py --model-type whisper - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper + +- label: Blackwell Test # 21 min + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + +- label: Blackwell Fusion Tests # 30 min + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/test_fusions_e2e.py + +- label: Blackwell GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - 
vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: Blackwell Quantized MoE Test + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 ##### 1 GPU test ##### ##### multi gpus test ##### - label: Distributed Comm Ops Test # 7min - mirror_hardwares: [amdexperimental, amdproduction] + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: @@ -653,8 +925,11 @@ steps: commands: - pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py - label: 2 Node Tests (4 GPUs in total) # 16min + timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -678,47 +953,61 @@ steps: - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code -- label: Distributed Tests (2 GPUs) # 40min +- label: Distributed Tests (2 GPUs) # 68min + timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: + - vllm/compilation/ - vllm/distributed/ - vllm/engine/ - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - - vllm/compilation - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/model_runner.py - - entrypoints/llm/test_collective_rpc.py - - tests/v1/test_async_llm_dp.py - - tests/v1/test_external_lb_dp.py - - tests/v1/entrypoints/openai/test_multi_api_servers.py - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py commands: - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - 
DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Model Tests (2 GPUs) # 37min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py # Avoid importing model tests that cause CUDA reinitialization error - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' - # test sequence parallel - - pytest -v -s distributed/test_sequence_parallel.py - # this test fails consistently. - # TODO: investigate and fix - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' - label: Plugin Tests (2 GPUs) # 40min + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -731,6 +1020,16 @@ steps: - pytest -v -s plugins_tests/test_platform_plugins.py - pip uninstall vllm_add_dummy_platform -y # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test # other tests continue here: - pytest -v -s plugins_tests/test_scheduler_plugins.py - pip install -e ./plugins/vllm_add_dummy_model @@ -739,29 +1038,9 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins -- label: Multi-step Tests (4 GPUs) # 36min - mirror_hardwares: [amdexperimental, amdproduction] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/model_executor/layers/sampler.py - - vllm/sequence.py - - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/multi_step_worker.py - - vllm/worker/model_runner_base.py - - vllm/worker/model_runner.py - - 
vllm/worker/multi_step_model_runner.py - - vllm/engine - - tests/multi_step - commands: - # this test is quite flaky - # TODO: investigate and fix. - # - pytest -v -s multi_step/test_correctness_async_llm.py - - pytest -v -s multi_step/test_correctness_llm.py - -- label: Pipeline Parallelism Test # 45min - mirror_hardwares: [amdexperimental, amdproduction] +- label: Pipeline + Context Parallelism Test # 45min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -774,8 +1053,9 @@ steps: - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py -- label: LoRA TP Test (Distributed) - mirror_hardwares: [amdexperimental, amdproduction] +- label: LoRA TP Test (Distributed) # 17 min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] num_gpus: 4 source_file_dependencies: - vllm/lora @@ -788,12 +1068,16 @@ steps: # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py - label: Weight Loading Multiple GPU Test # 33min + timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" num_gpus: 2 + optional: true source_file_dependencies: - vllm/ - tests/weight_loading @@ -811,6 +1095,17 @@ steps: - tests/weight_loading commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + +- label: NixlConnector PD accuracy tests (Distributed) # 30min + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh ##### multi gpus test ##### @@ -841,3 +1136,39 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +##### H200 test ##### +- label: Distributed Tests (H200) # optional + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/compile/test_async_tp.py + - pytest -v -s tests/compile/test_sequence_parallelism.py + - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + +##### B200 test ##### +- label: Distributed Tests (B200) # optional + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + +##### RL Integration Tests ##### +- label: Prime-RL Integration Test # 15min + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh diff 
--git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000000..b7a9fdb4e05a --- /dev/null +++ b/.coveragerc @@ -0,0 +1,47 @@ +[run] +# Track the installed vllm package (this is what actually gets imported during tests) +# Use wildcard pattern to match the installed location +source = + vllm + */dist-packages/vllm + */site-packages/vllm +omit = + */tests/* + */test_* + */__pycache__/* + */build/* + */dist/* + */vllm.egg-info/* + */third_party/* + */examples/* + */benchmarks/* + */docs/* + +[paths] +# Map all possible vllm locations to a canonical "vllm" path +# This ensures coverage.combine properly merges data from different test runs +source = + vllm + /vllm-workspace/src/vllm + /vllm-workspace/vllm + */site-packages/vllm + */dist-packages/vllm + +[report] +exclude_lines = + pragma: no cover + def __repr__ + if self.debug: + if settings.DEBUG + raise AssertionError + raise NotImplementedError + if 0: + if __name__ == .__main__.: + class .*\bProtocol\): + @(abc\.)?abstractmethod + +[html] +directory = htmlcov + +[xml] +output = coverage.xml diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000000..5a601d00cef8 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,4 @@ +# Migrate from `yapf` & `isort` to `ruff` +d6953beb91da4e9c99be4c0a1304a2d24189535c +# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y` +8fcaaf6a165e661f63fc51be906bc05b0767332f diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml new file mode 100644 index 000000000000..443dfa45af22 --- /dev/null +++ b/.github/.bc-linter.yml @@ -0,0 +1,24 @@ +# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md +version: 1 +paths: +# We temporarily disable globally, and will only enable with `annotations.include` +# include: +# - "vllm/v1/attetion/*.py" +# - "vllm/v1/core/*.py" +exclude: + - "**/*.py" + +scan: + functions: true # check free functions and methods + classes: true # check classes/dataclasses + public_only: true # ignore names starting with "_" at any level + +annotations: + include: # decorators that force‑include a symbol + - name: "bc_linter_include" # matched by simple name or dotted suffix + propagate_to_members: false # for classes, include methods/inner classes + exclude: # decorators that force‑exclude a symbol + - name: "bc_linter_skip" # matched by simple name or dotted suffix + propagate_to_members: true # for classes, exclude methods/inner classes + +excluded_violations: [] # e.g. 
["ParameterRenamed", "FieldTypeChanged"] diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 8c68bc8f02b6..14301fe8d847 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,53 +2,127 @@ # for more info about CODEOWNERS file # This lists cover the "core" components of vLLM that require careful review +/vllm/attention @LucasWilkinson /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth -/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm -/vllm/multimodal @DarkLight1337 @ywang96 +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn +/vllm/model_executor/layers/fused_moe @mgoin +/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 +/vllm/model_executor/layers/mamba @tdoublep +/vllm/model_executor/model_loader @22quinn +/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee -/vllm/reasoning @aarnphm -/vllm/entrypoints @aarnphm +/vllm/reasoning @aarnphm @chaunceyjiang +/vllm/entrypoints @aarnphm @chaunceyjiang /vllm/compilation @zou3519 @youkaichao @ProExpertProg +/vllm/distributed/kv_transfer @NickLucche @ApostaC CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people -/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor +/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg +/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 # vLLM V1 -/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat -/vllm/v1/structured_output @mgoin @russellb @aarnphm +/vllm/v1/attention @LucasWilkinson +/vllm/v1/attention/backends/flashinfer.py @mgoin +/vllm/v1/attention/backends/triton_attn.py @tdoublep +/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC +/vllm/v1/sample @22quinn @houseroad @njhill +/vllm/v1/spec_decode @benchislett @luccafong +/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett +/vllm/v1/kv_cache_interface.py @heheda12345 +/vllm/v1/offloading @ApostaC # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo -/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo -/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm -/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm -/tests/kernels @tlrmchlsmth 
@WoosukKwon -/tests/model_executor/test_guided_processors.py @mgoin @russellb +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche +/tests/evals @mgoin +/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 -/tests/multi_step @alexm-redhat @comaniac -/tests/multimodal @DarkLight1337 @ywang96 -/tests/prefix_caching @comaniac @KuntaiDu -/tests/quantization @mgoin @robertgshaw2-redhat +/tests/multimodal @DarkLight1337 @ywang96 @NickLucche +/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm -/tests/weight_loading @mgoin @youkaichao +/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC +/tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee +/tests/models/language/generation/test_hybrid.py @tdoublep +/tests/v1/kv_connector/nixl_integration @NickLucche +/tests/v1/kv_connector @ApostaC +/tests/v1/offloading @ApostaC + +# Transformers backend +/vllm/model_executor/models/transformers @hmellor +/tests/models/test_transformers.py @hmellor # Docs -/docs @hmellor +/docs/mkdocs @hmellor +/docs/**/*.yml @hmellor +/requirements/docs.txt @hmellor +.readthedocs.yaml @hmellor mkdocs.yaml @hmellor + +# Linting +.markdownlint.yaml @hmellor +.pre-commit-config.yaml @hmellor +/tools/pre_commit @hmellor + +# CPU +/vllm/v1/worker/cpu* @bigPYJ1151 +/csrc/cpu @bigPYJ1151 +/vllm/platforms/cpu.py @bigPYJ1151 +/cmake/cpu_extension.cmake @bigPYJ1151 +/docker/Dockerfile.cpu @bigPYJ1151 + +# Intel GPU +/vllm/v1/worker/xpu* @jikunshang +/vllm/platforms/xpu.py @jikunshang +/docker/Dockerfile.xpu @jikunshang + +# Qwen-specific files +/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow +/vllm/model_executor/models/qwen* @sighingnow + +# MTP-specific files +/vllm/model_executor/models/deepseek_mtp.py @luccafong + +# Mistral-specific files +/vllm/model_executor/models/mistral*.py @patrickvonplaten +/vllm/model_executor/models/mixtral*.py @patrickvonplaten +/vllm/model_executor/models/voxtral*.py @patrickvonplaten +/vllm/model_executor/models/pixtral*.py @patrickvonplaten +/vllm/transformers_utils/configs/mistral.py @patrickvonplaten +/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten + +# Kernels +/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep +/vllm/attention/ops/triton_unified_attention.py @tdoublep + +# ROCm related: specify owner with write access to notify AMD folks for careful code review +/docker/Dockerfile.rocm* @gshtras +/vllm/v1/attention/backends/rocm*.py @gshtras +/vllm/v1/attention/backends/mla/rocm*.py @gshtras +/vllm/attention/ops/rocm*.py @gshtras +/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras + +# TPU +/vllm/v1/worker/tpu* @NickLucche +/vllm/platforms/tpu.py @NickLucche +/vllm/v1/sample/tpu @NickLucche +/vllm/tests/v1/tpu @NickLucche + +# KVConnector installation files +/requirements/kv_connectors.txt @NickLucche + +# Pooling models +/examples/*/pooling/ @noooop +/tests/models/*/pooling* @noooop +/tests/entrypoints/pooling @noooop +/vllm/config/pooler.py @noooop +/vllm/pooling_params.py @noooop +/vllm/model_executor/layers/pooler.py @noooop diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml index 7ee57c42895c..c0e009855964 100644 --- a/.github/ISSUE_TEMPLATE/750-RFC.yml +++ 
b/.github/ISSUE_TEMPLATE/750-RFC.yml @@ -43,10 +43,6 @@ body: Any other things you would like to mention. validations: required: false -- type: markdown - attributes: - value: > - Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit). - type: checkboxes id: askllm attributes: diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 017ec7ca82da..8043df65d558 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,10 +1,5 @@ -## Essential Elements of an Effective PR Description Checklist -- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". -- [ ] The test plan, such as providing test command. -- [ ] The test results, such as pasting the results comparison before and after, or e2e results -- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. - -PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. + +PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED. ## Purpose @@ -12,7 +7,15 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B ## Test Result -## (Optional) Documentation Update +--- +
+ Essential Elements of an Effective PR Description Checklist + +- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". +- [ ] The test plan, such as providing test command. +- [ ] The test results, such as pasting the results comparison before and after, or e2e results +- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. +- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0). +
- **BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.github/mergify.yml b/.github/mergify.yml index 5c878ac02069..de1a8314a4ec 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -2,6 +2,7 @@ pull_request_rules: - name: label-documentation description: Automatically apply documentation label conditions: + - label != stale - or: - files~=^[^/]+\.md$ - files~=^docs/ @@ -10,10 +11,13 @@ pull_request_rules: label: add: - documentation + comment: + message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/" - name: label-ci-build description: Automatically apply ci/build label conditions: + - label != stale - or: - files~=^\.github/ - files~=\.buildkite/ @@ -30,6 +34,7 @@ pull_request_rules: - name: label-deepseek description: Automatically apply deepseek label conditions: + - label != stale - or: - files~=^examples/.*deepseek.*\.py - files~=^tests/.*deepseek.*\.py @@ -46,6 +51,7 @@ pull_request_rules: - name: label-frontend description: Automatically apply frontend label conditions: + - label != stale - files~=^vllm/entrypoints/ actions: label: @@ -55,6 +61,7 @@ pull_request_rules: - name: label-llama description: Automatically apply llama label conditions: + - label != stale - or: - files~=^examples/.*llama.*\.py - files~=^tests/.*llama.*\.py @@ -70,6 +77,7 @@ pull_request_rules: - name: label-multi-modality description: Automatically apply multi-modality label conditions: + - label != stale - or: - files~=^vllm/multimodal/ - files~=^tests/multimodal/ @@ -83,6 +91,7 @@ pull_request_rules: - name: label-new-model description: Automatically apply new-model label conditions: + - label != stale - and: - files~=^vllm/model_executor/models/ - files=vllm/model_executor/models/registry.py @@ -94,6 +103,7 @@ pull_request_rules: - name: label-performance description: Automatically apply performance label conditions: + - label != stale - or: - files~=^benchmarks/ - files~=^vllm/benchmarks/ @@ -107,6 +117,7 @@ pull_request_rules: - name: label-qwen description: Automatically apply qwen label conditions: + - label != stale - or: - files~=^examples/.*qwen.*\.py - files~=^tests/.*qwen.*\.py @@ -118,9 +129,32 @@ pull_request_rules: add: - qwen +- name: label-gpt-oss + description: Automatically apply gpt-oss label + conditions: + - label != stale + - or: + - files~=^examples/.*gpt[-_]?oss.*\.py + - files~=^tests/.*gpt[-_]?oss.*\.py + - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py + - files~=^tests/entrypoints/test_context.py + - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py + - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py + - files~=^vllm/entrypoints/harmony_utils.py + - files~=^vllm/entrypoints/tool_server.py + - files~=^vllm/entrypoints/tool.py + - files~=^vllm/entrypoints/context.py + - title~=(?i)gpt[-_]?oss + - title~=(?i)harmony + actions: + label: + add: + - gpt-oss + - name: label-rocm description: Automatically apply rocm label conditions: + - label != stale - or: - files~=^csrc/rocm/ - files~=^docker/Dockerfile.rocm @@ -141,6 +175,7 @@ pull_request_rules: - name: label-structured-output description: Automatically apply structured-output label conditions: + - label != stale - or: - files~=^benchmarks/structured_schemas/ - files=benchmarks/benchmark_serving_structured_output.py @@ -149,11 +184,8 @@ pull_request_rules: - files=examples/offline_inference/structured_outputs.py - 
files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - - files~=^vllm/model_executor/guided_decoding/ - - files=tests/model_executor/test_guided_processors.py - - files=tests/entrypoints/llm/test_guided_generate.py - files~=^tests/v1/structured_output/ - - files=tests/v1/entrypoints/llm/test_guided_generate.py + - files=tests/v1/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ actions: label: @@ -163,6 +195,7 @@ pull_request_rules: - name: label-speculative-decoding description: Automatically apply speculative-decoding label conditions: + - label != stale - or: - files~=^vllm/v1/spec_decode/ - files~=^tests/v1/spec_decode/ @@ -178,6 +211,7 @@ pull_request_rules: - name: label-v1 description: Automatically apply v1 label conditions: + - label != stale - or: - files~=^vllm/v1/ - files~=^tests/v1/ @@ -190,6 +224,7 @@ pull_request_rules: description: Automatically apply tpu label # Keep this list in sync with `label-tpu-remove` conditions conditions: + - label != stale - or: - files~=tpu.py - files~=_tpu @@ -205,6 +240,7 @@ pull_request_rules: description: Automatically remove tpu label # Keep this list in sync with `label-tpu` conditions conditions: + - label != stale - and: - -files~=tpu.py - -files~=_tpu @@ -219,9 +255,9 @@ pull_request_rules: - name: label-tool-calling description: Automatically add tool-calling label conditions: + - label != stale - or: - files~=^tests/tool_use/ - - files~=^tests/mistral_tool_use/ - files~=^tests/entrypoints/openai/tool_parsers/ - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py - files~=^vllm/entrypoints/openai/tool_parsers/ @@ -238,8 +274,9 @@ pull_request_rules: - name: ping author on conflicts and add 'needs-rebase' label conditions: - - conflict - - -closed + - label != stale + - conflict + - -closed actions: label: add: @@ -253,20 +290,55 @@ pull_request_rules: - name: assign reviewer for tensorizer changes conditions: + - label != stale + - or: - files~=^vllm/model_executor/model_loader/tensorizer.py - files~=^vllm/model_executor/model_loader/tensorizer_loader.py - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py - - files~=^tests/tensorizer_loader/ + - files~=^tests/model_executor/model_loader/tensorizer_loader/ actions: assign: users: - "sangstar" +- name: assign reviewer for modelopt changes + conditions: + - label != stale + - or: + - files~=^vllm/model_executor/layers/quantization/modelopt\.py$ + - files~=^vllm/model_executor/layers/quantization/__init__\.py$ + - files~=^tests/models/quantization/test_modelopt\.py$ + - files~=^tests/quantization/test_modelopt\.py$ + - files~=^tests/models/quantization/test_nvfp4\.py$ + - files~=^docs/features/quantization/modelopt\.md$ + actions: + assign: + users: + - "Edwardf0t1" + - name: remove 'needs-rebase' label when conflict is resolved conditions: - - -conflict - - -closed + - -conflict + - -closed actions: label: remove: - needs-rebase + +- name: label-kv-connector + description: Automatically apply kv-connector label + conditions: + - label != stale + - or: + - files~=^examples/online_serving/disaggregated[^/]*/.* + - files~=^examples/offline_inference/disaggregated[^/]*/.* + - files~=^examples/others/lmcache/ + - files~=^tests/v1/kv_connector/ + - files~=^vllm/distributed/kv_transfer/ + - title~=(?i)\bP/?D\b + - title~=(?i)NIXL + - title~=(?i)LMCache + actions: + label: + add: + - kv-connector \ No newline at end of file 
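The mergify rules added above label PRs both from file globs and from `title~=` regular expressions (e.g. `(?i)gpt[-_]?oss` for the gpt-oss label, and `(?i)\bP/?D\b`, `(?i)NIXL`, `(?i)LMCache` for kv-connector). A quick way to sanity-check such a pattern before adding a rule is to try it locally; the sketch below does that in Python. It is an illustrative aside, not part of the diff: the sample PR titles, the `TITLE_RULES` mapping, and the `labels_for_title` helper are invented for this example, and only the regex patterns themselves are copied from the rules above.

```python
import re

# Illustrative sketch only -- not part of the diff. The patterns are copied
# from the label-gpt-oss and label-kv-connector rules above; the PR titles,
# TITLE_RULES, and labels_for_title() are hypothetical examples.
TITLE_RULES = {
    "gpt-oss": [r"(?i)gpt[-_]?oss", r"(?i)harmony"],
    "kv-connector": [r"(?i)\bP/?D\b", r"(?i)NIXL", r"(?i)LMCache"],
}


def labels_for_title(title: str) -> list[str]:
    """Return the labels whose title patterns match a given PR title."""
    return [
        label
        for label, patterns in TITLE_RULES.items()
        if any(re.search(pattern, title) for pattern in patterns)
    ]


if __name__ == "__main__":
    for title in [
        "[gpt_oss] fix harmony tool parsing",     # hypothetical title
        "Support P/D disaggregation over NIXL",   # hypothetical title
        "Bump torch version in requirements",     # hypothetical title
    ]:
        print(f"{title!r} -> {labels_for_title(title)}")
```

This mirrors, in simplified form, how the `title~=` conditions behave; the real rules also require `label != stale` and may combine title patterns with file-path conditions.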
diff --git a/.github/scale-config.yml b/.github/scale-config.yml new file mode 100644 index 000000000000..c41a3ee3eb19 --- /dev/null +++ b/.github/scale-config.yml @@ -0,0 +1,21 @@ +# scale-config.yml: +# Powers what instance types are available for GHA auto-scaled +# runners. Runners listed here will be available as self hosted +# runners, configuration is directly pulled from the main branch. +# runner_types: +# runner_label: +# instance_type: m4.large +# os: linux +# # min_available defaults to the global cfg in the ALI Terraform +# min_available: undefined +# # when max_available value is not defined, no max runners is enforced +# max_available: undefined +# disk_size: 50 +# is_ephemeral: true + +runner_types: + linux.2xlarge: + disk_size: 150 + instance_type: c5.2xlarge + is_ephemeral: true + os: linux diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh index 8d65936fba1d..25af344aab2b 100755 --- a/.github/scripts/cleanup_pr_body.sh +++ b/.github/scripts/cleanup_pr_body.sh @@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}" cp "${OLD}" "${NEW}" -# Remove "FIX #xxxx (*link existing issues this PR will resolve*)" -sed -i '/FIX #xxxx.*$/d' "${NEW}" +# Remove markdown comments (like the <!-- ... --> at the start) +sed -i '/<!--.*-->$/d' "${NEW}" -# Remove "FILL IN THE PR DESCRIPTION HERE" -sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}" +# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED." +sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}" # Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}" diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml index 315042fbf5cf..d8bbedef3174 100644 --- a/.github/workflows/add_label_automerge.yml +++ b/.github/workflows/add_label_automerge.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Add label - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | github.rest.issues.addLabels({ diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml new file mode 100644 index 000000000000..823695a92132 --- /dev/null +++ b/.github/workflows/bc-lint.yml @@ -0,0 +1,29 @@ +name: BC Lint + +on: + pull_request: + types: + - opened + - synchronize + - reopened + - labeled + - unlabeled + +jobs: + bc_lint: + if: github.repository_owner == 'vllm-project' + runs-on: ubuntu-latest + steps: + - name: Run BC Lint Action + uses: pytorch/test-infra/.github/actions/bc-lint@main + with: + repo: ${{ github.event.pull_request.head.repo.full_name }} + base_sha: ${{ github.event.pull_request.base.sha }} + head_sha: ${{ github.event.pull_request.head.sha }} + suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }} + docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter' + config_dir: .github + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index d5c6b8d43a6e..c3e132a536a4 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -16,7 +16,7 @@
jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: '3.12' diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml new file mode 100644 index 000000000000..7d565ef9f2e4 --- /dev/null +++ b/.github/workflows/issue_autolabel.yml @@ -0,0 +1,361 @@ +name: Label issues based on keywords +on: + issues: + types: [opened, edited, reopened] +permissions: + issues: write # needed so the workflow can add labels + contents: read +concurrency: + group: issue-labeler-${{ github.event.issue.number }} + cancel-in-progress: true +jobs: + add-labels: + runs-on: ubuntu-latest + steps: + - name: Label issues based on keywords + id: label-step + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 + with: + script: | + // Configuration: Add new labels and keywords here + const labelConfig = { + rocm: { + // Keyword search - matches whole words only (with word boundaries) + keywords: [ + { + term: "composable kernel", + searchIn: "both" + }, + { + term: "rccl", + searchIn: "body" // only search in body + }, + { + term: "migraphx", + searchIn: "title" // only search in title + }, + { + term: "hipgraph", + searchIn: "both" + }, + { + term: "ROCm System Management Interface", + searchIn: "body" + }, + ], + // Substring search - matches anywhere in text (partial matches) + substrings: [ + { + term: "VLLM_ROCM_", + searchIn: "both" + }, + { + term: "aiter", + searchIn: "title" + }, + { + term: "rocm", + searchIn: "title" + }, + { + term: "amd", + searchIn: "title" + }, + { + term: "hip-", + searchIn: "both" + }, + { + term: "gfx", + searchIn: "both" + }, + { + term: "cdna", + searchIn: "both" + }, + { + term: "rdna", + searchIn: "both" + }, + { + term: "torch_hip", + searchIn: "body" // only in body + }, + { + term: "_hip", + searchIn: "both" + }, + { + term: "hip_", + searchIn: "both" + }, + // ROCm tools and libraries + { + term: "hipify", + searchIn: "both" + }, + ], + // Regex patterns - for complex pattern matching + regexPatterns: [ + { + pattern: "\\bmi\\d{3}[a-z]*\\b", + description: "AMD GPU names (mi + 3 digits + optional letters)", + flags: "gi", + searchIn: "both" // "title", "body", or "both" + } + ], + }, + // Add more label configurations here as needed + // example: { + // keywords: [...], + // substrings: [...], + // regexPatterns: [...] 
+ // }, + }; + // Helper function to create regex based on search type + function createSearchRegex(term, type) { + // Escape special regex characters in the term + const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + switch (type) { + case 'keyword': + // Word boundary search - matches whole words only + return new RegExp(`\\b${escapedTerm}\\b`, "gi"); + case 'substring': + // Substring search - matches anywhere in the text + return new RegExp(escapedTerm, "gi"); + default: + throw new Error(`Unknown search type: ${type}`); + } + } + // Helper function to find matching terms in text with line information + function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') { + const matches = []; + const lines = text.split('\n'); + for (const termConfig of searchTerms) { + let regex; + let term, searchIn, pattern, description, flags; + // Handle different input formats (string or object) + if (typeof termConfig === 'string') { + term = termConfig; + searchIn = 'both'; // default + } else { + term = termConfig.term; + searchIn = termConfig.searchIn || 'both'; + pattern = termConfig.pattern; + description = termConfig.description; + flags = termConfig.flags; + } + // Skip if this term shouldn't be searched in the current location + if (searchIn !== 'both' && searchIn !== searchLocation) { + continue; + } + // Create appropriate regex + if (searchType === 'regex') { + regex = new RegExp(pattern, flags || "gi"); + } else { + regex = createSearchRegex(term, searchType); + } + const termMatches = []; + // Check each line for matches + lines.forEach((line, lineIndex) => { + const lineMatches = line.match(regex); + if (lineMatches) { + lineMatches.forEach(match => { + termMatches.push({ + match: match, + lineNumber: lineIndex + 1, + lineContent: line.trim(), + searchType: searchType, + searchLocation: searchLocation, + originalTerm: term || pattern, + description: description, + // Show context around the match in the line + context: line.length > 100 ? + line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), + line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
+ : line.trim() + }); + }); + } + }); + if (termMatches.length > 0) { + matches.push({ + term: term || (description || pattern), + searchType: searchType, + searchLocation: searchLocation, + searchIn: searchIn, + pattern: pattern, + matches: termMatches, + count: termMatches.length + }); + } + } + return matches; + } + // Helper function to check if label should be added + async function processLabel(labelName, config) { + const body = context.payload.issue.body || ""; + const title = context.payload.issue.title || ""; + core.notice(`Processing label: ${labelName}`); + core.notice(`Issue Title: "${title}"`); + core.notice(`Issue Body length: ${body.length} characters`); + let shouldAddLabel = false; + let allMatches = []; + let reason = ''; + const keywords = config.keywords || []; + const substrings = config.substrings || []; + const regexPatterns = config.regexPatterns || []; + core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`); + // Search in title + if (title.trim()) { + core.notice(`Searching in title: "${title}"`); + const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title'); + const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title'); + const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title'); + allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches); + } + // Search in body + if (body.trim()) { + core.notice(`Searching in body (${body.length} characters)`); + const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body'); + const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body'); + const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body'); + allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches); + } + if (allMatches.length > 0) { + core.notice(`Found ${allMatches.length} matching term(s):`); + for (const termMatch of allMatches) { + const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body'; + const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn; + if (termMatch.searchType === 'regex') { + core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); + } else { + core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); + } + // Show details for each match + termMatch.matches.forEach((match, index) => { + core.notice(` ${index + 1}. 
Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`); + if (match.description) { + core.notice(` Description: ${match.description}`); + } + core.notice(` Context: ${match.context}`); + if (match.lineContent !== match.context) { + core.notice(` Full line: ${match.lineContent}`); + } + }); + } + shouldAddLabel = true; + const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0); + const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0); + const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0); + const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0); + const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0); + const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0); + reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`; + } + core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`); + core.notice(`Reason: ${reason || 'No matching terms found'}`); + if (shouldAddLabel) { + const existingLabels = context.payload.issue.labels.map(l => l.name); + if (!existingLabels.includes(labelName)) { + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: [labelName], + }); + core.notice(`Label "${labelName}" added. ${reason}`); + return true; + } + core.notice(`Label "${labelName}" already present.`); + return false; + } + core.notice(`No matching terms found for label "${labelName}".`); + return false; + } + // Process all configured labels + const labelsAddedResults = await Promise.all( + Object.entries(labelConfig).map(([labelName, config]) => + processLabel(labelName, config).then(added => ({ labelName, added })) + ) + ); + + const numLabelsAdded = labelsAddedResults.filter(r => r.added).length; + core.notice(`Processing complete. 
${numLabelsAdded} label(s) added.`); + + // Return which labels were added for the next step + const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName); + core.setOutput('labels_added', JSON.stringify(addedLabels)); + return addedLabels; + + - name: CC users for labeled issues + if: steps.label-step.outputs.labels_added != '[]' + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 + with: + script: | + // Configuration: Map labels to GitHub users to CC + // You can add multiple users per label, and multiple label configurations + const ccConfig = { + rocm: { + users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3'] + message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions + }, + // Add more label -> user mappings here + // Example: + // cuda: { + // users: ['user1', 'user2'], + // message: 'CC {users} for CUDA-related issue' + // }, + // performance: { + // users: ['perfexpert'], + // message: 'CC {users} for performance issue' + // }, + }; + + const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}'); + core.notice(`Labels added: ${labelsAdded.join(', ')}`); + + // Get existing comments to check for already mentioned users + const comments = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + + const issueBody = context.payload.issue.body || ''; + const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n'); + + // Process each label that was added + for (const label of labelsAdded) { + if (ccConfig[label]) { + const config = ccConfig[label]; + const usersToMention = []; + + // Check which users haven't been mentioned yet + for (const user of config.users) { + const mentionPattern = new RegExp(`@${user}\\b`, 'i'); + if (!mentionPattern.test(allExistingText)) { + usersToMention.push(user); + } else { + core.notice(`@${user} already mentioned for label "${label}", skipping`); + } + } + + // Post comment if there are users to mention + if (usersToMention.length > 0) { + const mentions = usersToMention.map(u => `@${u}`).join(' '); + const message = config.message.replace('{users}', mentions); + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: message + }); + + core.notice(`CC comment added for label "${label}": ${mentions}`); + } else { + core.notice(`All users for label "${label}" already mentioned, skipping comment`); + } + } + } \ No newline at end of file diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml deleted file mode 100644 index 74a7a3a3530f..000000000000 --- a/.github/workflows/lint-and-deploy.yaml +++ /dev/null @@ -1,85 +0,0 @@ -name: Lint and Deploy Charts - -on: pull_request - -permissions: - contents: read - -jobs: - lint-and-deploy: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: Set up Helm - uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0 - with: - version: v3.14.4 - - #Python is required because ct lint runs Yamale and yamllint which require Python. 
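As a rough illustration of the matching rules configured in the issue auto-labeler above (the workflow itself runs JavaScript inside `actions/github-script`), the sketch below re-expresses the three search modes in Python; the sample issue title and helper names are made up for illustration only.

```python
import re

# Minimal sketch, not part of the workflow: the three matching modes used by
# the auto-labeler above, re-expressed with Python's `re` for illustration.

def keyword_regex(term: str) -> re.Pattern:
    # Whole-word match, mirroring createSearchRegex(term, 'keyword')
    return re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)

def substring_regex(term: str) -> re.Pattern:
    # Match anywhere in the text, mirroring createSearchRegex(term, 'substring')
    return re.compile(re.escape(term), re.IGNORECASE)

# Regex pattern from the config: AMD GPU names such as "MI300X" or "mi250"
gpu_name = re.compile(r"\bmi\d{3}[a-z]*\b", re.IGNORECASE)

title = "[Bug] hipGraph capture fails on MI300X with VLLM_ROCM_USE_AITER=1"
assert keyword_regex("hipgraph").search(title)      # keyword hit (whole word)
assert substring_regex("VLLM_ROCM_").search(title)  # substring hit
assert gpu_name.search(title)                       # regex hit -> "rocm" label
```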
- - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 - with: - python-version: '3.13' - - - name: Set up chart-testing - uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0 - with: - version: v3.10.1 - - - name: Run chart-testing (lint) - run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm - - - name: Setup minio - run: | - docker network create vllm-net - docker run -d -p 9000:9000 --name minio --net vllm-net \ - -e "MINIO_ACCESS_KEY=minioadmin" \ - -e "MINIO_SECRET_KEY=minioadmin" \ - -v /tmp/data:/data \ - -v /tmp/config:/root/.minio \ - minio/minio server /data - export AWS_ACCESS_KEY_ID=minioadmin - export AWS_SECRET_ACCESS_KEY=minioadmin - export AWS_EC2_METADATA_DISABLED=true - mkdir opt-125m - cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd .. - aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket - aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive - - - name: Create kind cluster - uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0 - - - name: Build the Docker image vllm cpu - run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env . - - - name: Configuration of docker images, network and namespace for the kind cluster - run: | - docker pull amazon/aws-cli:2.6.4 - kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing - kind load docker-image vllm-cpu-env:latest --name chart-testing - docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")" - kubectl create ns ns-vllm - - - name: Run chart-testing (install) - run: | - export AWS_ACCESS_KEY_ID=minioadmin - export AWS_SECRET_ACCESS_KEY=minioadmin - sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - - - name: curl test - run: | - kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 & - sleep 10 - CODE="$(curl -v -f --location http://localhost:8001/v1/completions \ - --header "Content-Type: application/json" \ - --data '{ - "model": "opt-125m", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }'):$CODE" - echo "$CODE" diff --git a/.github/workflows/matchers/markdownlint.json b/.github/workflows/matchers/markdownlint.json new file mode 100644 index 
000000000000..fe094a9badb2 --- /dev/null +++ b/.github/workflows/matchers/markdownlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "markdownlint", + "pattern": [ + { + "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] +} \ No newline at end of file diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 8e694d18134e..e21d13b8161f 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -5,6 +5,10 @@ on: push: branches: [main] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + permissions: contents: read @@ -13,10 +17,11 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json" - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 with: diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml deleted file mode 100644 index bfd02879965e..000000000000 --- a/.github/workflows/publish.yml +++ /dev/null @@ -1,111 +0,0 @@ -# This workflow will upload a Python Package to Release asset -# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions - -name: Create Release - -on: - push: - tags: - - v* - -# Needed to create release and upload assets -permissions: - contents: write - -jobs: - release: - # Retrieve tag and create release - name: Create Release - runs-on: ubuntu-latest - outputs: - upload_url: ${{ steps.create_release.outputs.upload_url }} - steps: - - name: Checkout - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Extract branch info - shell: bash - run: | - echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV" - - - name: Create Release - id: create_release - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - env: - RELEASE_TAG: ${{ env.release_tag }} - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" - script: | - const script = require('.github/workflows/scripts/create_release.js') - await script(github, context, core) - - # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. - # wheel: - # name: Build Wheel - # runs-on: ${{ matrix.os }} - # needs: release - - # strategy: - # fail-fast: false - # matrix: - # os: ['ubuntu-20.04'] - # python-version: ['3.9', '3.10', '3.11', '3.12'] - # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt. 
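For reference, a hedged sketch of what the new markdownlint problem matcher above captures: the regex mirrors the pattern in `.github/workflows/matchers/markdownlint.json` (hyphen escaped for Python's character-class syntax), and the sample lint output line is hypothetical.

```python
import re

# Illustrative only: the problem-matcher pattern from
# .github/workflows/matchers/markdownlint.json, applied to a made-up
# markdownlint output line to show which capture group feeds which field.
MATCHER = re.compile(r"^([^:]*):(\d+):?(\d+)?\s([\w\-/]*)\s(.*)$")

sample = "docs/serving.md:12:1 MD007/ul-indent Unordered list indentation"
file, line, column, code, message = MATCHER.match(sample).groups()
print(file, line, column, code, message)
# -> docs/serving.md 12 1 MD007/ul-indent Unordered list indentation
```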
- # cuda-version: ['11.8', '12.1'] - - # steps: - # - name: Checkout - # uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - # - name: Setup ccache - # uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14 - # with: - # create-symlink: true - # key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }} - - # - name: Set up Linux Env - # if: ${{ runner.os == 'Linux' }} - # run: | - # bash -x .github/workflows/scripts/env.sh - - # - name: Set up Python - # uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - # with: - # python-version: ${{ matrix.python-version }} - - # - name: Install CUDA ${{ matrix.cuda-version }} - # run: | - # bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }} - - # - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }} - # run: | - # bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }} - - # - name: Build wheel - # shell: bash - # env: - # CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size - # run: | - # bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} - # wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename) - # asset_name=${wheel_name//"linux"/"manylinux1"} - # echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV" - # echo "asset_name=${asset_name}" >> "$GITHUB_ENV" - - # - name: Upload Release Asset - # uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2 - # env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # with: - # upload_url: ${{ needs.release.outputs.upload_url }} - # asset_path: ./dist/${{ env.wheel_name }} - # asset_name: ${{ env.asset_name }} - # asset_content_type: application/* - - # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested - # - name: Publish package - # uses: pypa/gh-action-pypi-publish@release/v1.8 - # with: - # repository-url: https://test.pypi.org/legacy/ - # password: ${{ secrets.PYPI_API_TOKEN }} - # skip-existing: true diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index 16ae1aadb96b..8884359fa0ce 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -9,19 +9,46 @@ jobs: runs-on: ubuntu-latest steps: - name: Remind to run full CI on PR - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | - github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' + - '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' + - 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. 
If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' + - 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' + - 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' + - '🚀' - }) + try { + // Get the PR author + const prAuthor = context.payload.pull_request.user.login; + + // Check if this is the author's first PR in this repository + // Use GitHub's search API to find all PRs by this author + const { data: searchResults } = await github.rest.search.issuesAndPullRequests({ + q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`, + per_page: 100 + }); + + const authorPRCount = searchResults.total_count; + + console.log(`Found ${authorPRCount} PRs by ${prAuthor}`); + + // Only post comment if this is the first PR (only one PR by this author) + if (authorPRCount === 1) { + console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`); + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' + + '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' + + 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' + + 'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' + + 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' + + 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' + + 'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' + + '🚀' + }); + } else { + console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`); + } + } catch (error) { + console.error('Error checking PR history or posting comment:', error); + // Don't fail the workflow, just log the error + } env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index 0f010832b465..c69ebbb42da5 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -15,7 +15,6 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda export MAX_JOBS=1 # Make sure release wheels are built for the following architectures export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" -export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" bash tools/check_repo.sh diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 656f3d3fa7bc..dca3089f496c 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,7 +13,7 @@ jobs: actions: write runs-on: ubuntu-latest steps: - - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 + - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months diff --git a/.gitignore b/.gitignore index 96b97a552c54..b1df673e83ca 100644 --- a/.gitignore +++ 
b/.gitignore @@ -4,6 +4,9 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* +# triton jit +.triton + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -147,7 +150,8 @@ venv.bak/ # mkdocs documentation /site docs/argparse -docs/examples +docs/examples/* +!docs/examples/README.md # mypy .mypy_cache/ @@ -173,6 +177,14 @@ cython_debug/ # VSCode .vscode/ +# Claude +CLAUDE.md +.claude/ + +# Codex +AGENTS.md +.codex/ + # DS Store .DS_Store @@ -203,3 +215,6 @@ shellcheck*/ # Ignore moe/marlin_moe gen code csrc/moe/marlin_moe_wna16/kernel_* + +# Ignore ep_kernels_workspace folder +ep_kernels_workspace/ diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 000000000000..cd9df57cd980 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,12 @@ +MD007: + indent: 4 +MD013: false +MD024: + siblings_only: true +MD033: false +MD045: false +MD046: false +MD051: false +MD052: false +MD053: false +MD059: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5197820fb402..121bdb750de5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,50 +6,39 @@ default_stages: - manual # Run in CI exclude: 'vllm/third_party/.*' repos: -- repo: https://github.com/google/yapf - rev: v0.43.0 - hooks: - - id: yapf - args: [--in-place, --verbose] - # Keep the same list from yapfignore here to avoid yapf failing without any inputs - exclude: '(.buildkite|benchmarks|build|examples)/.*' - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.7 + rev: v0.14.0 hooks: - - id: ruff + - id: ruff-check args: [--output-format, github, --fix] - id: ruff-format - files: ^(.buildkite|benchmarks|examples)/.* - repo: https://github.com/crate-ci/typos - rev: v1.34.0 + rev: v1.38.1 hooks: - id: typos -- repo: https://github.com/PyCQA/isort - rev: 6.0.1 - hooks: - - id: isort + args: [--force-exclude] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v20.1.3 + rev: v21.1.2 hooks: - id: clang-format exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] -- repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.29 +- repo: https://github.com/igorshubovych/markdownlint-cli + rev: v0.45.0 hooks: - - id: pymarkdown + - id: markdownlint exclude: '.*\.inc\.md' - args: [fix] + stages: [manual] # Only run in CI - repo: https://github.com/rhysd/actionlint rev: v1.7.7 hooks: - id: actionlint - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.6.17 + rev: 0.9.1 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28] files: ^requirements/test\.(in|txt)$ - repo: local hooks: @@ -60,38 +49,32 @@ repos: files: ^requirements/test\.(in|txt)$ - id: mypy-local name: Run mypy for local Python installation - entry: tools/mypy.sh 0 "local" - language: python - types: [python] - additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic] + entry: python tools/pre_commit/mypy.py 0 "local" stages: [pre-commit] # Don't run in CI - - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - name: Run mypy for Python 3.9 - entry: tools/mypy.sh 1 "3.9" 
- language: python - types: [python] - additional_dependencies: *mypy_deps - stages: [manual] # Only run in CI + <<: &mypy_common + language: python + types_or: [python, pyi] + require_serial: true + additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.10 - entry: tools/mypy.sh 1 "3.10" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.10" + <<: *mypy_common stages: [manual] # Only run in CI - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.11 - entry: tools/mypy.sh 1 "3.11" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.11" + <<: *mypy_common stages: [manual] # Only run in CI - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.12 - entry: tools/mypy.sh 1 "3.12" - language: python - types: [python] - additional_dependencies: *mypy_deps + entry: python tools/pre_commit/mypy.py 1 "3.12" + <<: *mypy_common + stages: [manual] # Only run in CI + - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.13 + entry: python tools/pre_commit/mypy.py 1 "3.13" + <<: *mypy_common stages: [manual] # Only run in CI - id: shellcheck name: Lint shell scripts @@ -155,18 +138,15 @@ repos: additional_dependencies: [regex] - id: check-pickle-imports name: Prevent new pickle/cloudpickle imports - entry: python tools/check_pickle_imports.py + entry: python tools/pre_commit/check_pickle_imports.py language: python types: [python] - pass_filenames: false - additional_dependencies: [pathspec, regex] + additional_dependencies: [regex] - id: validate-config name: Validate configuration has default values and that each field has a docstring entry: python tools/validate_config.py language: python - types: [python] - pass_filenames: true - files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py + additional_dependencies: [regex] # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 98c3be25f7e7..d83d6df35ed9 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,9 +7,13 @@ build: os: ubuntu-22.04 tools: python: "3.12" + jobs: + post_checkout: + - git fetch --unshallow || true mkdocs: configuration: mkdocs.yaml + fail_on_warning: true # Optionally declare the Python requirements required to build your docs python: diff --git a/.yapfignore b/.yapfignore index 2d6dcf8380ca..38158259032a 100644 --- a/.yapfignore +++ b/.yapfignore @@ -1 +1,2 @@ collect_env.py +vllm/model_executor/layers/fla/ops/*.py diff --git a/CMakeLists.txt b/CMakeLists.txt index edc64f87730a..46630af89f09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26) # cmake --install . --component _C project(vllm_extensions LANGUAGES CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + + # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... 
(used by setup.py) set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") @@ -30,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. # -set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") +set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13") # Supported AMD GPU architectures. -set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151") # # Supported/expected torch versions for CUDA/ROCm. @@ -45,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1") -set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0") # # Try to find python package with an executable that exactly matches @@ -82,6 +86,9 @@ find_package(Torch REQUIRED) # Supported NVIDIA architectures. # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND + CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) + set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0") +elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") else() @@ -171,6 +178,25 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") endif() +# +# Set compression mode for CUDA >=13.x. +# +if(VLLM_GPU_LANG STREQUAL "CUDA" AND + DEFINED CMAKE_CUDA_COMPILER_VERSION AND + CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) + list(APPEND VLLM_GPU_FLAGS "--compress-mode=size") +endif() + +# +# Set CUDA include flags for CXX compiler. +# +if(VLLM_GPU_LANG STREQUAL "CUDA") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include") + if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl") + endif() +endif() + # # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process. # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache. @@ -243,13 +269,12 @@ set(VLLM_EXT_SRC "csrc/sampler.cu" "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" - "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" - "csrc/quantization/fp8/common.cu" + "csrc/quantization/w8a8/int8/scaled_quant.cu" + "csrc/quantization/w8a8/fp8/common.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/activation_kernels.cu" "csrc/cuda_utils_kernels.cu" - "csrc/prepare_inputs/advance_step.cu" "csrc/custom_all_reduce.cu" "csrc/torch_bindings.cpp") @@ -257,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. 
- set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -287,16 +312,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_MakeAvailable(cutlass) list(APPEND VLLM_EXT_SRC - "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" "csrc/permute_cols.cu" - "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" + "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" - "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/cutlass_extensions/common.cpp" - "csrc/attention/mla/cutlass_mla_entry.cu") + "csrc/quantization/w8a8/fp8/per_token_group_quant.cu" + "csrc/quantization/w8a8/int8/per_token_group_quant.cu") set_gencode_flags_for_srcs( SRCS "${VLLM_EXT_SRC}" @@ -350,20 +374,27 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set_gencode_flags_for_srcs( SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" CUDA_ARCHS "${MARLIN_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) set(MARLIN_SRCS - "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" - "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") set_gencode_flags_for_srcs( SRCS "${MARLIN_SRCS}" CUDA_ARCHS "${MARLIN_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu" + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") + message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" @@ -393,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") + "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -421,11 +452,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. 
CUTLASS 3.x) require # CUDA 12.8 or later - cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu" + "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu" ) set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -450,12 +486,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x) # require CUDA 12.8 or later - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu" + "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu" ) set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -486,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu") + set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") @@ -528,11 +568,40 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() + # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require + # CUDA 12.8 or later + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}") + endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) + set(SRCS + "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" + "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${FP4_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1") + message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") + else() + message(STATUS "Not building NVFP4 as no compatible archs were found.") + # clear FP4_ARCHS + set(FP4_ARCHS) + endif() + # FP4 Archs and flags - cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}") 
+ endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" "csrc/quantization/fp4/nvfp4_experts_quant.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu" "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu") @@ -540,7 +609,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SRCS "${SRCS}" CUDA_ARCHS "${FP4_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM100=1") list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") else() @@ -550,10 +619,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # CUTLASS MLA Archs and flags - cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS) set(SRCS - "csrc/attention/mla/cutlass_mla_kernels.cu" "csrc/attention/mla/sm100_cutlass_mla_kernel.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -577,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # if it's possible to compile MoE kernels that use its output. cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu") + set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -595,10 +667,38 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") + message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or later " + "if you intend on running FP8 quantized MoE models on Blackwell.") + else() + message(STATUS "Not building grouped_mm_c3x as no compatible archs found " + "in CUDA target architectures.") + endif() + endif() + # moe_data.cu is used by all CUTLASS MoE kernels. 
- cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") + set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}") @@ -614,10 +714,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "in CUDA target architectures.") endif() endif() - - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu") + set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -704,6 +808,44 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "found in CUDA target architectures") endif() endif() + + # Only build W4A8 kernels if we are building for something compatible with sm90a + cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS) + set(SRCS + "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu") + + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${W4A8_ARCHS}") + + list(APPEND VLLM_EXT_SRC "${SRCS}") + + message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 + AND W4A8_ARCHS) + message(STATUS "Not building W4A8 kernels as CUDA Compiler version is " + "not >= 12.0, we recommend upgrading to CUDA 12.0 or " + "later if you intend on running w4a16 quantized models on " + "Hopper.") + else() + message(STATUS "Not building W4A8 kernels as no compatible archs " + "found in CUDA target architectures") + endif() + endif() + + # Hadacore kernels + cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}") + if(HADACORE_ARCHS) + set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${HADACORE_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + message(STATUS "Building hadacore") + endif() + # if CUDA endif endif() @@ -741,10 +883,21 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" "csrc/moe/moe_align_sum_kernels.cu" + "csrc/moe/moe_lora_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") - list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") + list(APPEND VLLM_MOE_EXT_SRC + "csrc/moe/moe_wna16.cu" + "csrc/moe/grouped_topk_kernels.cu") +endif() + +if(VLLM_GPU_LANG STREQUAL "CUDA") + set(MOE_PERMUTE_SRC + "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu" + "csrc/moe/moe_permute_unpermute_op.cu") + + list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}") 
endif() set_gencode_flags_for_srcs( @@ -805,6 +958,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set_gencode_flags_for_srcs( SRCS "${MOE_WNAA16_MARLIN_SRC}" CUDA_ARCHS "${MARLIN_MOE_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MOE_WNAA16_MARLIN_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) @@ -815,17 +972,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() -if(VLLM_GPU_LANG STREQUAL "CUDA") - set(MOE_PERMUTE_SRC - "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu" - "csrc/moe/moe_permute_unpermute_op.cu") - - set_gencode_flags_for_srcs( - SRCS "${MARLIN_PERMUTE_SRC}" - CUDA_ARCHS "${MOE_PERMUTE_ARCHS}") - - list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}") -endif() message(STATUS "Enabling moe extension.") define_gpu_extension_target( _moe_C @@ -862,6 +1008,7 @@ endif() # For CUDA we also build and ship some external projects. if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) + include(cmake/external_projects/qutlass.cmake) # vllm-flash-attn should be last as it overwrites some CMake functions include(cmake/external_projects/vllm_flash_attn.cmake) diff --git a/MANIFEST.in b/MANIFEST.in index 82fd22b845f0..fb3cccbb4a9c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,7 +2,6 @@ include LICENSE include requirements/common.txt include requirements/cuda.txt include requirements/rocm.txt -include requirements/neuron.txt include requirements/cpu.txt include CMakeLists.txt diff --git a/README.md b/README.md index dc2f0afbe353..3dcdd7dc0094 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +

@@ -13,17 +14,27 @@ Easy, fast, and cheap LLM serving for everyone | Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |

+--- +Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year! + --- *Latest News* 🔥 -- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing). + +- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing). +- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA). +- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing). +- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). -- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
Previous News +- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). +- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). +- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing). +- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing). - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). @@ -46,6 +57,7 @@ Easy, fast, and cheap LLM serving for everyone
--- + ## About vLLM is a fast and easy-to-use library for LLM inference and serving. @@ -70,11 +82,12 @@ vLLM is flexible and easy to use with: - Tensor, pipeline, data and expert parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron +- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend. - Prefix caching support - Multi-LoRA support vLLM seamlessly supports most popular open-source models on HuggingFace, including: + - Transformer-like LLMs (e.g., Llama) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) - Embedding Models (e.g., E5-Mistral) @@ -91,6 +104,7 @@ pip install vllm ``` Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. + - [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html) - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) @@ -107,6 +121,7 @@ vLLM is a community project. Our compute resources for development and testing a Cash Donations: + - a16z - Dropbox - Sequoia Capital @@ -114,6 +129,8 @@ Cash Donations: - ZhenFund Compute Resources: + +- Alibaba Cloud - AMD - Anyscale - AWS @@ -132,6 +149,7 @@ Compute Resources: - Trainy - UC Berkeley - UC San Diego +- Volcengine Slack Sponsor: Anyscale @@ -153,7 +171,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Contact Us -- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions) +- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) - For coordinating contributions and development, please use [Slack](https://slack.vllm.ai) - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature diff --git a/RELEASE.md b/RELEASE.md index 9352e7ef706c..db0d51afc7be 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -60,9 +60,10 @@ Please note: **No feature work allowed for cherry picks**. All PRs that are cons Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI. 
**Current Coverage:** + * Models: Llama3, Llama4, and Mixtral * Hardware: NVIDIA H100 and AMD MI300x -* *Note: Coverage may change based on new model releases and hardware availability* +* _Note: Coverage may change based on new model releases and hardware availability_ **Performance Validation Process:** @@ -71,11 +72,13 @@ Request write access to the [pytorch/pytorch-integration-testing](https://github **Step 2: Review Benchmark Setup** Familiarize yourself with the benchmark configurations: + * [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda) * [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm) **Step 3: Run the Benchmark** Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure: + * **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`) * **vLLM commit**: Set to the RC commit hash diff --git a/SECURITY.md b/SECURITY.md index 6053cfb41f35..d6319cdb1ac2 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,13 +1,50 @@ # Security Policy -## Reporting a Vulnerability +## Reporting security issues -If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). +## Issue triage ---- +Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). + +## Threat model Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. + +## Issue severity + +We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories: + +### CRITICAL Severity + +Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥ 9.0. + +### HIGH Severity + +Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. 
These issues typically have CVSS scores between 7.0 and 8.9 + +### MODERATE Severity + +Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9 + +### LOW Severity + +Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0 + +## Prenotification policy + +For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues. + +* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release. + +* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis. + +* Organizations and vendors who either ship or use vLLM, are eligible to join the prenotification group if they meet at least one of the following qualifications + * Substantial internal deployment leveraging the upstream vLLM project. + * Established internal security teams and comprehensive compliance measures. + * Active and consistent contributions to the upstream vLLM project. + +* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included. diff --git a/benchmarks/README.md b/benchmarks/README.md index fb8690d42db9..269a4d51ec2e 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,605 +1,20 @@ -# Benchmarking vLLM +# Benchmarks -This README guides you through running benchmark tests with the extensive -datasets supported on vLLM. It’s a living document, updated as new features and datasets -become available. +This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation. -**Dataset Overview** +## Contents - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| Dataset | Online | Offline | Data Path |
-|---------|--------|---------|-----------|
-| ShareGPT | | | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json |
-| BurstGPT | | | wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv |
-| Sonnet | | | Local file: benchmarks/sonnet.txt |
-| Random | | | synthetic |
-| HuggingFace-VisionArena | | | lmarena-ai/VisionArena-Chat |
-| HuggingFace-InstructCoder | | | likaixin/InstructCoder |
-| HuggingFace-AIMO | | | AI-MO/aimo-validation-aime, AI-MO/NuminaMath-1.5, AI-MO/NuminaMath-CoT |
-| HuggingFace-Other | | | lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered |
-| Custom | | | Local file: data.jsonl |
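If you prefer Python to `wget`, the ShareGPT file in the table above can also be fetched through `huggingface_hub`; a small sketch (the repo and filename are taken from the table, the local path is whatever the cache returns):

```python
# Sketch: download the ShareGPT dataset file via huggingface_hub instead of wget.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="anon8231489123/ShareGPT_Vicuna_unfiltered",
    filename="ShareGPT_V3_unfiltered_cleaned_split.json",
    repo_type="dataset",
)
print(path)  # local cached path to pass via --dataset-path
```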
+- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput) +- **Throughput benchmarks**: Scripts for testing offline batch inference performance +- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference +- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.) -✅: supported +## Usage -🟡: Partial support +For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli). -🚧: to be supported +For full CLI reference see: -**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` - ---- -
-🚀 Example - Online Benchmark
- -First start serving your model - -```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests -``` - -Then run the benchmarking script - -```bash -# download dataset -# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_serving.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --endpoint /v1/completions \ - --dataset-name sharegpt \ - --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ - --num-prompts 10 -``` - -If successful, you will see the following output - -``` -============ Serving Benchmark Result ============ -Successful requests: 10 -Benchmark duration (s): 5.78 -Total input tokens: 1369 -Total generated tokens: 2212 -Request throughput (req/s): 1.73 -Output token throughput (tok/s): 382.89 -Total Token throughput (tok/s): 619.85 ----------------Time to First Token---------------- -Mean TTFT (ms): 71.54 -Median TTFT (ms): 73.88 -P99 TTFT (ms): 79.49 ------Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 7.91 -Median TPOT (ms): 7.96 -P99 TPOT (ms): 8.03 ----------------Inter-token Latency---------------- -Mean ITL (ms): 7.74 -Median ITL (ms): 7.70 -P99 ITL (ms): 8.39 -================================================== -``` - -**Custom Dataset** - -If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl - -``` -{"prompt": "What is the capital of India?"} -{"prompt": "What is the capital of Iran?"} -{"prompt": "What is the capital of China?"} -``` - -```bash -# start server -VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests -``` - -```bash -# run benchmarking script -python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ - --backend vllm \ - --model meta-llama/Llama-3.1-8B-Instruct \ - --endpoint /v1/completions \ - --dataset-name custom \ - --dataset-path \ - --custom-skip-chat-template \ - --num-prompts 80 \ - --max-concurrency 1 \ - --temperature=0.3 \ - --top-p=0.75 \ - --result-dir "./log/" -``` - -You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. 
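A minimal Python sketch for producing such a `data.jsonl` file (one JSON object with a `prompt` field per line):

```python
# Sketch: build a data.jsonl file in the format expected by --dataset-name custom.
# Each line is a standalone JSON object with a "prompt" field.
import json

prompts = [
    "What is the capital of India?",
    "What is the capital of Iran?",
    "What is the capital of China?",
]

with open("data.jsonl", "w", encoding="utf-8") as f:
    for prompt in prompts:
        f.write(json.dumps({"prompt": prompt}) + "\n")
```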
- -**VisionArena Benchmark for Vision Language Models** - -```bash -# need a model with vision capability here -vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests -``` - -```bash -python3 vllm/benchmarks/benchmark_serving.py \ - --backend openai-chat \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --endpoint /v1/chat/completions \ - --dataset-name hf \ - --dataset-path lmarena-ai/VisionArena-Chat \ - --hf-split train \ - --num-prompts 1000 -``` - -**InstructCoder Benchmark with Speculative Decoding** - -``` bash -VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ - --speculative-config $'{"method": "ngram", - "num_speculative_tokens": 5, "prompt_lookup_max": 5, - "prompt_lookup_min": 2}' -``` - -``` bash -python3 benchmarks/benchmark_serving.py \ - --model meta-llama/Meta-Llama-3-8B-Instruct \ - --dataset-name hf \ - --dataset-path likaixin/InstructCoder \ - --num-prompts 2048 -``` - -**Other HuggingFaceDataset Examples** - -```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests -``` - -**`lmms-lab/LLaVA-OneVision-Data`** - -```bash -python3 vllm/benchmarks/benchmark_serving.py \ - --backend openai-chat \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --endpoint /v1/chat/completions \ - --dataset-name hf \ - --dataset-path lmms-lab/LLaVA-OneVision-Data \ - --hf-split train \ - --hf-subset "chart2text(cauldron)" \ - --num-prompts 10 -``` - -**`Aeala/ShareGPT_Vicuna_unfiltered`** - -```bash -python3 vllm/benchmarks/benchmark_serving.py \ - --backend openai-chat \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --endpoint /v1/chat/completions \ - --dataset-name hf \ - --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ - --hf-split train \ - --num-prompts 10 -``` - -**`AI-MO/aimo-validation-aime`** - -``` bash -python3 vllm/benchmarks/benchmark_serving.py \ - --model Qwen/QwQ-32B \ - --dataset-name hf \ - --dataset-path AI-MO/aimo-validation-aime \ - --num-prompts 10 \ - --seed 42 -``` - -**`philschmid/mt-bench`** - -``` bash -python3 vllm/benchmarks/benchmark_serving.py \ - --model Qwen/QwQ-32B \ - --dataset-name hf \ - --dataset-path philschmid/mt-bench \ - --num-prompts 80 -``` - -**Running With Sampling Parameters** - -When using OpenAI-compatible backends such as `vllm`, optional sampling -parameters can be specified. Example client command: - -```bash -python3 vllm/benchmarks/benchmark_serving.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --endpoint /v1/completions \ - --dataset-name sharegpt \ - --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ - --top-k 10 \ - --top-p 0.9 \ - --temperature 0.5 \ - --num-prompts 10 -``` - -**Running With Ramp-Up Request Rate** - -The benchmark tool also supports ramping up the request rate over the -duration of the benchmark run. This can be useful for stress testing the -server or finding the maximum throughput that it can handle, given some latency budget. - -Two ramp-up strategies are supported: -- `linear`: Increases the request rate linearly from a start value to an end value. -- `exponential`: Increases the request rate exponentially. - -The following arguments can be used to control the ramp-up: -- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`). -- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark. -- `--ramp-up-end-rps`: The request rate at the end of the benchmark. - -
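As a rough illustration of the two ramp-up strategies (a sketch only; the benchmark script's internal schedule may differ in detail), the request rate can be interpolated between the start and end values as a function of benchmark progress:

```python
# Sketch: interpolate a request rate (RPS) between start and end values as the
# benchmark progresses. "linear" and "exponential" mirror the strategies above.
def ramp_up_rps(progress: float, start_rps: float, end_rps: float, strategy: str) -> float:
    """progress is the fraction of the benchmark completed, in [0, 1]."""
    if strategy == "linear":
        return start_rps + (end_rps - start_rps) * progress
    if strategy == "exponential":
        # Geometric interpolation; requires strictly positive start/end rates.
        return start_rps * (end_rps / start_rps) ** progress
    raise ValueError(f"unknown ramp-up strategy: {strategy}")


for pct in (0, 25, 50, 75, 100):
    rps = ramp_up_rps(pct / 100, start_rps=1.0, end_rps=10.0, strategy="exponential")
    print(f"{pct}% of run -> {rps:.2f} req/s")
```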
-📈 Example - Offline Throughput Benchmark
- -```bash -python3 vllm/benchmarks/benchmark_throughput.py \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset-name sonnet \ - --dataset-path vllm/benchmarks/sonnet.txt \ - --num-prompts 10 -``` - -If successful, you will see the following output - -``` -Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s -Total num prompt tokens: 5014 -Total num output tokens: 1500 -``` - -**VisionArena Benchmark for Vision Language Models** - -``` bash -python3 vllm/benchmarks/benchmark_throughput.py \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --backend vllm-chat \ - --dataset-name hf \ - --dataset-path lmarena-ai/VisionArena-Chat \ - --num-prompts 1000 \ - --hf-split train -``` - -The `num prompt tokens` now includes image token counts - -``` -Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s -Total num prompt tokens: 14527 -Total num output tokens: 1280 -``` - -**InstructCoder Benchmark with Speculative Decoding** - -``` bash -VLLM_WORKER_MULTIPROC_METHOD=spawn \ -VLLM_USE_V1=1 \ -python3 vllm/benchmarks/benchmark_throughput.py \ - --dataset-name=hf \ - --dataset-path=likaixin/InstructCoder \ - --model=meta-llama/Meta-Llama-3-8B-Instruct \ - --input-len=1000 \ - --output-len=100 \ - --num-prompts=2048 \ - --async-engine \ - --speculative-config $'{"method": "ngram", - "num_speculative_tokens": 5, "prompt_lookup_max": 5, - "prompt_lookup_min": 2}' -``` - -``` -Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s -Total num prompt tokens: 261136 -Total num output tokens: 204800 -``` - -**Other HuggingFaceDataset Examples** - -**`lmms-lab/LLaVA-OneVision-Data`** - -```bash -python3 vllm/benchmarks/benchmark_throughput.py \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --backend vllm-chat \ - --dataset-name hf \ - --dataset-path lmms-lab/LLaVA-OneVision-Data \ - --hf-split train \ - --hf-subset "chart2text(cauldron)" \ - --num-prompts 10 -``` - -**`Aeala/ShareGPT_Vicuna_unfiltered`** - -```bash -python3 vllm/benchmarks/benchmark_throughput.py \ - --model Qwen/Qwen2-VL-7B-Instruct \ - --backend vllm-chat \ - --dataset-name hf \ - --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ - --hf-split train \ - --num-prompts 10 -``` - -**`AI-MO/aimo-validation-aime`** - -```bash -python3 benchmarks/benchmark_throughput.py \ - --model Qwen/QwQ-32B \ - --backend vllm \ - --dataset-name hf \ - --dataset-path AI-MO/aimo-validation-aime \ - --hf-split train \ - --num-prompts 10 -``` - -**Benchmark with LoRA Adapters** - -``` bash -# download dataset -# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 vllm/benchmarks/benchmark_throughput.py \ - --model meta-llama/Llama-2-7b-hf \ - --backend vllm \ - --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \ - --dataset_name sharegpt \ - --num-prompts 10 \ - --max-loras 2 \ - --max-lora-rank 8 \ - --enable-lora \ - --lora-path yard1/llama-2-7b-sql-lora-test - ``` - -
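For a quick sanity check outside `benchmark_throughput.py`, a minimal sketch using vLLM's offline `LLM` API (the model and prompt are placeholders; the numbers will not match the tuned benchmark script):

```python
# Sketch: measure rough offline generation throughput with the vLLM LLM API.
# This is a simplified stand-in for benchmark_throughput.py, not a replacement.
import time

from vllm import LLM, SamplingParams

llm = LLM(model="NousResearch/Hermes-3-Llama-3.1-8B")
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
prompts = ["Write a short poem about the sea."] * 32

start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
elapsed = time.perf_counter() - start

num_output_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
print(f"Requests/s: {len(prompts) / elapsed:.2f}")
print(f"Output tokens/s: {num_output_tokens / elapsed:.2f}")
```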
-🛠️ Example - Structured Output Benchmark
- -Benchmark the performance of structured output generation (JSON, grammar, regex). - -**Server Setup** - -```bash -vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests -``` - -**JSON Schema Benchmark** - -```bash -python3 benchmarks/benchmark_serving_structured_output.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset json \ - --structured-output-ratio 1.0 \ - --request-rate 10 \ - --num-prompts 1000 -``` - -**Grammar-based Generation Benchmark** - -```bash -python3 benchmarks/benchmark_serving_structured_output.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset grammar \ - --structure-type grammar \ - --request-rate 10 \ - --num-prompts 1000 -``` - -**Regex-based Generation Benchmark** - -```bash -python3 benchmarks/benchmark_serving_structured_output.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset regex \ - --request-rate 10 \ - --num-prompts 1000 -``` - -**Choice-based Generation Benchmark** - -```bash -python3 benchmarks/benchmark_serving_structured_output.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset choice \ - --request-rate 10 \ - --num-prompts 1000 -``` - -**XGrammar Benchmark Dataset** - -```bash -python3 benchmarks/benchmark_serving_structured_output.py \ - --backend vllm \ - --model NousResearch/Hermes-3-Llama-3.1-8B \ - --dataset xgrammar_bench \ - --request-rate 10 \ - --num-prompts 1000 -``` - -
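For reference, a single structured-output request can be reproduced with the OpenAI client against the server started above, assuming the deployed vLLM version supports the `guided_json` extra-body field (a sketch, not part of the benchmark script):

```python
# Sketch: send one JSON-schema-constrained request to a running vLLM server.
# Assumes the server exposes vLLM's `guided_json` extra_body extension.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
    "required": ["name", "age"],
}

completion = client.chat.completions.create(
    model="NousResearch/Hermes-3-Llama-3.1-8B",
    messages=[{"role": "user", "content": "Describe a fictional person as JSON."}],
    extra_body={"guided_json": schema},
)
print(completion.choices[0].message.content)
```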
-📚 Example - Long Document QA Benchmark
- -Benchmark the performance of long document question-answering with prefix caching. - -**Basic Long Document QA Test** - -```bash -python3 benchmarks/benchmark_long_document_qa_throughput.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-documents 16 \ - --document-length 2000 \ - --output-len 50 \ - --repeat-count 5 -``` - -**Different Repeat Modes** - -```bash -# Random mode (default) - shuffle prompts randomly -python3 benchmarks/benchmark_long_document_qa_throughput.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-documents 8 \ - --document-length 3000 \ - --repeat-count 3 \ - --repeat-mode random - -# Tile mode - repeat entire prompt list in sequence -python3 benchmarks/benchmark_long_document_qa_throughput.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-documents 8 \ - --document-length 3000 \ - --repeat-count 3 \ - --repeat-mode tile - -# Interleave mode - repeat each prompt consecutively -python3 benchmarks/benchmark_long_document_qa_throughput.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-documents 8 \ - --document-length 3000 \ - --repeat-count 3 \ - --repeat-mode interleave -``` - -
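The three repeat modes differ only in how the repeated prompt list is ordered; a small sketch of the intended orderings (illustrative, not the script's exact implementation):

```python
# Sketch: how "random", "tile", and "interleave" repeat modes order prompts.
import random


def expand_prompts(prompts: list[str], repeat_count: int, repeat_mode: str) -> list[str]:
    if repeat_mode == "tile":
        # Repeat the whole list in sequence: A B C A B C ...
        return prompts * repeat_count
    if repeat_mode == "interleave":
        # Repeat each prompt consecutively: A A ... B B ... C C ...
        return [p for p in prompts for _ in range(repeat_count)]
    if repeat_mode == "random":
        # Repeat, then shuffle the combined list.
        expanded = prompts * repeat_count
        random.shuffle(expanded)
        return expanded
    raise ValueError(f"unknown repeat mode: {repeat_mode}")


print(expand_prompts(["A", "B", "C"], repeat_count=2, repeat_mode="interleave"))
```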
-🗂️ Example - Prefix Caching Benchmark
- -Benchmark the efficiency of automatic prefix caching. - -**Fixed Prompt with Prefix Caching** - -```bash -python3 benchmarks/benchmark_prefix_caching.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --enable-prefix-caching \ - --num-prompts 1 \ - --repeat-count 100 \ - --input-length-range 128:256 -``` - -**ShareGPT Dataset with Prefix Caching** - -```bash -# download dataset -# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -python3 benchmarks/benchmark_prefix_caching.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \ - --enable-prefix-caching \ - --num-prompts 20 \ - --repeat-count 5 \ - --input-length-range 128:256 -``` - -
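To observe the effect of automatic prefix caching outside the benchmark script, a rough offline sketch that runs the same shared-prefix prompts twice (timings are indicative only; the model name mirrors the commands above):

```python
# Sketch: time a second pass over prompts that share a long prefix,
# with automatic prefix caching enabled in the offline LLM API.
import time

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", enable_prefix_caching=True)
params = SamplingParams(temperature=0.0, max_tokens=32)

shared_prefix = "Document: " + ("lorem ipsum " * 400)
prompts = [shared_prefix + f"\nQuestion {i}: summarize the document." for i in range(8)]

for label in ("cold pass", "warm pass"):
    start = time.perf_counter()
    llm.generate(prompts, params)
    print(f"{label}: {time.perf_counter() - start:.2f}s")
```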
-⚡ Example - Request Prioritization Benchmark
- -Benchmark the performance of request prioritization in vLLM. - -**Basic Prioritization Test** - -```bash -python3 benchmarks/benchmark_prioritization.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --input-len 128 \ - --output-len 64 \ - --num-prompts 100 \ - --scheduling-policy priority -``` - -**Multiple Sequences per Prompt** - -```bash -python3 benchmarks/benchmark_prioritization.py \ - --model meta-llama/Llama-2-7b-chat-hf \ - --input-len 128 \ - --output-len 64 \ - --num-prompts 100 \ - --scheduling-policy priority \ - --n 2 -``` - -
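Conceptually, a priority scheduling policy serves waiting requests in priority order rather than arrival order; a toy sketch of such a queue (illustrative only, not vLLM's actual scheduler, which also handles preemption and token budgets):

```python
# Sketch: a toy priority queue over requests, illustrating the scheduling policy
# being benchmarked above. Lower priority value is served first in this sketch.
import heapq
import itertools

counter = itertools.count()  # tie-breaker keeps FIFO order within a priority
queue: list[tuple[int, int, str]] = []


def submit(priority: int, request_id: str) -> None:
    heapq.heappush(queue, (priority, next(counter), request_id))


submit(1, "background-summarization")
submit(0, "interactive-chat")
submit(1, "batch-eval")

while queue:
    priority, _, request_id = heapq.heappop(queue)
    print(f"serving {request_id} (priority={priority})")
```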
+- +- +- diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md index 7732f50b1d22..d1bdb4c43f10 100644 --- a/benchmarks/auto_tune/README.md +++ b/benchmarks/auto_tune/README.md @@ -3,6 +3,7 @@ This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate. ## Table of Contents + - [Prerequisites](#prerequisites) - [Configuration](#configuration) - [How to Run](#how-to-run) @@ -30,6 +31,12 @@ cd vllm You must set the following variables at the top of the script before execution. + Note: You can also override the default values below via environment variables when running the script. + +```bash +MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh +``` + | Variable | Description | Example Value | | --- | --- | --- | | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` | @@ -39,6 +46,7 @@ You must set the following variables at the top of the script before execution. | `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) | | `INPUT_LEN` | **Required.** Request input length. | `4000` | | `OUTPUT_LEN` | **Required.** Request output length. | `16` | +| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` | | `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` | | `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` | | `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` | @@ -51,7 +59,7 @@ You must set the following variables at the top of the script before execution. 1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section. 2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost. -``` +```bash cd bash auto_tune.sh ``` @@ -63,34 +71,40 @@ bash auto_tune.sh Here are a few examples of how to configure the script for different goals: ### 1. Maximize Throughput (No Latency Constraint) + - **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens. - **Configuration**: ```bash INPUT_LEN=1800 OUTPUT_LEN=20 +MAX_MODEL_LEN=2048 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number ``` #### 2. Maximize Throughput with a Latency Requirement + - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. - **Configuration**: ```bash INPUT_LEN=1800 OUTPUT_LEN=20 +MAX_MODEL_LEN=2048 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=500 ``` #### 3. Maximize Throughput with Prefix Caching and Latency Requirements + - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. 
- **Configuration**: ```bash INPUT_LEN=1800 OUTPUT_LEN=20 +MAX_MODEL_LEN=2048 MIN_CACHE_HIT_PCT=60 MAX_LATENCY_ALLOWED_MS=500 ``` @@ -101,11 +115,11 @@ After the script finishes, you will find the results in a new, timestamped direc - **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run: - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination. - - `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run. + - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run. - **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found. -``` +```text # Example result.txt content hash:a1b2c3d4... max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8 @@ -135,3 +149,70 @@ The script follows a systematic process to find the optimal parameters: 4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far. 5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard. + +## Batched `auto_tune` + +The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file. + +### Prerequisites + +- **jq**: This script requires `jq` to parse the JSON configuration file. +- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated. + +### How to Run + +1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run. + +2. **Execute the script**: + + ```bash + bash batch_auto_tune.sh [gcs_upload_path] + ``` + + - ``: **Required.** Path to your JSON configuration file. + - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`). + +### Configuration File + +The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run. 
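For example, a key such as `max_latency_allowed_ms` becomes the environment variable `MAX_LATENCY_ALLOWED_MS`; a Python sketch of the same mapping implemented by the `jq`/`tr` pipeline in `batch_auto_tune.sh`:

```python
# Sketch: mirror batch_auto_tune.sh's key-to-environment-variable mapping
# (uppercase the key, then keep only A-Z, 0-9, and underscores).
import re


def to_env_var(key: str) -> str:
    return re.sub(r"[^A-Z0-9_]", "", key.upper())


config = {"model": "meta-llama/Llama-3.1-8B-Instruct", "max_latency_allowed_ms": 500}
env = {to_env_var(k): str(v) for k, v in config.items()}
print(env)  # {'MODEL': 'meta-llama/Llama-3.1-8B-Instruct', 'MAX_LATENCY_ALLOWED_MS': '500'}
```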
+ +Here is an example `runs_config.json` with two benchmark configurations (`system` can be `TPU` or `GPU`): + +```json +[ + { + "base": "/home/user", + "model": "meta-llama/Llama-3.1-8B-Instruct", + "system": "TPU", + "tp": 8, + "input_len": 128, + "output_len": 2048, + "max_model_len": 2300, + "num_seqs_list": "128 256", + "num_batched_tokens_list": "8192 16384" + }, + { + "base": "/home/user", + "model": "meta-llama/Llama-3.1-70B-Instruct", + "system": "TPU", + "tp": 8, + "input_len": 4000, + "output_len": 16, + "max_model_len": 4096, + "num_seqs_list": "64 128", + "num_batched_tokens_list": "4096 8192", + "max_latency_allowed_ms": 500 + } +] +``` + +### Output + +The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added: + +- `run_id`: A unique identifier for the run, derived from the timestamp. +- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`). +- `results`: The content of the `result.txt` file from the `auto_tune.sh` run. +- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided). + +A summary of successful and failed runs is also printed to the console upon completion. diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 159ee1421475..56b721cbb402 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -1,27 +1,45 @@ #!/bin/bash -# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. +# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. # See details in README (benchmarks/auto_tune/README.md). TAG=$(date +"%Y_%m_%d_%H_%M") -BASE="" -MODEL="meta-llama/Llama-3.1-8B-Instruct" -SYSTEM="TPU" -TP=1 -DOWNLOAD_DIR="" -INPUT_LEN=4000 -OUTPUT_LEN=16 -MIN_CACHE_HIT_PCT=0 -MAX_LATENCY_ALLOWED_MS=100000000000 -NUM_SEQS_LIST="128 256" -NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096" +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO} +BASE=${BASE:-"$SCRIPT_DIR/../../.."} +MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"} +SYSTEM=${SYSTEM:-"TPU"} +TP=${TP:-1} +DOWNLOAD_DIR=${DOWNLOAD_DIR:-""} +INPUT_LEN=${INPUT_LEN:-4000} +OUTPUT_LEN=${OUTPUT_LEN:-16} +MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096} +MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0} +MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000} +NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"} +NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"} LOG_FOLDER="$BASE/auto-benchmark/$TAG" RESULT="$LOG_FOLDER/result.txt" PROFILE_PATH="$LOG_FOLDER/profile" -echo "result file: $RESULT" -echo "model: $MODEL" +echo "====================== AUTO TUNE PARAMETERS ====================" +echo "SCRIPT_DIR=$SCRIPT_DIR" +echo "BASE=$BASE" +echo "MODEL=$MODEL" +echo "SYSTEM=$SYSTEM" +echo "TP=$TP" +echo "DOWNLOAD_DIR=$DOWNLOAD_DIR" +echo "INPUT_LEN=$INPUT_LEN" +echo "OUTPUT_LEN=$OUTPUT_LEN" +echo "MAX_MODEL_LEN=$MAX_MODEL_LEN" +echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT" +echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS" +echo "NUM_SEQS_LIST=$NUM_SEQS_LIST" +echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST" +echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL" +echo "RESULT_FILE=$RESULT" +echo "====================== AUTO TUNE PARAMETERS ====================" rm -rf $LOG_FOLDER rm -rf $PROFILE_PATH @@ -36,10 +54,18 @@ current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT" echo "current_hash: $current_hash" +TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN)) +RED='\033[0;31m' +if (( TOTAL_LEN > MAX_MODEL_LEN )); then + echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2 + exit 1 +fi + best_throughput=0 best_max_num_seqs=0 best_num_batched_tokens=0 best_goodput=0 +best_request_rate=0 start_server() { local gpu_memory_utilization=$1 @@ -47,26 +73,47 @@ start_server() { local max_num_batched_tokens=$3 local vllm_log=$4 local profile_dir=$5 - - pkill -f vllm - VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \ - --disable-log-requests \ - --port 8004 \ - --gpu-memory-utilization $gpu_memory_utilization \ - --max-num-seqs $max_num_seqs \ - --max-num-batched-tokens $max_num_batched_tokens \ - --tensor-parallel-size $TP \ - --enable-prefix-caching \ - --load-format dummy \ - --download-dir "$DOWNLOAD_DIR" \ - --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 & + pkill -if "vllm serve" || true + + # Define the common arguments as a bash array. + # Each argument and its value are separate elements. + local common_args_array=( + "$MODEL" + "--disable-log-requests" + "--port" "8004" + "--gpu-memory-utilization" "$gpu_memory_utilization" + "--max-num-seqs" "$max_num_seqs" + "--max-num-batched-tokens" "$max_num_batched_tokens" + "--tensor-parallel-size" "$TP" + "--enable-prefix-caching" + "--load-format" "dummy" + "--download-dir" "$DOWNLOAD_DIR" + "--max-model-len" "$MAX_MODEL_LEN" + ) + + # Use the array expansion "${common_args_array[@]}" + # This correctly passes each element as a separate argument. + if [[ -n "$profile_dir" ]]; then + # Start server with profiling enabled + VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ + vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + else + # Start server without profiling + VLLM_SERVER_DEV_MODE=1 \ + vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + fi + local server_pid=$! # wait for 10 minutes... server_started=0 - for i in {1..60}; do + for i in {1..60}; do + # This line checks whether the server is still alive or not, + # since that we should always have permission to send signal to the server process. + kill -0 $server_pid 2> /dev/null || break + RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) - STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) if [[ "$STATUS_CODE" -eq 200 ]]; then server_started=1 break @@ -74,45 +121,29 @@ start_server() { sleep 10 fi done + if (( ! server_started )); then - echo "server did not start within 10 minutes. Please check server log at $vllm_log". + echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log". 
return 1 else return 0 fi } -update_best_profile() { - local profile_dir=$1 - local profile_index=$2 - sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort)) - selected_profile_file= - if [[ "$SYSTEM" == "TPU" ]]; then - selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb" - fi - if [[ "$SYSTEM" == "GPU" ]]; then - selected_profile_file="${sorted_paths[$profile_index]}" - fi - rm -f $PROFILE_PATH/* - cp $selected_profile_file $PROFILE_PATH -} - run_benchmark() { local max_num_seqs=$1 local max_num_batched_tokens=$2 local gpu_memory_utilization=$3 echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" - local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}" echo "vllm_log: $vllm_log" echo rm -f $vllm_log - mkdir -p $profile_dir - pkill -f vllm - local profile_index=0 + pkill -if "vllm serve" || true echo "starting server..." - start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir + # Call start_server without a profile_dir to avoid profiling overhead + start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log "" result=$? if [[ "$result" -eq 1 ]]; then echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" @@ -120,17 +151,19 @@ run_benchmark() { echo "server started." fi echo - + echo "run benchmark test..." meet_latency_requirement=0 # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) - python benchmarks/benchmark_serving.py \ + adjusted_input_len=$(( INPUT_LEN - prefix_len )) + # --profile flag is removed from this call + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ - --random-input-len $INPUT_LEN \ + --random-input-len $adjusted_input_len \ --random-output-len $OUTPUT_LEN \ --ignore-eos \ --disable-tqdm \ @@ -139,8 +172,7 @@ run_benchmark() { --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --num-prompts 1000 \ --random-prefix-len $prefix_len \ - --port 8004 \ - --profile &> "$bm_log" + --port 8004 &> "$bm_log" throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') @@ -154,16 +186,15 @@ run_benchmark() { # start from request-rate as int(throughput) + 1 request_rate=$((${throughput%.*} + 1)) while ((request_rate > 0)); do - profile_index=$((profile_index+1)) # clear prefix cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache sleep 5 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" - python benchmarks/benchmark_serving.py \ + vllm bench serve \ --backend vllm \ --model $MODEL \ --dataset-name random \ - --random-input-len $INPUT_LEN \ + --random-input-len $adjusted_input_len \ --random-output-len $OUTPUT_LEN \ --ignore-eos \ --disable-tqdm \ @@ -192,12 +223,7 @@ run_benchmark() { best_max_num_seqs=$max_num_seqs best_num_batched_tokens=$max_num_batched_tokens best_goodput=$goodput - if [[ "$SYSTEM" == "TPU" ]]; then - update_best_profile "$profile_dir/plugins/profile" $profile_index - fi - if [[ "$SYSTEM" == "GPU" ]]; then - update_best_profile "$profile_dir" $profile_index 
- fi + best_request_rate=$request_rate fi else echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" @@ -206,9 +232,9 @@ run_benchmark() { echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" - pkill vllm + pkill -if "vllm serve" || true sleep 10 - printf '=%.0s' $(seq 1 20) + echo "====================" return 0 } @@ -219,7 +245,8 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST" gpu_memory_utilization=0.98 find_gpu_memory_utilization=0 while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do - start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" + # Pass empty string for profile_dir argument + start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" "" result=$? if [[ "$result" -eq 0 ]]; then find_gpu_memory_utilization=1 @@ -242,6 +269,45 @@ for num_seqs in "${num_seqs_list[@]}"; do done done echo "finish permutations" + +# ================================================================================= +# FINAL PROFILING RUN FOR THE BEST CONFIGURATION +# ================================================================================= +if (( $(echo "$best_throughput > 0" | bc -l) )); then + echo + echo "Benchmark tuning finished. Now running profiling on the best configuration found..." + echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput" + echo + + vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt" + bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt" + + # Start server with the best params and profiling ENABLED + echo "Starting server for profiling..." + start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH" + + # Run benchmark with the best params and the --profile flag + echo "Running benchmark with profiling..." + prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) + adjusted_input_len=$(( INPUT_LEN - prefix_len )) + vllm bench serve \ + --backend vllm \ + --model $MODEL \ + --dataset-name random \ + --random-input-len $adjusted_input_len \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ + --disable-tqdm \ + --request-rate $best_request_rate \ + --percentile-metrics ttft,tpot,itl,e2el \ + --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --num-prompts 100 \ + --random-prefix-len $prefix_len \ + --port 8004 \ + --profile &> "$bm_log" +else + echo "No configuration met the latency requirements. Skipping final profiling run." 
+fi +pkill -if "vllm serve" || true echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" - diff --git a/benchmarks/auto_tune/batch_auto_tune.sh b/benchmarks/auto_tune/batch_auto_tune.sh new file mode 100755 index 000000000000..57ef20daf6b7 --- /dev/null +++ b/benchmarks/auto_tune/batch_auto_tune.sh @@ -0,0 +1,128 @@ +#!/bin/bash + +INPUT_JSON="$1" +GCS_PATH="$2" # Optional GCS path for uploading results for each run + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh" + +if [[ -z "$INPUT_JSON" ]]; then + echo "Error: Input JSON file not provided." + echo "Usage: $0 [gcs_upload_path]" + exit 1 +fi + +if [[ ! -f "$INPUT_JSON" ]]; then + echo "Error: File not found at '$INPUT_JSON'" + exit 1 +fi + +if ! command -v jq &> /dev/null; then + echo "Error: 'jq' command not found. Please install jq to process the JSON input." + exit 1 +fi + +if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then + echo "Error: 'gcloud' command not found, but a GCS_PATH was provided." + exit 1 +fi + +SUCCESS_COUNT=0 +FAILURE_COUNT=0 +FAILED_RUNS=() +SCRIPT_START_TIME=$(date +%s) + +json_content=$(cat "$INPUT_JSON") +if ! num_runs=$(echo "$json_content" | jq 'length'); then + echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2 + exit 1 +fi + +echo "Found $num_runs benchmark configurations in $INPUT_JSON." +echo "Starting benchmark runs..." +echo "--------------------------------------------------" + +for i in $(seq 0 $(($num_runs - 1))); do + run_object=$(echo "$json_content" | jq ".[$i]") + + RUN_START_TIME=$(date +%s) + ENV_VARS_ARRAY=() + # Dynamically create env vars from the JSON object's keys + for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do + value=$(echo "$run_object" | jq -r ".$key") + var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_') + ENV_VARS_ARRAY+=("${var_name}=${value}") + done + + echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}" + + # Execute auto_tune.sh and capture output + RUN_OUTPUT_FILE=$(mktemp) + if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then + STATUS="SUCCESS" + ((SUCCESS_COUNT++)) + else + STATUS="FAILURE" + ((FAILURE_COUNT++)) + FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)") + fi + + RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE") + rm "$RUN_OUTPUT_FILE" + + # Parse results and optionally upload them to GCS + RUN_ID="" + RESULTS="" + GCS_RESULTS_URL="" + if [[ "$STATUS" == "SUCCESS" ]]; then + RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true) + + if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then + RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")") + RESULT_DIR=$(dirname "$RESULT_FILE_PATH") + RESULTS=$(cat "$RESULT_FILE_PATH") + + if [[ -n "$GCS_PATH" ]]; then + GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}" + echo "Uploading results to GCS..." + if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then + echo "GCS upload successful." + else + echo "Warning: GCS upload failed for RUN_ID $RUN_ID." + fi + fi + else + echo "Warning: Could not find result file for a successful run." 
+ STATUS="WARNING_NO_RESULT_FILE" + fi + fi + + # Add the results back into the JSON object for this run + json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \ + '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}') + + RUN_END_TIME=$(date +%s) + echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS" + echo "--------------------------------------------------" + + # Save intermediate progress back to the file + echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON" + +done + +SCRIPT_END_TIME=$(date +%s) +echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds." +echo +echo "====================== SUMMARY ======================" +echo "Successful runs: $SUCCESS_COUNT" +echo "Failed runs: $FAILURE_COUNT" +echo "===================================================" + +if [[ $FAILURE_COUNT -gt 0 ]]; then + echo "Details of failed runs (see JSON file for full parameters):" + for failed in "${FAILED_RUNS[@]}"; do + echo " - $failed" + done +fi + +echo "Updated results have been saved to '$INPUT_JSON'." diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index c7229dbb8e90..4021fede7215 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -8,7 +8,6 @@ import time import traceback from dataclasses import dataclass, field -from typing import Optional, Union import aiohttp import huggingface_hub.constants @@ -28,12 +27,13 @@ class RequestFuncInput: prompt_len: int output_len: int model: str - model_name: Optional[str] = None - logprobs: Optional[int] = None - extra_body: Optional[dict] = None - multi_modal_content: Optional[dict] = None + model_name: str | None = None + logprobs: int | None = None + extra_body: dict | None = None + multi_modal_content: dict | list[dict] | None = None ignore_eos: bool = False - language: Optional[str] = None + language: str | None = None + request_id: str | None = None @dataclass @@ -51,7 +51,7 @@ class RequestFuncOutput: async def async_request_tgi( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") @@ -71,6 +71,9 @@ async def async_request_tgi( "inputs": request_func_input.prompt, "parameters": params, } + headers = None + if request_func_input.request_id: + headers = {"x-request-id": request_func_input.request_id} output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len if request_func_input.ignore_eos: @@ -82,7 +85,9 @@ async def async_request_tgi( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, json=payload) as response: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: if response.status == 200: async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() @@ -127,7 +132,7 @@ async def async_request_tgi( async def async_request_trt_llm( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") @@ -145,6 +150,9 @@ async def async_request_trt_llm( } if request_func_input.ignore_eos: payload["min_length"] = request_func_input.output_len + headers = None 
+ if request_func_input.request_id: + headers = {"x-request-id": request_func_input.request_id} output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -152,7 +160,9 @@ async def async_request_trt_llm( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, json=payload) as response: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: if response.status == 200: async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() @@ -193,7 +203,7 @@ async def async_request_trt_llm( async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("completions", "profile")), ( @@ -211,6 +221,8 @@ async def async_request_deepspeed_mii( "top_p": 1.0, } headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -254,7 +266,7 @@ async def async_request_deepspeed_mii( async def async_request_openai_completions( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("completions", "profile")), ( @@ -283,6 +295,8 @@ async def async_request_openai_completions( if request_func_input.extra_body: payload.update(request_func_input.extra_body) headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -352,7 +366,7 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("chat/completions", "profile")), ( @@ -364,7 +378,15 @@ async def async_request_openai_chat_completions( ) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: - content.append(request_func_input.multi_modal_content) + mm_content = request_func_input.multi_modal_content + if isinstance(mm_content, list): + content.extend(mm_content) + elif isinstance(mm_content, dict): + content.append(mm_content) + else: + raise TypeError( + "multi_modal_content must be a dict or list[dict] for openai-chat" + ) payload = { "model": request_func_input.model_name if request_func_input.model_name @@ -387,6 +409,8 @@ async def async_request_openai_chat_completions( "Content-Type": "application/json", "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", } + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -451,7 +475,7 @@ async def async_request_openai_chat_completions( async def async_request_openai_audio( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: # Lazy import without PlaceholderModule to avoid vllm dep. 
import soundfile @@ -483,6 +507,8 @@ async def async_request_openai_audio( headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", } + if request_func_input.request_id: + headers["x-request-id"] = request_func_input.request_id # Send audio file def to_bytes(y, sr): @@ -491,7 +517,10 @@ def to_bytes(y, sr): buffer.seek(0) return buffer - with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + mm_audio = request_func_input.multi_modal_content + if not isinstance(mm_audio, dict) or "audio" not in mm_audio: + raise TypeError("multi_modal_content must be a dict containing 'audio'") + with to_bytes(*mm_audio["audio"]) as f: form = aiohttp.FormData() form.add_field("file", f, content_type="audio/wav") for key, value in payload.items(): @@ -580,7 +609,7 @@ def get_tokenizer( tokenizer_mode: str = "auto", trust_remote_code: bool = False, **kwargs, -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: +) -> PreTrainedTokenizer | PreTrainedTokenizerFast: if pretrained_model_name_or_path is not None and not os.path.exists( pretrained_model_name_or_path ): diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py new file mode 100644 index 000000000000..5434f8b6a4e4 --- /dev/null +++ b/benchmarks/benchmark_block_pool.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc + +from benchmark_utils import TimeCollector +from tabulate import tabulate + +from vllm.utils import FlexibleArgumentParser +from vllm.v1.core.block_pool import BlockPool + + +def main(args): + rows = [] + for allocate_block in args.allocate_blocks: + # Enforce a GC collect ahead to minimize the impact among runs + gc.collect() + block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True) + + get_blocks_times = TimeCollector(TimeCollector.US) + free_blocks_times = TimeCollector(TimeCollector.US) + for _ in range(args.num_iteration): + with get_blocks_times: + blocks = block_pool.get_new_blocks(allocate_block) + with free_blocks_times: + block_pool.free_blocks(blocks) + + rows.append( + [get_blocks_times.cnt, args.num_gpu_blocks, allocate_block] + + get_blocks_times.dump_avg_max() + + free_blocks_times.dump_avg_max() + ) + + print( + tabulate( + rows, + headers=[ + "Iterations", + "Total\nBlocks", + "Allocated\nBlocks", + "Get Blocks\nAvg (us)", + "Get Blocks\nMax (us)", + "Free Blocks\nAvg (us)", + "Free Blocks\nMax (us)", + ], + tablefmt="grid", + floatfmt=".3f", + ) + ) + + +def invoke_main() -> None: + parser = FlexibleArgumentParser( + description="Benchmark the performance of BlockPool for KV Cache." 
+ ) + parser.add_argument("--num-gpu-blocks", type=int, default=100000) + parser.add_argument( + "--num-iteration", + type=int, + default=1000, + help="Number of iterations to run to stabilize final data readings", + ) + parser.add_argument( + "--allocate-blocks", + type=int, + nargs="*", + default=[10, 50, 100, 500, 1000], + help="Number of blocks to allocate", + ) + args = parser.parse_args() + main(args) + + +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py deleted file mode 100644 index 1ad6cef7a9db..000000000000 --- a/benchmarks/benchmark_dataset.py +++ /dev/null @@ -1,1173 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This module defines a framework for sampling benchmark requests from various -datasets. Each dataset subclass of BenchmarkDataset must implement sample -generation. Supported dataset types include: - - ShareGPT - - Random (synthetic) - - Sonnet - - BurstGPT - - HuggingFace - - VisionArena -""" - -import base64 -import io -import json -import logging -import random -from abc import ABC, abstractmethod -from collections.abc import Mapping -from dataclasses import dataclass -from functools import cache -from io import BytesIO -from typing import Any, Callable, Optional, Union - -import numpy as np -import pandas as pd -from datasets import load_dataset -from PIL import Image -from transformers import PreTrainedTokenizerBase - -from vllm.lora.request import LoRARequest -from vllm.lora.utils import get_adapter_absolute_path -from vllm.multimodal import MultiModalDataDict -from vllm.multimodal.image import convert_image_mode -from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer - -logger = logging.getLogger(__name__) - -# ----------------------------------------------------------------------------- -# Data Classes -# ----------------------------------------------------------------------------- - - -@dataclass -class SampleRequest: - """ - Represents a single inference request for benchmarking. - """ - - prompt: Union[str, Any] - prompt_len: int - expected_output_len: int - multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None - lora_request: Optional[LoRARequest] = None - - -# ----------------------------------------------------------------------------- -# Benchmark Dataset Base Class -# ----------------------------------------------------------------------------- - - -class BenchmarkDataset(ABC): - DEFAULT_SEED = 0 - IS_MULTIMODAL = False - - def __init__( - self, - dataset_path: Optional[str] = None, - random_seed: int = DEFAULT_SEED, - ) -> None: - """ - Initialize the BenchmarkDataset with an optional dataset path and random - seed. Args: - dataset_path (Optional[str]): Path to the dataset. If None, it - indicates that a default or random dataset might be used. - random_seed (int): Seed value for reproducible shuffling or - sampling. Defaults to DEFAULT_SEED. - """ - self.dataset_path = dataset_path - # Set the random seed, ensuring that a None value is replaced with the - # default seed. - self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED - self.data = None - - def apply_multimodal_chat_transformation( - self, prompt: str, mm_content: Optional[MultiModalDataDict] = None - ) -> list[dict]: - """ - Transform a prompt and optional multimodal content into a chat format. 
- This method is used for chat models that expect a specific conversation - format. - """ - content = [{"text": prompt, "type": "text"}] - if mm_content is not None: - content.append(mm_content) - return [{"role": "user", "content": content}] - - def load_data(self) -> None: - """ - Load data from the dataset path into self.data. - - This method must be overridden by subclasses since the method to load - data will vary depending on the dataset format and source. - - Raises: - NotImplementedError: If a subclass does not implement this method. - """ - # TODO (jenniferzhao): add support for downloading data - raise NotImplementedError("load_data must be implemented in subclasses.") - - def get_random_lora_request( - self, - tokenizer: PreTrainedTokenizerBase, - max_loras: Optional[int] = None, - lora_path: Optional[str] = None, - ) -> tuple[Optional[LoRARequest], AnyTokenizer]: - """ - Optionally select a random LoRA request and return its associated - tokenizer. - - This method is used when LoRA parameters are provided. It randomly - selects a LoRA based on max_loras and retrieves a cached tokenizer for - that LoRA if available. Otherwise, it returns the base tokenizer. - - Args: - tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no - LoRA is selected. max_loras (Optional[int]): The maximum number of - LoRAs available. If None, LoRA is not used. lora_path - (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA - is not used. - - Returns: - tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first - element is a LoRARequest (or None if not applicable) and the second - element is the tokenizer associated with the LoRA request (or the - base tokenizer). - """ - if max_loras is None or lora_path is None: - return None, tokenizer - - # Generate a random LoRA ID in the range [1, max_loras]. - lora_id = random.randint(1, max_loras) - lora_request = LoRARequest( - lora_name=str(lora_id), - lora_int_id=lora_id, - lora_path=lora_path_on_disk(lora_path), - ) - if lora_id not in lora_tokenizer_cache: - lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) - # Return lora_request and the cached tokenizer if available; otherwise, - # return the base tokenizer - return lora_request, lora_tokenizer_cache[lora_id] or tokenizer - - @abstractmethod - def sample( - self, tokenizer: PreTrainedTokenizerBase, num_requests: int - ) -> list[SampleRequest]: - """ - Abstract method to generate sample requests from the dataset. - - Subclasses must override this method to implement dataset-specific logic - for generating a list of SampleRequest objects. - - Args: - tokenizer (PreTrainedTokenizerBase): The tokenizer to be used - for processing the dataset's text. - num_requests (int): The number of sample requests to generate. - - Returns: - list[SampleRequest]: A list of sample requests generated from the - dataset. - """ - raise NotImplementedError("sample must be implemented in subclasses.") - - def maybe_oversample_requests( - self, requests: list[SampleRequest], num_requests: int - ) -> None: - """ - Oversamples the list of requests if its size is less than the desired - number. - - Args: - requests (List[SampleRequest]): The current list of sampled - requests. num_requests (int): The target number of requests. 
- """ - if len(requests) < num_requests: - random.seed(self.random_seed) - additional = random.choices(requests, k=num_requests - len(requests)) - requests.extend(additional) - logger.info("Oversampled requests to reach %d total samples.", num_requests) - - -# ----------------------------------------------------------------------------- -# Utility Functions and Global Caches -# ----------------------------------------------------------------------------- - - -def is_valid_sequence( - prompt_len: int, - output_len: int, - min_len: int = 4, - max_prompt_len: int = 1024, - max_total_len: int = 2048, - skip_min_output_len_check: bool = False, -) -> bool: - """ - Validate a sequence based on prompt and output lengths. - - Default pruning criteria are copied from the original `sample_hf_requests` - and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as - from `sample_requests` in benchmark_throughput.py. - """ - # Check for invalid conditions - prompt_too_short = prompt_len < min_len - output_too_short = (not skip_min_output_len_check) and (output_len < min_len) - prompt_too_long = prompt_len > max_prompt_len - combined_too_long = (prompt_len + output_len) > max_total_len - - # Return True if none of the invalid conditions are met - return not ( - prompt_too_short or output_too_short or prompt_too_long or combined_too_long - ) - - -@cache -def lora_path_on_disk(lora_path: str) -> str: - return get_adapter_absolute_path(lora_path) - - -# Global cache for LoRA tokenizers. -lora_tokenizer_cache: dict[int, AnyTokenizer] = {} - - -def process_image(image: Any) -> Mapping[str, Any]: - """ - Process a single image input and return a multimedia content dictionary. - - Supports three input types: - - 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key - containing raw image data. - Loads the bytes as a PIL.Image.Image. - - 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as - a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns - a dictionary with the image as a base64 data URL. - - 3. String input: - Treats the string as a URL or local file path. - - Prepends "file://" if the string doesn't start with "http://" or - "file://". - Returns a dictionary with the image URL. - - Raises: - ValueError: If the input is not a supported type. - """ - if isinstance(image, dict) and "bytes" in image: - image = Image.open(BytesIO(image["bytes"])) - if isinstance(image, Image.Image): - image = convert_image_mode(image, "RGB") - with io.BytesIO() as image_data: - image.save(image_data, format="JPEG") - image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") - return { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, - } - - if isinstance(image, str): - image_url = ( - image if image.startswith(("http://", "file://")) else f"file://{image}" - ) - return {"type": "image_url", "image_url": {"url": image_url}} - - raise ValueError( - f"Invalid image input {image}. Must be a PIL.Image.Image" - " or str or dictionary with raw image bytes." - ) - - -# ----------------------------------------------------------------------------- -# Random Dataset Implementation (Synthetic Data) -# ----------------------------------------------------------------------------- - - -class RandomDataset(BenchmarkDataset): - # Default values copied from benchmark_serving.py for the random dataset. 
- DEFAULT_PREFIX_LEN = 0 - DEFAULT_RANGE_RATIO = 0.0 - DEFAULT_INPUT_LEN = 1024 - DEFAULT_OUTPUT_LEN = 128 - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - prefix_len: int = DEFAULT_PREFIX_LEN, - range_ratio: float = DEFAULT_RANGE_RATIO, - input_len: int = DEFAULT_INPUT_LEN, - output_len: int = DEFAULT_OUTPUT_LEN, - **kwargs, - ) -> list[SampleRequest]: - # Enforce range_ratio < 1 - assert range_ratio < 1.0, ( - "random_range_ratio must be < 1.0 to ensure a valid sampling range" - ) - - vocab_size = tokenizer.vocab_size - num_special_tokens = tokenizer.num_special_tokens_to_add() - real_input_len = input_len - num_special_tokens - - prefix_token_ids = ( - np.random.randint(0, vocab_size, size=prefix_len).tolist() - if prefix_len > 0 - else [] - ) - - # New sampling logic: [X * (1 - b), X * (1 + b)] - input_low = int(real_input_len * (1 - range_ratio)) - input_high = int(real_input_len * (1 + range_ratio)) - output_low = int(output_len * (1 - range_ratio)) - # Ensure the lower bound for output length is at least 1 to prevent - # sampling 0 tokens, which can cause request failures. - output_low = max(output_low, 1) - output_high = int(output_len * (1 + range_ratio)) - - # Add logging for debugging - logger.info("Sampling input_len from [%s, %s]", input_low, input_high) - logger.info("Sampling output_len from [%s, %s]", output_low, output_high) - - input_lens = np.random.randint(input_low, input_high + 1, size=num_requests) - output_lens = np.random.randint(output_low, output_high + 1, size=num_requests) - offsets = np.random.randint(0, vocab_size, size=num_requests) - - requests = [] - for i in range(num_requests): - inner_seq = ( - (offsets[i] + i + np.arange(input_lens[i])) % vocab_size - ).tolist() - token_sequence = prefix_token_ids + inner_seq - prompt = tokenizer.decode(token_sequence) - # After decoding the prompt we have to encode and decode it again. - # This is done because in some cases N consecutive tokens - # give a string tokenized into != N number of tokens. - # For example for GPT2Tokenizer: - # [6880, 6881] -> ['Ġcalls', 'here'] -> - # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] - # To avoid uncontrolled change of the prompt length, - # the encoded sequence is truncated before being decode again. - total_input_len = prefix_len + int(input_lens[i]) - re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ - :total_input_len - ] - prompt = tokenizer.decode(re_encoded_sequence) - total_input_len = len(re_encoded_sequence) - requests.append( - SampleRequest( - prompt=prompt, - prompt_len=total_input_len, - expected_output_len=int(output_lens[i]), - ) - ) - return requests - - -# ----------------------------------------------------------------------------- -# ShareGPT Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ShareGPTDataset(BenchmarkDataset): - """ - Implements the ShareGPT dataset. Loads data from a JSON file and generates - sample requests based on conversation turns. - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - with open(self.dataset_path, encoding="utf-8") as f: - self.data = json.load(f) - # Filter entries with at least two conversation turns. 
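The decode/re-encode truncation trick used by `RandomDataset.sample` above can be exercised in isolation. The sketch below assumes `transformers` is installed and uses the GPT-2 tokenizer only as an example; any tokenizer shows the same effect.

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
target_len = 16
token_ids = list(range(100, 100 + target_len))  # synthetic "prompt" token IDs

prompt = tokenizer.decode(token_ids)
# Decoding N IDs and re-encoding the text may not round-trip to exactly N
# tokens, so the re-encoded sequence is truncated to keep the length stable.
re_encoded = tokenizer.encode(prompt, add_special_tokens=False)[:target_len]
prompt = tokenizer.decode(re_encoded)
print(len(re_encoded), "tokens after re-encoding")
```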
- self.data = [ - entry - for entry in self.data - if "conversations" in entry and len(entry["conversations"]) >= 2 - ] - random.seed(self.random_seed) - random.shuffle(self.data) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - lora_path: Optional[str] = None, - max_loras: Optional[int] = None, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs, - ) -> list: - samples: list = [] - for entry in self.data: - if len(samples) >= num_requests: - break - prompt, completion = ( - entry["conversations"][0]["value"], - entry["conversations"][1]["value"], - ) - - lora_request, tokenizer = self.get_random_lora_request( - tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path - ) - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - new_output_len = len(completion_ids) if output_len is None else output_len - if not is_valid_sequence( - prompt_len, - new_output_len, - skip_min_output_len_check=output_len is not None, - ): - continue - if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation(prompt, None) - samples.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=new_output_len, - lora_request=lora_request, - ) - ) - self.maybe_oversample_requests(samples, num_requests) - return samples - - -# ----------------------------------------------------------------------------- -# Custom Dataset Implementation -# ----------------------------------------------------------------------------- - - -class CustomDataset(BenchmarkDataset): - """ - Implements the Custom dataset. Loads data from a JSONL file and generates - sample requests based on conversation turns. E.g., - ``` - {"prompt": "What is the capital of India?"} - {"prompt": "What is the capital of Iran?"} - {"prompt": "What is the capital of China?"} - ``` - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - # self.data will be a list of dictionaries - # e.g., [{"prompt": "What is the capital of India?"}, ...] - # This will be the standardized format which load_data() - # has to convert into depending on the filetype of dataset_path. - # sample() will assume this standardized format of self.data - self.data = [] - - # Load the JSONL file - if self.dataset_path.endswith(".jsonl"): - jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True) - - # check if the JSONL file has a 'prompt' column - if "prompt" not in jsonl_data.columns: - raise ValueError("JSONL file must contain a 'prompt' column.") - - # Convert each row to a dictionary and append to self.data - # This will convert the DataFrame to a list of dictionaries - # where each dictionary corresponds to a row in the DataFrame. - # This is the standardized format we want for self.data - for _, row in jsonl_data.iterrows(): - self.data.append(row.to_dict()) - else: - raise NotImplementedError( - "Only JSONL format is supported for CustomDataset." 
- ) - - random.seed(self.random_seed) - random.shuffle(self.data) - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - lora_path: Optional[str] = None, - max_loras: Optional[int] = None, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - skip_chat_template: bool = False, - **kwargs, - ) -> list: - sampled_requests = [] - for item in self.data: - if len(sampled_requests) >= num_requests: - break - prompt = item["prompt"] - - # apply template - if not skip_chat_template: - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Sonnet Dataset Implementation -# ----------------------------------------------------------------------------- - - -class SonnetDataset(BenchmarkDataset): - """ - Simplified implementation of the Sonnet dataset. Loads poem lines from a - text file and generates sample requests. Default values here copied from - `benchmark_serving.py` for the sonnet dataset. - """ - - DEFAULT_PREFIX_LEN = 200 - DEFAULT_INPUT_LEN = 550 - DEFAULT_OUTPUT_LEN = 150 - - def __init__( - self, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data(self) -> None: - if not self.dataset_path: - raise ValueError("dataset_path must be provided.") - with open(self.dataset_path, encoding="utf-8") as f: - self.data = f.readlines() - - def sample( - self, - tokenizer, - num_requests: int, - prefix_len: int = DEFAULT_PREFIX_LEN, - input_len: int = DEFAULT_INPUT_LEN, - output_len: int = DEFAULT_OUTPUT_LEN, - return_prompt_formatted: bool = False, - **kwargs, - ) -> list: - # Calculate average token length for a poem line. - tokenized_lines = [tokenizer(line).input_ids for line in self.data] - avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines) - - # Build the base prompt. - base_prompt = "Pick as many lines as you can from these poem lines:\n" - base_msg = [{"role": "user", "content": base_prompt}] - base_fmt = tokenizer.apply_chat_template( - base_msg, add_generation_prompt=True, tokenize=False - ) - base_offset = len(tokenizer(base_fmt).input_ids) - if input_len <= base_offset: - raise ValueError( - f"'input_len' must be higher than the base prompt length " - f"({base_offset})." - ) - - # Determine how many poem lines to use. 
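Both `CustomDataset` and `SonnetDataset` above count prompt tokens only after applying the tokenizer's chat template. A minimal sketch, assuming `transformers` is available and using a small instruct checkpoint purely as an example of a tokenizer that ships a chat template:

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
prompt = "What is the capital of India?"
formatted = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    add_generation_prompt=True,
    tokenize=False,
)
print(len(tokenizer(formatted).input_ids), "prompt tokens after templating")
```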
- num_input_lines = round((input_len - base_offset) / avg_len) - num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0) - prefix_lines = self.data[:num_prefix_lines] - - samples = [] - while len(samples) < num_requests: - extra_lines = random.choices( - self.data, k=num_input_lines - num_prefix_lines - ) - prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}" - msg = [{"role": "user", "content": prompt}] - prompt_formatted = tokenizer.apply_chat_template( - msg, add_generation_prompt=True, tokenize=False - ) - prompt_len = len(tokenizer(prompt_formatted).input_ids) - if prompt_len <= input_len: - samples.append( - SampleRequest( - prompt=prompt_formatted if return_prompt_formatted else prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - ) - ) - return samples - - -# ----------------------------------------------------------------------------- -# BurstGPT Dataset Implementation -# ----------------------------------------------------------------------------- - - -class BurstGPTDataset(BenchmarkDataset): - """ - Implements the BurstGPT dataset. Loads data from a CSV file and generates - sample requests based on synthetic prompt generation. Only rows with Model - "GPT-4" and positive response tokens are used. - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.load_data() - - def load_data( - self, - ): - if self.dataset_path is None: - raise ValueError("dataset_path must be provided for loading data.") - - df = pd.read_csv(self.dataset_path) - # Filter to keep only GPT-4 rows. - gpt4_df = df[df["Model"] == "GPT-4"] - # Remove failed requests (where Response tokens is 0 or less). - gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] - # Sample the desired number of rows. - self.data = gpt4_df - - def _sample_loaded_data(self, num_requests: int) -> list: - if num_requests <= len(self.data): - data = self.data.sample(n=num_requests, random_state=self.random_seed) - else: - data = self.data.sample( - n=num_requests, - random_state=self.random_seed, - replace=True, - ) - # Convert the dataframe to a list of lists. - return data.values.tolist() - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - max_loras: Optional[int] = None, - lora_path: Optional[str] = None, - **kwargs, - ) -> list[SampleRequest]: - samples = [] - data = self._sample_loaded_data(num_requests=num_requests) - for i in range(num_requests): - input_len = int(data[i][2]) - output_len = int(data[i][3]) - lora_req, tokenizer = self.get_random_lora_request( - tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path - ) - vocab_size = tokenizer.vocab_size - # Generate a synthetic prompt: a list of token IDs computed as (i + - # j) modulo vocab_size. 
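The row filtering and resampling performed by `BurstGPTDataset` above follows a common pandas pattern; here is a self-contained sketch in which a made-up DataFrame stands in for the BurstGPT CSV.

```
import pandas as pd

df = pd.DataFrame(
    {
        "Model": ["GPT-4", "GPT-4", "ChatGPT"],
        "Request tokens": [120, 80, 64],
        "Response tokens": [256, 0, 128],
    }
)
# Keep GPT-4 rows with a non-empty response, as load_data does.
gpt4_df = df[(df["Model"] == "GPT-4") & (df["Response tokens"] > 0)]
# Sample with replacement when more requests are needed than rows exist.
num_requests = 5
sampled = gpt4_df.sample(
    n=num_requests, random_state=0, replace=num_requests > len(gpt4_df)
)
print(sampled.values.tolist())
```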
- token_ids = [(i + j) % vocab_size for j in range(input_len)] - prompt = tokenizer.decode(token_ids) - samples.append( - SampleRequest( - prompt=prompt, - prompt_len=input_len, - expected_output_len=output_len, - lora_request=lora_req, - ) - ) - return samples - - -# ----------------------------------------------------------------------------- -# HuggingFace Dataset Base Implementation -# ----------------------------------------------------------------------------- -class HuggingFaceDataset(BenchmarkDataset): - """Base class for datasets hosted on HuggingFace.""" - - SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set() - - def __init__( - self, - dataset_path: str, - dataset_split: str, - no_stream: bool = False, - dataset_subset: Optional[str] = None, - **kwargs, - ) -> None: - super().__init__(dataset_path=dataset_path, **kwargs) - - self.dataset_split = dataset_split - self.dataset_subset = dataset_subset - self.load_stream = not no_stream - self.load_data() - - def load_data(self) -> None: - """Load data from HuggingFace datasets.""" - self.data = load_dataset( - self.dataset_path, - name=self.dataset_subset, - split=self.dataset_split, - streaming=self.load_stream, - ) - self.data = self.data.shuffle(seed=self.random_seed) - - -# ----------------------------------------------------------------------------- -# Conversation Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ConversationDataset(HuggingFaceDataset): - """Dataset for conversation data with multimodal support.""" - - SUPPORTED_DATASET_PATHS = { - "lmms-lab/LLaVA-OneVision-Data", - "Aeala/ShareGPT_Vicuna_unfiltered", - } - IS_MULTIMODAL = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs, - ) -> list: - # Filter examples with at least 2 conversations - filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) - sampled_requests = [] - dynamic_output = output_len is None - - for item in filtered_data: - if len(sampled_requests) >= num_requests: - break - conv = item["conversations"] - prompt, completion = conv[0]["value"], conv[1]["value"] - - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - completion_len = len(completion_ids) - output_len = completion_len if dynamic_output else output_len - assert isinstance(output_len, int) and output_len > 0 - if dynamic_output and not is_valid_sequence(prompt_len, completion_len): - continue - mm_content = process_image(item["image"]) if "image" in item else None - if enable_multimodal_chat: - # Note: when chat is enabled the request prompt_len is no longer - # accurate and we will be using request output to count the - # actual prompt len and output len - prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Vision Arena Dataset Implementation -# ----------------------------------------------------------------------------- - - -class VisionArenaDataset(HuggingFaceDataset): - """ - Vision Arena Dataset. 
- """ - - DEFAULT_OUTPUT_LEN = 128 - SUPPORTED_DATASET_PATHS = { - "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"], - "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"], - } - IS_MULTIMODAL = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs, - ) -> list: - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] - for item in self.data: - if len(sampled_requests) >= num_requests: - break - parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) - if parser_fn is None: - raise ValueError(f"Unsupported dataset path: {self.dataset_path}") - prompt = parser_fn(item) - mm_content = process_image(item["images"][0]) - prompt_len = len(tokenizer(prompt).input_ids) - if enable_multimodal_chat: - # Note: when chat is enabled the request prompt_len is no longer - # accurate and we will be using request output to count the - # actual prompt len - prompt = self.apply_multimodal_chat_transformation(prompt, mm_content) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Instruct Coder Dataset Implementation -# ----------------------------------------------------------------------------- - - -class InstructCoderDataset(HuggingFaceDataset): - """ - InstructCoder Dataset. - https://huggingface.co/datasets/likaixin/InstructCoder - - InstructCoder is the dataset designed for general code editing. It consists - of 114,239 instruction-input-output triplets, and covers multiple distinct - code editing scenario. - """ - - DEFAULT_OUTPUT_LEN = 200 # this is the average default output length - SUPPORTED_DATASET_PATHS = { - "likaixin/InstructCoder", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs, - ) -> list: - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] - for item in self.data: - if len(sampled_requests) >= num_requests: - break - prompt = f"{item['input']}\n\n{item['instruction']} Just output \ - the code, do not include any explanation." - - # apply template - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# MT-Bench Dataset Implementation -# ----------------------------------------------------------------------------- - - -class MTBenchDataset(HuggingFaceDataset): - """ - MT-Bench Dataset. - https://huggingface.co/datasets/philschmid/mt-bench - - We create a single turn dataset for MT-Bench. 
- This is similar to Spec decoding benchmark setup in vLLM - https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 - """ # noqa: E501 - - DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM - SUPPORTED_DATASET_PATHS = { - "philschmid/mt-bench", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - enable_multimodal_chat: bool = False, - **kwargs, - ) -> list: - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - sampled_requests = [] - - for item in self.data: - if len(sampled_requests) >= num_requests: - break - prompt = item["turns"][0] - - # apply template - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# AIMO Dataset Implementation -# ----------------------------------------------------------------------------- - - -class AIMODataset(HuggingFaceDataset): - """ - Dataset class for processing a AIMO dataset with reasoning questions. - """ - - SUPPORTED_DATASET_PATHS = { - "AI-MO/aimo-validation-aime", - "AI-MO/NuminaMath-1.5", - "AI-MO/NuminaMath-CoT", - } - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - **kwargs, - ) -> list: - sampled_requests = [] - dynamic_output = output_len is None - - for item in self.data: - if len(sampled_requests) >= num_requests: - break - prompt, completion = item["problem"], item["solution"] - - prompt_ids = tokenizer(prompt).input_ids - completion_ids = tokenizer(completion).input_ids - prompt_len = len(prompt_ids) - completion_len = len(completion_ids) - output_len = completion_len if dynamic_output else output_len - assert isinstance(output_len, int) and output_len > 0 - if dynamic_output and not is_valid_sequence( - prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000 - ): - continue - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=None, - ) - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests - - -# ----------------------------------------------------------------------------- -# Next Edit Prediction Dataset Implementation -# ----------------------------------------------------------------------------- - - -zeta_prompt = """### Instruction: -You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location. - -### User Edits: - -{} - -### User Excerpt: - -{} - -### Response: - -""" # noqa: E501 - - -def _format_zeta_prompt( - sample: dict, original_start_marker: str = "<|editable_region_start|>" -) -> dict: - """Format the zeta prompt for the Next Edit Prediction (NEP) dataset. - - This function formats examples from the NEP dataset - into prompts and expected outputs. It could be - further extended to support more NEP datasets. 
- - Args: - sample: The dataset sample containing events, - inputs, and outputs. - original_start_marker: The marker indicating the - start of the editable region. Defaults to - "<|editable_region_start|>". - - Returns: - A dictionary with the formatted prompts and expected outputs. - """ - events = sample["events"] - input = sample["input"] - output = sample["output"] - prompt = zeta_prompt.format(events, input) - - # following the original implementation, extract the focused region - # from the raw output - output_start_index = output.find(original_start_marker) - output_focused_region = output[output_start_index:] - expected_output = output_focused_region - - return {"prompt": prompt, "expected_output": expected_output} - - -class NextEditPredictionDataset(HuggingFaceDataset): - """ - Dataset class for processing a Next Edit Prediction dataset. - """ - - SUPPORTED_DATASET_PATHS = { - "zed-industries/zeta", - } - MAPPING_PROMPT_FUNCS = { - "zed-industries/zeta": _format_zeta_prompt, - } - - def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs): - formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path) - if formatting_prompt_func is None: - raise ValueError(f"Unsupported dataset path: {self.dataset_path}") - samples = [] - for sample in self.data: - sample = formatting_prompt_func(sample) - samples.append( - SampleRequest( - prompt=sample["prompt"], - prompt_len=len(tokenizer(sample["prompt"]).input_ids), - expected_output_len=len( - tokenizer(sample["expected_output"]).input_ids - ), - ) - ) - if len(samples) >= num_requests: - break - self.maybe_oversample_requests(samples, num_requests) - return samples - - -# ----------------------------------------------------------------------------- -# ASR Dataset Implementation -# ----------------------------------------------------------------------------- - - -class ASRDataset(HuggingFaceDataset): - """ - Dataset class for processing a ASR dataset for transcription. - Tested on the following set: - - +----------------+----------------------------------------+--------------------------+-----------------------------+ - | Dataset | Domain | Speaking Style | hf-subset | - +----------------+----------------------------------------+--------------------------+-----------------------------+ - | TED-LIUM | TED talks | Oratory | release1, release2, release3| - | | | | release3-speaker-adaptation | - | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... | - | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | - | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | - | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | - | AMI | Meetings | Spontaneous | ihm, sdm | - +----------------+----------------------------------------+--------------------------+-----------------------------+ - - """ # noqa: E501 - - SUPPORTED_DATASET_PATHS = { - "openslr/librispeech_asr", - "facebook/voxpopuli", - "LIUM/tedlium", - "edinburghcstr/ami", - "speechcolab/gigaspeech", - "kensho/spgispeech", - } - - DEFAULT_OUTPUT_LEN = 128 - IS_MULTIMODAL = True - - # TODO Whisper-specific. Abstract interface when more models are supported. 
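The `_format_zeta_prompt` helper above trims the raw output to the editable region with a plain `str.find`; a tiny standard-library sketch (with a made-up payload around the marker) shows the effect.

```
marker = "<|editable_region_start|>"
raw_output = "preamble to drop\n" + marker + "\nedited excerpt kept as-is\n"
# Everything before the marker is discarded; the marker itself is kept.
expected_output = raw_output[raw_output.find(marker):]
print(expected_output.startswith(marker))  # True
```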
- TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" - skip_long_audios: bool = True - - def sample( - self, - tokenizer: PreTrainedTokenizerBase, - num_requests: int, - output_len: Optional[int] = None, - **kwargs, - ) -> list: - import librosa - - output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - prompt = ASRDataset.TRANSCRIPTION_PREAMBLE - prompt_len = len(tokenizer(prompt).input_ids) - sampled_requests = [] - skipped = 0 - for item in self.data: - if len(sampled_requests) >= num_requests: - break - audio = item["audio"] - y, sr = audio["array"], audio["sampling_rate"] - duration_s = librosa.get_duration(y=y, sr=sr) - # Whisper max supported duration - if self.skip_long_audios and duration_s > 30: - skipped += 1 - continue - - mm_content = {"audio": (y, sr)} - sampled_requests.append( - SampleRequest( - prompt=prompt, - prompt_len=prompt_len, - expected_output_len=output_len, - multi_modal_data=mm_content, - ) - ) - if skipped: - logger.warning( - "%d samples discarded from dataset due to" - " their length being greater than" - " what Whisper supports.", - skipped, - ) - self.maybe_oversample_requests(sampled_requests, num_requests) - return sampled_requests diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 4d2ea126b24a..a7892f3f7124 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,186 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark the latency of processing a single batch of requests.""" +import sys -import argparse -import dataclasses -import json -import os -import time -from typing import Any, Optional - -import numpy as np -from tqdm import tqdm - -import vllm.envs as envs -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptType -from vllm.sampling_params import BeamSearchParams -from vllm.utils import FlexibleArgumentParser - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any] -) -> None: - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={"latency": results["latencies"]}, - extra_info={k: results[k] for k in ["avg_latency", "percentiles"]}, - ) - if pt_records: - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -def main(args: argparse.Namespace): - print(args) - - engine_args = EngineArgs.from_cli_args(args) - - # NOTE(woosuk): If the request cannot be processed in a single batch, - # the engine will automatically process the request in multiple batches. - llm = LLM(**dataclasses.asdict(engine_args)) - assert llm.llm_engine.model_config.max_model_len >= ( - args.input_len + args.output_len - ), ( - "Please ensure that max_model_len is greater than" - " the sum of input_len and output_len." 
- ) - - sampling_params = SamplingParams( - n=args.n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=args.output_len, - detokenize=not args.disable_detokenize, - ) - print(sampling_params) - dummy_prompt_token_ids = np.random.randint( - 10000, size=(args.batch_size, args.input_len) - ) - dummy_prompts: list[PromptType] = [ - {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() - ] - - def llm_generate(): - if not args.use_beam_search: - llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) - else: - llm.beam_search( - dummy_prompts, - BeamSearchParams( - beam_width=args.n, - max_tokens=args.output_len, - ignore_eos=True, - ), - ) - - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - llm.start_profile() - llm_generate() - llm.stop_profile() - else: - start_time = time.perf_counter() - llm_generate() - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - print("Warming up...") - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - run_to_completion(profile_dir=None) - - if args.profile: - profile_dir = envs.VLLM_TORCH_PROFILER_DIR - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=profile_dir) - return - - # Benchmark. - latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) - latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90, 99] - percentiles = np.percentile(latencies, percentages) - print(f"Avg latency: {np.mean(latencies)} seconds") - for percentage, percentile in zip(percentages, percentiles): - print(f"{percentage}% percentile latency: {percentile} seconds") - - # Output JSON results if specified - if args.output_json: - results = { - "avg_latency": np.mean(latencies), - "latencies": latencies.tolist(), - "percentiles": dict(zip(percentages, percentiles.tolist())), - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) - - -def create_argument_parser(): - parser = FlexibleArgumentParser( - description="Benchmark the latency of processing a single batch of " - "requests till completion." - ) - parser.add_argument("--input-len", type=int, default=32) - parser.add_argument("--output-len", type=int, default=128) - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument( - "--n", - type=int, - default=1, - help="Number of generated sequences per prompt.", - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-iters-warmup", - type=int, - default=10, - help="Number of iterations to run for warmup.", - ) - parser.add_argument( - "--num-iters", type=int, default=30, help="Number of iterations to run." - ) - parser.add_argument( - "--profile", - action="store_true", - help="profile the generation process of a single batch", - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the latency results in JSON format.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=( - "Do not detokenize responses (i.e. do not include " - "detokenization time in the latency measurement)" - ), - ) - - parser = EngineArgs.add_cli_args(parser) - # V1 enables prefix caching by default which skews the latency - # numbers. We need to disable prefix caching by default. 
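The warmup-then-measure loop removed from `benchmark_latency.py` above boils down to timing a callable and reporting the mean plus selected percentiles. A dependency-light sketch, with a dummy workload standing in for the real `llm.generate(...)` call:

```
import time

import numpy as np


def run_once() -> float:
    start = time.perf_counter()
    sum(range(100_000))  # stand-in for the real llm.generate(...) call
    return time.perf_counter() - start


latencies = np.array([run_once() for _ in range(30)])
percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages)
print(f"Avg latency: {np.mean(latencies):.6f} seconds")
for percentage, percentile in zip(percentages, percentiles):
    print(f"{percentage}% percentile latency: {percentile:.6f} seconds")
```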
- parser.set_defaults(enable_prefix_caching=False) +if __name__ == "__main__": + print("""DEPRECATED: This script has been moved to the vLLM CLI. - return parser +Please use the following command instead: + vllm bench latency +For help with the new command, run: + vllm bench latency --help -if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: - raise OSError( - "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " - "Please set it to a valid path to use torch profiler." - ) - main(args) +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench latency --help +""") + sys.exit(1) diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py new file mode 100644 index 000000000000..626b150ee4ce --- /dev/null +++ b/benchmarks/benchmark_ngram_proposer.py @@ -0,0 +1,213 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc +import time +from unittest import mock + +import numpy as np +from benchmark_utils import TimeCollector +from tabulate import tabulate + +from vllm.config import ( + CacheConfig, + DeviceConfig, + LoadConfig, + ModelConfig, + ParallelConfig, + SchedulerConfig, + SpeculativeConfig, + VllmConfig, +) +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser +from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.worker.gpu_input_batch import InputBatch +from vllm.v1.worker.gpu_model_runner import GPUModelRunner + + +def benchmark_propose(args): + rows = [] + for max_ngram in args.max_ngram: + collector = TimeCollector(TimeCollector.US) + + model_config = ModelConfig( + model="facebook/opt-125m", + task="generate", + max_model_len=args.num_token + args.num_spec_token, + tokenizer="facebook/opt-125m", + tokenizer_mode="auto", + dtype="auto", + seed=None, + trust_remote_code=False, + ) + proposer = NgramProposer( + vllm_config=VllmConfig( + model_config=model_config, + speculative_config=SpeculativeConfig( + prompt_lookup_min=args.min_ngram, + prompt_lookup_max=max_ngram, + num_speculative_tokens=args.num_spec_token, + method="ngram", + ), + ) + ) + + # Warm up + proposer.propose(np.random.randint(0, 20, (args.num_token,))) + + gc.collect() + for _ in range(args.num_iteration): + tokens = np.random.randint(0, 20, (args.num_req, args.num_token)) + with collector: + for i in range(args.num_req): + proposer.propose(tokens[i, :]) + rows.append( + [args.num_req, args.num_token, args.min_ngram, max_ngram] + + collector.dump_avg_max() + ) + + print( + tabulate( + rows, + headers=[ + "# Request", + "# Token", + "Min Ngram", + "Max Ngram", + "Avg (us)", + "Max (us)", + ], + tablefmt="grid", + floatfmt=".3f", + ) + ) + + +def benchmark_batched_propose(args): + NUM_SPECULATIVE_TOKENS_NGRAM = 10 + PROMPT_LOOKUP_MIN = 5 + PROMPT_LOOKUP_MAX = 15 + MAX_MODEL_LEN = int(1e7) + DEVICE = current_platform.device_type + + model_config = ModelConfig(model="facebook/opt-125m", runner="generate") + + speculative_config = SpeculativeConfig( + target_model_config=model_config, + target_parallel_config=ParallelConfig(), + method="ngram", + num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM, + prompt_lookup_max=PROMPT_LOOKUP_MAX, + prompt_lookup_min=PROMPT_LOOKUP_MIN, + ) + + vllm_config = VllmConfig( + model_config=model_config, + cache_config=CacheConfig(), + speculative_config=speculative_config, + 
device_config=DeviceConfig(device=current_platform.device_type), + parallel_config=ParallelConfig(), + load_config=LoadConfig(), + scheduler_config=SchedulerConfig(), + ) + + # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group + mock_pp_group = mock.MagicMock() + mock_pp_group.world_size = 1 + with mock.patch( + "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group + ): + runner = GPUModelRunner(vllm_config, DEVICE) + + # hack max model len + runner.max_model_len = MAX_MODEL_LEN + runner.drafter.max_model_len = MAX_MODEL_LEN + + dummy_input_batch = InputBatch( + max_num_reqs=args.num_req, + max_model_len=MAX_MODEL_LEN, + max_num_batched_tokens=args.num_req * args.num_token, + device=DEVICE, + pin_memory=False, + vocab_size=256000, + block_sizes=[16], + ) + dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req)) + dummy_input_batch.spec_decode_unsupported_reqs = () + dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req + dummy_input_batch.token_ids_cpu = np.random.randint( + 0, 20, (args.num_req, args.num_token) + ) + + runner.input_batch = dummy_input_batch + + sampled_token_ids = [[0]] * args.num_req + + print("Starting benchmark") + # first run is warmup so ignore it + for _ in range(args.num_iteration): + start = time.time() + runner.drafter.propose( + sampled_token_ids, + dummy_input_batch.req_ids, + dummy_input_batch.num_tokens_no_spec, + dummy_input_batch.token_ids_cpu, + dummy_input_batch.spec_decode_unsupported_reqs, + ) + end = time.time() + print(f"Iteration time (s): {end - start}") + + +def invoke_main() -> None: + parser = FlexibleArgumentParser( + description="Benchmark the performance of N-gram speculative decode drafting" + ) + parser.add_argument( + "--batched", action="store_true", help="consider time to prepare batch" + ) + parser.add_argument( + "--num-iteration", + type=int, + default=100, + help="Number of iterations to run to stabilize final data readings", + ) + parser.add_argument( + "--num-req", type=int, default=128, help="Number of requests in the batch" + ) + parser.add_argument( + "--num-token", type=int, default=1500, help="Number of tokens for each request" + ) + parser.add_argument( + "--min-ngram", + type=int, + default=3, + help="Minimum n-gram to match", + ) + parser.add_argument( + "--max-ngram", + type=int, + nargs="*", + default=[5, 7, 10, 15, 20], + help="Maximum n-gram to match", + ) + parser.add_argument( + "--num-spec-token", + type=int, + default=3, + help="Number of speculative tokens to generate", + ) + args = parser.parse_args() + + if not args.batched: + benchmark_propose(args) + else: + benchmark_batched_propose(args) + + +""" +# Example command lines: +# time python3 benchmarks/benchmark_ngram_proposer.py +# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128 +""" # noqa: E501 +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index b5e2613de1cd..d7dc0e991c4d 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -32,7 +32,6 @@ import json import random import time -from typing import Optional from transformers import PreTrainedTokenizerBase @@ -80,7 +79,7 @@ def sample_requests_from_dataset( num_requests: int, tokenizer: PreTrainedTokenizerBase, input_length_range: tuple[int, int], - fixed_output_len: Optional[int], + fixed_output_len: int | None, ) -> list[Request]: 
if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -128,7 +127,7 @@ def sample_requests_from_random( num_requests: int, tokenizer: PreTrainedTokenizerBase, input_length_range: tuple[int, int], - fixed_output_len: Optional[int], + fixed_output_len: int | None, prefix_len: int, ) -> list[Request]: requests = [] diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index bb453791c186..769f52dbab6e 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -7,7 +7,6 @@ import json import random import time -from typing import Optional from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -24,7 +23,7 @@ def sample_requests( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], + fixed_output_len: int | None, ) -> list[tuple[str, int, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index f3a20842137e..76cf51498020 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,1384 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -r"""Benchmark online serving throughput. +import sys -On the server side, run one of the following commands: - vLLM OpenAI API server - vllm serve \ - --swap-space 16 \ - --disable-log-requests - -On the client side, run: - python benchmarks/benchmark_serving.py \ - --backend \ - --model \ - --dataset-name sharegpt \ - --dataset-path \ - --request-rate \ # By default is inf - --num-prompts # By default is 1000 - - when using tgi backend, add - --endpoint /generate_stream - to the end of the command above. 
-""" - -import argparse -import asyncio -import gc -import json -import os -import random -import time -import warnings -from collections.abc import AsyncGenerator, Iterable -from dataclasses import dataclass -from datetime import datetime -from typing import Any, Literal, Optional - -import numpy as np -from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase - -from backend_request_func import ( - ASYNC_REQUEST_FUNCS, - OPENAI_COMPATIBLE_BACKENDS, - RequestFuncInput, - RequestFuncOutput, -) - -try: - from vllm.transformers_utils.tokenizer import get_tokenizer -except ImportError: - from backend_request_func import get_tokenizer - -try: - from vllm.utils import FlexibleArgumentParser -except ImportError: - from argparse import ArgumentParser as FlexibleArgumentParser - -from benchmark_dataset import ( - AIMODataset, - ASRDataset, - BurstGPTDataset, - ConversationDataset, - CustomDataset, - HuggingFaceDataset, - InstructCoderDataset, - MTBenchDataset, - NextEditPredictionDataset, - RandomDataset, - SampleRequest, - ShareGPTDataset, - SonnetDataset, - VisionArenaDataset, -) -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json - -MILLISECONDS_TO_SECONDS_CONVERSION = 1000 - - -@dataclass -class BenchmarkMetrics: - completed: int - total_input: int - total_output: int - request_throughput: float - request_goodput: float - output_throughput: float - total_token_throughput: float - mean_ttft_ms: float - median_ttft_ms: float - std_ttft_ms: float - percentiles_ttft_ms: list[tuple[float, float]] - mean_tpot_ms: float - median_tpot_ms: float - std_tpot_ms: float - percentiles_tpot_ms: list[tuple[float, float]] - mean_itl_ms: float - median_itl_ms: float - std_itl_ms: float - percentiles_itl_ms: list[tuple[float, float]] - # E2EL stands for end-to-end latency per request. - # It is the time taken on the client side from sending - # a request to receiving a complete response. - mean_e2el_ms: float - median_e2el_ms: float - std_e2el_ms: float - percentiles_e2el_ms: list[tuple[float, float]] - - -def _get_current_request_rate( - ramp_up_strategy: Optional[Literal["linear", "exponential"]], - ramp_up_start_rps: Optional[int], - ramp_up_end_rps: Optional[int], - request_index: int, - total_requests: int, - request_rate: float, -) -> float: - if ( - ramp_up_strategy - and ramp_up_start_rps is not None - and ramp_up_end_rps is not None - ): - progress = request_index / max(total_requests - 1, 1) - if ramp_up_strategy == "linear": - increase = (ramp_up_end_rps - ramp_up_start_rps) * progress - return ramp_up_start_rps + increase - elif ramp_up_strategy == "exponential": - ratio = ramp_up_end_rps / ramp_up_start_rps - return ramp_up_start_rps * (ratio**progress) - else: - raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}") - return request_rate - - -async def get_request( - input_requests: list[SampleRequest], - request_rate: float, - burstiness: float = 1.0, - ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, - ramp_up_start_rps: Optional[int] = None, - ramp_up_end_rps: Optional[int] = None, -) -> AsyncGenerator[tuple[SampleRequest, float], None]: - """ - Asynchronously generates requests at a specified rate - with OPTIONAL burstiness and OPTIONAL ramp-up strategy. - - Args: - input_requests: - A list of input requests, each represented as a SampleRequest. - request_rate: - The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. 
- Only takes effect when request_rate is not inf. - Default value is 1, which follows a Poisson process. - Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value - (burstiness > 1) results in a more uniform arrival of requests. - ramp_up_strategy (optional): - The ramp-up strategy. Can be "linear" or "exponential". - If None, uses constant request rate (specified by request_rate). - ramp_up_start_rps (optional): - The starting request rate for ramp-up. - ramp_up_end_rps (optional): - The ending request rate for ramp-up. - """ - assert burstiness > 0, ( - f"A positive burstiness factor is expected, but given {burstiness}." - ) - # Convert to list to get length for ramp-up calculations - if isinstance(input_requests, Iterable) and not isinstance(input_requests, list): - input_requests = list(input_requests) - - total_requests = len(input_requests) - request_index = 0 - - for request in input_requests: - current_request_rate = _get_current_request_rate( - ramp_up_strategy, - ramp_up_start_rps, - ramp_up_end_rps, - request_index, - total_requests, - request_rate, - ) - - yield request, current_request_rate - - request_index += 1 - - if current_request_rate == float("inf"): - # If the request rate is infinity, then we don't need to wait. - continue - - theta = 1.0 / (current_request_rate * burstiness) - - # Sample the request interval from the gamma distribution. - # If burstiness is 1, it follows exponential distribution. - interval = np.random.gamma(shape=burstiness, scale=theta) - # The next request will be sent after the interval. - await asyncio.sleep(interval) - - -def calculate_metrics( - input_requests: list[SampleRequest], - outputs: list[RequestFuncOutput], - dur_s: float, - tokenizer: PreTrainedTokenizerBase, - selected_percentile_metrics: list[str], - selected_percentiles: list[float], - goodput_config_dict: dict[str, float], -) -> tuple[BenchmarkMetrics, list[int]]: - actual_output_lens: list[int] = [] - total_input = 0 - completed = 0 - good_completed = 0 - itls: list[float] = [] - tpots: list[float] = [] - all_tpots: list[float] = [] - ttfts: list[float] = [] - e2els: list[float] = [] - for i in range(len(outputs)): - if outputs[i].success: - output_len = outputs[i].output_tokens - - if not output_len: - # We use the tokenizer to count the number of output tokens - # for some serving backends instead of looking at - # len(outputs[i].itl) since multiple output tokens may be - # bundled together - # Note : this may inflate the output token count slightly - output_len = len( - tokenizer( - outputs[i].generated_text, add_special_tokens=False - ).input_ids - ) - actual_output_lens.append(output_len) - total_input += input_requests[i].prompt_len - tpot = 0 - if output_len > 1: - latency_minus_ttft = outputs[i].latency - outputs[i].ttft - tpot = latency_minus_ttft / (output_len - 1) - tpots.append(tpot) - # Note: if output_len <= 1, we regard tpot as 0 for goodput - all_tpots.append(tpot) - itls += outputs[i].itl - ttfts.append(outputs[i].ttft) - e2els.append(outputs[i].latency) - completed += 1 - else: - actual_output_lens.append(0) - - if goodput_config_dict: - valid_metrics = [] - slo_values = [] - - if "ttft" in goodput_config_dict: - valid_metrics.append(ttfts) - slo_values.append( - goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - if "tpot" in goodput_config_dict: - valid_metrics.append(all_tpots) - slo_values.append( - 
goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - if "e2el" in goodput_config_dict: - valid_metrics.append(e2els) - slo_values.append( - goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION - ) - - for req_metric in zip(*valid_metrics): - is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) - if is_good_req: - good_completed += 1 - - if completed == 0: - warnings.warn( - "All requests failed. This is likely due to a misconfiguration " - "on the benchmark arguments.", - stacklevel=2, - ) - metrics = BenchmarkMetrics( - completed=completed, - total_input=total_input, - total_output=sum(actual_output_lens), - request_throughput=completed / dur_s, - request_goodput=good_completed / dur_s, - output_throughput=sum(actual_output_lens) / dur_s, - total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, - mean_ttft_ms=np.mean(ttfts or 0) - * 1000, # ttfts is empty if streaming is not supported by backend - std_ttft_ms=np.std(ttfts or 0) * 1000, - median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[ - (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles - ], - mean_tpot_ms=np.mean(tpots or 0) * 1000, - std_tpot_ms=np.std(tpots or 0) * 1000, - median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[ - (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles - ], - mean_itl_ms=np.mean(itls or 0) * 1000, - std_itl_ms=np.std(itls or 0) * 1000, - median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[ - (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles - ], - mean_e2el_ms=np.mean(e2els or 0) * 1000, - std_e2el_ms=np.std(e2els or 0) * 1000, - median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[ - (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles - ], - ) - - return metrics, actual_output_lens - - -async def benchmark( - backend: str, - api_url: str, - base_url: str, - model_id: str, - model_name: str, - tokenizer: PreTrainedTokenizerBase, - input_requests: list[SampleRequest], - logprobs: Optional[int], - request_rate: float, - burstiness: float, - disable_tqdm: bool, - profile: bool, - selected_percentile_metrics: list[str], - selected_percentiles: list[float], - ignore_eos: bool, - goodput_config_dict: dict[str, float], - max_concurrency: Optional[int], - lora_modules: Optional[Iterable[str]], - extra_body: Optional[dict], - ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, - ramp_up_start_rps: Optional[int] = None, - ramp_up_end_rps: Optional[int] = None, -): - if backend in ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS[backend] - else: - raise ValueError(f"Unknown backend: {backend}") - - print("Starting initial single prompt test run...") - test_prompt, test_prompt_len, test_output_len, test_mm_content = ( - input_requests[0].prompt, - input_requests[0].prompt_len, - input_requests[0].expected_output_len, - input_requests[0].multi_modal_data, - ) - - assert test_mm_content is None or isinstance(test_mm_content, dict) - test_input = RequestFuncInput( - model=model_id, - model_name=model_name, - prompt=test_prompt, - api_url=api_url, - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - multi_modal_content=test_mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - - test_output = await request_func(request_func_input=test_input) - if not test_output.success: - raise ValueError( - "Initial test run failed - Please make sure benchmark arguments 
" - f"are correctly specified. Error: {test_output.error}" - ) - else: - print("Initial test run completed. Starting main benchmark run...") - - if lora_modules: - # For each input request, choose a LoRA module at random. - lora_modules = iter( - [random.choice(lora_modules) for _ in range(len(input_requests))] - ) - - if profile: - print("Starting profiler...") - profile_input = RequestFuncInput( - model=model_id, - model_name=model_name, - prompt=test_prompt, - api_url=base_url + "/start_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - multi_modal_content=test_mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler started") - - distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" - - if ramp_up_strategy is not None: - print( - f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase " - f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over " - "the duration of the benchmark." - ) - else: - print(f"Traffic request rate: {request_rate} RPS.") - - print(f"Burstiness factor: {burstiness} ({distribution})") - print(f"Maximum request concurrency: {max_concurrency}") - - pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. - # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) - semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None - - async def limited_request_func(request_func_input, pbar): - if semaphore is None: - return await request_func(request_func_input=request_func_input, pbar=pbar) - async with semaphore: - return await request_func(request_func_input=request_func_input, pbar=pbar) - - benchmark_start_time = time.perf_counter() - tasks: list[asyncio.Task] = [] - - rps_change_events = [] - last_int_rps = -1 - if ramp_up_strategy is not None and ramp_up_start_rps is not None: - last_int_rps = ramp_up_start_rps - rps_change_events.append( - { - "rps": last_int_rps, - "timestamp": datetime.now().isoformat(), - } - ) - - async for request, current_request_rate in get_request( - input_requests, - request_rate, - burstiness, - ramp_up_strategy, - ramp_up_start_rps, - ramp_up_end_rps, - ): - if ramp_up_strategy is not None: - current_int_rps = int(current_request_rate) - if current_int_rps > last_int_rps: - timestamp = datetime.now().isoformat() - for rps_val in range(last_int_rps + 1, current_int_rps + 1): - rps_change_events.append({"rps": rps_val, "timestamp": timestamp}) - last_int_rps = current_int_rps - - prompt, prompt_len, output_len, mm_content = ( - request.prompt, - request.prompt_len, - request.expected_output_len, - request.multi_modal_data, - ) - req_model_id, req_model_name = model_id, model_name - if lora_modules: - req_lora_module = next(lora_modules) - req_model_id, req_model_name = req_lora_module, req_lora_module - - request_func_input = RequestFuncInput( - model=req_model_id, - model_name=req_model_name, - prompt=prompt, - api_url=api_url, - prompt_len=prompt_len, - output_len=output_len, - logprobs=logprobs, - multi_modal_content=mm_content, - ignore_eos=ignore_eos, - extra_body=extra_body, - ) - task = limited_request_func(request_func_input=request_func_input, pbar=pbar) - tasks.append(asyncio.create_task(task)) - outputs: 
list[RequestFuncOutput] = await asyncio.gather(*tasks) - - if profile: - print("Stopping profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_prompt, - api_url=base_url + "/stop_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler stopped") - - if pbar is not None: - pbar.close() - - benchmark_duration = time.perf_counter() - benchmark_start_time - - metrics, actual_output_lens = calculate_metrics( - input_requests=input_requests, - outputs=outputs, - dur_s=benchmark_duration, - tokenizer=tokenizer, - selected_percentile_metrics=selected_percentile_metrics, - selected_percentiles=selected_percentiles, - goodput_config_dict=goodput_config_dict, - ) - - print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) - print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) - print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) - print( - "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", metrics.request_throughput - ) - ) - if goodput_config_dict: - print( - "{:<40} {:<10.2f}".format( - "Request goodput (req/s):", metrics.request_goodput - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", metrics.output_throughput - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Total Token throughput (tok/s):", metrics.total_token_throughput - ) - ) - - result = { - "duration": benchmark_duration, - "completed": metrics.completed, - "total_input_tokens": metrics.total_input, - "total_output_tokens": metrics.total_output, - "request_throughput": metrics.request_throughput, - "request_goodput": metrics.request_goodput if goodput_config_dict else None, - "output_throughput": metrics.output_throughput, - "total_token_throughput": metrics.total_token_throughput, - "input_lens": [output.prompt_len for output in outputs], - "output_lens": actual_output_lens, - "ttfts": [output.ttft for output in outputs], - "itls": [output.itl for output in outputs], - "generated_texts": [output.generated_text for output in outputs], - "errors": [output.error for output in outputs], - } - - if rps_change_events: - result["rps_change_events"] = rps_change_events - - def process_one_metric( - # E.g., "ttft" - metric_attribute_name: str, - # E.g., "TTFT" - metric_name: str, - # E.g., "Time to First Token" - metric_header: str, - ): - # This function prints and adds statistics of the specified - # metric. 
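The optional-semaphore pattern used by `limited_request_func` above is easy to isolate. The following sketch substitutes a dummy coroutine for the real backend request function and caps in-flight requests only when a concurrency limit is set.

```
import asyncio
import random


async def fake_request(i: int) -> float:
    # Stand-in for the real async backend request function.
    await asyncio.sleep(random.uniform(0.01, 0.05))
    return float(i)


async def main(max_concurrency: int | None = 8) -> None:
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited(i: int) -> float:
        if semaphore is None:
            return await fake_request(i)
        async with semaphore:
            return await fake_request(i)

    tasks = [asyncio.create_task(limited(i)) for i in range(100)]
    results = await asyncio.gather(*tasks)
    print(len(results), "requests completed")


asyncio.run(main())
```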
- if metric_attribute_name not in selected_percentile_metrics: - return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) - print( - "{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"), - ) - ) - print( - "{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"), - ) - ) - result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms" - ) - result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms" - ) - result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms" - ) - for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) - result[f"p{p_word}_{metric_attribute_name}_ms"] = value - - process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") - process_one_metric("itl", "ITL", "Inter-token Latency") - process_one_metric("e2el", "E2EL", "End-to-end Latency") - - print("=" * 50) - - return result - - -def check_goodput_args(args): - # Check and parse goodput arguments - goodput_config_dict = {} - VALID_NAMES = ["ttft", "tpot", "e2el"] - if args.goodput: - goodput_config_dict = parse_goodput(args.goodput) - for slo_name, slo_val in goodput_config_dict.items(): - if slo_name not in VALID_NAMES: - raise ValueError( - f"Invalid metric name found, {slo_name}: {slo_val}. " - "The service level objective name should be one of " - f"{str(VALID_NAMES)}. " - ) - if slo_val < 0: - raise ValueError( - f"Invalid value found, {slo_name}: {slo_val}. " - "The service level objective value should be " - "non-negative." - ) - return goodput_config_dict - - -def parse_goodput(slo_pairs): - goodput_config_dict = {} - try: - for slo_pair in slo_pairs: - slo_name, slo_val = slo_pair.split(":") - goodput_config_dict[slo_name] = float(slo_val) - except ValueError as err: - raise argparse.ArgumentTypeError( - "Invalid format found for service level objectives. " - 'Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is a " - "number in milliseconds." - ) from err - return goodput_config_dict - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any], file_name: str -) -> None: - metrics = [ - "median_ttft_ms", - "mean_ttft_ms", - "std_ttft_ms", - "p99_ttft_ms", - "mean_tpot_ms", - "median_tpot_ms", - "std_tpot_ms", - "p99_tpot_ms", - "median_itl_ms", - "mean_itl_ms", - "std_itl_ms", - "p99_itl_ms", - ] - # These raw data might be useful, but they are rather big. 
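# A rough sketch of how the goodput SLOs parsed above are applied: a request
# only counts toward goodput if every configured metric (in milliseconds) stays
# at or below its service level objective. The helper and the per-request dicts
# here are illustrative assumptions, not the script's actual data structures.
def count_good_requests(per_request_ms: list[dict[str, float]], slos_ms: dict[str, float]) -> int:
    good = 0
    for request_metrics in per_request_ms:
        if all(request_metrics.get(name, float("inf")) <= limit for name, limit in slos_ms.items()):
            good += 1
    return good

# Example for the CLI form `--goodput ttft:200 e2el:5000`:
slos = {"ttft": 200.0, "e2el": 5000.0}
requests_ms = [{"ttft": 150.0, "e2el": 4200.0}, {"ttft": 350.0, "e2el": 3000.0}]
print(count_good_requests(requests_ms, slos))  # only the first request met both SLOs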
They can be added - # later if needed - ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={k: [results[k]] for k in metrics}, - extra_info={ - k: results[k] - for k in results - if k not in metrics and k not in ignored_metrics - }, - ) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - np.random.seed(args.seed) - - backend = args.backend - model_id = args.model - model_name = args.served_model_name - tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model - tokenizer_mode = args.tokenizer_mode - - # Validate ramp-up arguments - if args.ramp_up_strategy is not None: - if args.request_rate != float("inf"): - raise ValueError( - "When using ramp-up, do not specify --request-rate. " - "The request rate will be controlled by ramp-up parameters. " - "Please remove the --request-rate argument." - ) - if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None: - raise ValueError( - "When using --ramp-up-strategy, both --ramp-up-start-rps and " - "--ramp-up-end-rps must be specified" - ) - if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0: - raise ValueError("Ramp-up start and end RPS must be non-negative") - if args.ramp_up_start_rps > args.ramp_up_end_rps: - raise ValueError("Ramp-up start RPS must be less than end RPS") - if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0: - raise ValueError("For exponential ramp-up, the start RPS cannot be 0.") - - if args.base_url is not None: - api_url = f"{args.base_url}{args.endpoint}" - base_url = f"{args.base_url}" - else: - api_url = f"http://{args.host}:{args.port}{args.endpoint}" - base_url = f"http://{args.host}:{args.port}" - - tokenizer = get_tokenizer( - tokenizer_id, - tokenizer_mode=tokenizer_mode, - trust_remote_code=args.trust_remote_code, - ) - - if args.dataset_name is None: - raise ValueError( - "Please specify '--dataset-name' and the corresponding " - "'--dataset-path' if required." - ) - - if args.dataset_name == "custom": - dataset = CustomDataset(dataset_path=args.dataset_path) - input_requests = dataset.sample( - num_requests=args.num_prompts, - tokenizer=tokenizer, - output_len=args.custom_output_len, - skip_chat_template=args.custom_skip_chat_template, - ) - - elif args.dataset_name == "sonnet": - dataset = SonnetDataset(dataset_path=args.dataset_path) - # For the "sonnet" dataset, formatting depends on the backend. - if args.backend == "openai-chat": - input_requests = dataset.sample( - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=False, - ) - else: - assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset." 
- ) - input_requests = dataset.sample( - num_requests=args.num_prompts, - input_len=args.sonnet_input_len, - output_len=args.sonnet_output_len, - prefix_len=args.sonnet_prefix_len, - tokenizer=tokenizer, - return_prompt_formatted=True, - ) - - elif args.dataset_name == "hf": - # all following datasets are implemented from the - # HuggingFaceDataset base class - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: - dataset_class = VisionArenaDataset - args.hf_split = "train" - args.hf_subset = None - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: - dataset_class = InstructCoderDataset - args.hf_split = "train" - elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: - dataset_class = MTBenchDataset - args.hf_split = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: - dataset_class = ConversationDataset - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: - dataset_class = AIMODataset - args.hf_split = "train" - elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 - dataset_class = NextEditPredictionDataset - args.hf_split = "train" - elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: - dataset_class = ASRDataset - args.hf_split = "train" - else: - supported_datasets = set( - [ - dataset_name - for cls in HuggingFaceDataset.__subclasses__() - for dataset_name in cls.SUPPORTED_DATASET_PATHS - ] - ) - raise ValueError( - f"Unsupported dataset path: {args.dataset_path}. " - "Huggingface dataset only supports dataset_path" - f" from one of following: {supported_datasets}. " - "Please consider contributing if you would " - "like to add support for additional dataset formats." - ) - - if dataset_class.IS_MULTIMODAL and backend not in [ - "openai-chat", - "openai-audio", - ]: - # multi-modal benchmark is only available on OpenAI Chat backend. - raise ValueError( - "Multi-modal content is only supported on 'openai-chat' and " - "'openai-audio' backend." - ) - input_requests = dataset_class( - dataset_path=args.dataset_path, - dataset_subset=args.hf_subset, - dataset_split=args.hf_split, - random_seed=args.seed, - no_stream=args.no_stream, - ).sample( - num_requests=args.num_prompts, - tokenizer=tokenizer, - output_len=args.hf_output_len, - ) - - else: - # For datasets that follow a similar structure, use a mapping. - dataset_mapping = { - "sharegpt": lambda: ShareGPTDataset( - random_seed=args.seed, dataset_path=args.dataset_path - ).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, - ), - "burstgpt": lambda: BurstGPTDataset( - random_seed=args.seed, dataset_path=args.dataset_path - ).sample(tokenizer=tokenizer, num_requests=args.num_prompts), - "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample( - tokenizer=tokenizer, - num_requests=args.num_prompts, - prefix_len=args.random_prefix_len, - input_len=args.random_input_len, - output_len=args.random_output_len, - range_ratio=args.random_range_ratio, - ), - } - - try: - input_requests = dataset_mapping[args.dataset_name]() - except KeyError as err: - raise ValueError(f"Unknown dataset: {args.dataset_name}") from err - goodput_config_dict = check_goodput_args(args) - - # Collect the sampling parameters. 
- sampling_params = { - k: v - for k, v in { - "top_p": args.top_p, - "top_k": args.top_k, - "min_p": args.min_p, - "temperature": args.temperature, - }.items() - if v is not None - } - - # Sampling parameters are only supported by openai-compatible backend. - if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: - raise ValueError( - "Sampling parameters are only supported by openai-compatible backends." - ) - - if "temperature" not in sampling_params: - sampling_params["temperature"] = 0.0 # Default to greedy decoding. - - if args.backend == "llama.cpp": - # Disable prompt caching in llama.cpp backend - sampling_params["cache_prompt"] = False - - # Avoid GC processing "static" data - reduce pause times. - gc.collect() - gc.freeze() - - benchmark_result = asyncio.run( - benchmark( - backend=backend, - api_url=api_url, - base_url=base_url, - model_id=model_id, - model_name=model_name, - tokenizer=tokenizer, - input_requests=input_requests, - logprobs=args.logprobs, - request_rate=args.request_rate, - burstiness=args.burstiness, - disable_tqdm=args.disable_tqdm, - profile=args.profile, - selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], - ignore_eos=args.ignore_eos, - goodput_config_dict=goodput_config_dict, - max_concurrency=args.max_concurrency, - lora_modules=args.lora_modules, - extra_body=sampling_params, - ramp_up_strategy=args.ramp_up_strategy, - ramp_up_start_rps=args.ramp_up_start_rps, - ramp_up_end_rps=args.ramp_up_end_rps, - ) - ) - - # Save config and results to json - if args.save_result or args.append_result: - result_json: dict[str, Any] = {} - - # Setup - current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") - result_json["date"] = current_dt - result_json["backend"] = backend - result_json["model_id"] = model_id - result_json["tokenizer_id"] = tokenizer_id - result_json["num_prompts"] = args.num_prompts - - # Metadata - if args.metadata: - for item in args.metadata: - if "=" in item: - kvstring = item.split("=") - result_json[kvstring[0].strip()] = kvstring[1].strip() - else: - raise ValueError( - "Invalid metadata format. Please use KEY=VALUE format." 
- ) - # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf" - ) - result_json["burstiness"] = args.burstiness - result_json["max_concurrency"] = args.max_concurrency - - if args.ramp_up_strategy is not None: - result_json["ramp_up_strategy"] = args.ramp_up_strategy - result_json["ramp_up_start_rps"] = args.ramp_up_start_rps - result_json["ramp_up_end_rps"] = args.ramp_up_end_rps - - # Merge with benchmark result - result_json = {**result_json, **benchmark_result} - - if not args.save_detailed: - # Remove fields with too many data points - for field in [ - "input_lens", - "output_lens", - "ttfts", - "itls", - "generated_texts", - "errors", - ]: - if field in result_json: - del result_json[field] - if field in benchmark_result: - del benchmark_result[field] - - # Save to file - base_model_id = model_id.split("/")[-1] - max_concurrency_str = ( - f"-concurrency{args.max_concurrency}" - if args.max_concurrency is not None - else "" - ) - if args.ramp_up_strategy is not None: - file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa - else: - file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa - if args.result_filename: - file_name = args.result_filename - if args.result_dir: - os.makedirs(args.result_dir, exist_ok=True) - file_name = os.path.join(args.result_dir, file_name) - with open( - file_name, mode="a+" if args.append_result else "w", encoding="utf-8" - ) as outfile: - # Append a newline. - if args.append_result and outfile.tell() != 0: - outfile.write("\n") - json.dump(result_json, outfile) - save_to_pytorch_benchmark_format(args, result_json, file_name) - - -def create_argument_parser(): - parser = FlexibleArgumentParser( - description="Benchmark the online serving throughput." - ) - parser.add_argument( - "--backend", - type=str, - default="vllm", - choices=list(ASYNC_REQUEST_FUNCS.keys()), - ) - parser.add_argument( - "--base-url", - type=str, - default=None, - help="Server or API base url if not using http host and port.", - ) - # Use 127.0.0.1 here instead of localhost to force the use of ipv4 - parser.add_argument("--host", type=str, default="127.0.0.1") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument( - "--endpoint", - type=str, - default="/v1/completions", - help="API endpoint.", - ) - parser.add_argument( - "--dataset-name", - type=str, - default="sharegpt", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], - help="Name of the dataset to benchmark on.", - ) - parser.add_argument( - "--dataset-path", - type=str, - default=None, - help="Path to the sharegpt/sonnet dataset. " - "Or the huggingface dataset ID if using HF dataset.", - ) - parser.add_argument( - "--no-stream", - action="store_true", - help="Do not load the dataset in streaming mode.", - ) - parser.add_argument( - "--max-concurrency", - type=int, - default=None, - help="Maximum number of concurrent requests. This can be used " - "to help simulate an environment where a higher level component " - "is enforcing a maximum number of concurrent requests. While the " - "--request-rate argument controls the rate at which requests are " - "initiated, this argument will control how many are actually allowed " - "to execute at a time. 
This means that when used in combination, the " - "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.", - ) - - parser.add_argument( - "--model", - type=str, - required=True, - help="Name of the model.", - ) - parser.add_argument( - "--tokenizer", - type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.", - ) - parser.add_argument( - "--logprobs", - type=int, - default=None, - help=( - "Number of logprobs-per-token to compute & return as part of " - "the request. If unspecified, then either (1) if beam search " - "is disabled, no logprobs are computed & a single dummy " - "logprob is returned for each token; or (2) if beam search " - "is enabled 1 logprob per token is computed" - ), - ) - parser.add_argument( - "--request-rate", - type=float, - default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process or gamma distribution " - "to synthesize the request arrival times.", - ) - parser.add_argument( - "--burstiness", - type=float, - default=1.0, - help="Burstiness factor of the request generation. " - "Only take effect when request_rate is not inf. " - "Default value is 1, which follows Poisson process. " - "Otherwise, the request intervals follow a gamma distribution. " - "A lower burstiness value (0 < burstiness < 1) results in more " - "bursty requests. A higher burstiness value (burstiness > 1) " - "results in a more uniform arrival of requests.", - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="Trust remote code from huggingface", - ) - parser.add_argument( - "--disable-tqdm", - action="store_true", - help="Specify to disable tqdm progress bar.", - ) - parser.add_argument( - "--profile", - action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", - ) - parser.add_argument( - "--save-result", - action="store_true", - help="Specify to save benchmark results to a json file", - ) - parser.add_argument( - "--save-detailed", - action="store_true", - help="When saving the results, whether to include per request " - "information such as response, error, ttfs, tpots, etc.", - ) - parser.add_argument( - "--append-result", - action="store_true", - help="Append the benchmark result to the existing json file.", - ) - parser.add_argument( - "--metadata", - metavar="KEY=VALUE", - nargs="*", - help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " - "for metadata of this run to be saved in the result JSON file " - "for record keeping purposes.", - ) - parser.add_argument( - "--result-dir", - type=str, - default=None, - help="Specify directory to save benchmark json results." - "If not specified, results are saved in the current directory.", - ) - parser.add_argument( - "--result-filename", - type=str, - default=None, - help="Specify the filename to save benchmark json results." 
- "If not specified, results will be saved in " - "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" - " format.", - ) - parser.add_argument( - "--ignore-eos", - action="store_true", - help="Set ignore_eos flag when sending the benchmark request." - "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", - ) - parser.add_argument( - "--percentile-metrics", - type=str, - default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentils. " - "This argument specifies the metrics to report percentiles. " - 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' - 'Default value is "ttft,tpot,itl".', - ) - parser.add_argument( - "--metric-percentiles", - type=str, - default="99", - help="Comma-separated list of percentiles for selected metrics. " - 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' - 'Default value is "99". ' - 'Use "--percentile-metrics" to select metrics.', - ) - parser.add_argument( - "--goodput", - nargs="+", - required=False, - help='Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is in " - 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' - "separated by spaces. Allowed request level metric names are " - '"ttft", "tpot", "e2el". For more context on the definition of ' - "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve", - ) - - # group for dataset specific arguments - custom_group = parser.add_argument_group("custom dataset options") - custom_group.add_argument( - "--custom-output-len", - type=int, - default=256, - help="Number of output tokens per request, used only for custom dataset.", - ) - custom_group.add_argument( - "--custom-skip-chat-template", - action="store_true", - help="Skip applying chat template to prompt, used only for custom dataset.", - ) - - sonnet_group = parser.add_argument_group("sonnet dataset options") - sonnet_group.add_argument( - "--sonnet-input-len", - type=int, - default=550, - help="Number of input tokens per request, used only for sonnet dataset.", - ) - sonnet_group.add_argument( - "--sonnet-output-len", - type=int, - default=150, - help="Number of output tokens per request, used only for sonnet dataset.", - ) - sonnet_group.add_argument( - "--sonnet-prefix-len", - type=int, - default=200, - help="Number of prefix tokens per request, used only for sonnet dataset.", - ) - - sharegpt_group = parser.add_argument_group("sharegpt dataset options") - sharegpt_group.add_argument( - "--sharegpt-output-len", - type=int, - default=None, - help="Output length for each request. Overrides the output length " - "from the ShareGPT dataset.", - ) - - random_group = parser.add_argument_group("random dataset options") - random_group.add_argument( - "--random-input-len", - type=int, - default=1024, - help="Number of input tokens per request, used only for random sampling.", - ) - random_group.add_argument( - "--random-output-len", - type=int, - default=128, - help="Number of output tokens per request, used only for random sampling.", - ) - random_group.add_argument( - "--random-range-ratio", - type=float, - default=0.0, - help="Range ratio for sampling input/output length, " - "used only for random sampling. 
Must be in the range [0, 1) to define " - "a symmetric sampling range" - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) - random_group.add_argument( - "--random-prefix-len", - type=int, - default=0, - help=( - "Number of fixed prefix tokens before the random context " - "in a request. " - "The total input length is the sum of `random-prefix-len` and " - "a random " - "context length sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]." - ), - ) - - hf_group = parser.add_argument_group("hf dataset options") - hf_group.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." - ) - hf_group.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." - ) - hf_group.add_argument( - "--hf-output-len", - type=int, - default=None, - help="Output length for each request. Overrides the output lengths " - "from the sampled HF dataset.", - ) - - sampling_group = parser.add_argument_group("sampling parameters") - sampling_group.add_argument( - "--top-p", - type=float, - default=None, - help="Top-p sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--top-k", - type=int, - default=None, - help="Top-k sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--min-p", - type=float, - default=None, - help="Min-p sampling parameter. Only has effect on openai-compatible backends.", - ) - sampling_group.add_argument( - "--temperature", - type=float, - default=None, - help="Temperature sampling parameter. Only has effect on " - "openai-compatible backends. If not specified, default to greedy " - "decoding (i.e. temperature==0.0).", - ) - - parser.add_argument( - "--tokenizer-mode", - type=str, - default="auto", - choices=["auto", "slow", "mistral", "custom"], - help='The tokenizer mode.\n\n* "auto" will use the ' - 'fast tokenizer if available.\n* "slow" will ' - "always use the slow tokenizer. \n* " - '"mistral" will always use the `mistral_common` tokenizer. \n*' - '"custom" will use --tokenizer to select the preregistered tokenizer.', - ) - - parser.add_argument( - "--served-model-name", - type=str, - default=None, - help="The model name used in the API. " - "If not specified, the model name will be the " - "same as the ``--model`` argument. ", - ) - - parser.add_argument( - "--lora-modules", - nargs="+", - default=None, - help="A subset of LoRA module names passed in when " - "launching the server. For each request, the " - "script chooses a LoRA module at random.", - ) - - parser.add_argument( - "--ramp-up-strategy", - type=str, - default=None, - choices=["linear", "exponential"], - help="The ramp-up strategy. This would be used to " - "ramp up the request rate from initial RPS to final " - "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). " - "over the duration of the benchmark.", - ) - parser.add_argument( - "--ramp-up-start-rps", - type=int, - default=None, - help="The starting request rate for ramp-up (RPS). " - "Needs to be specified when --ramp-up-strategy is used.", - ) - parser.add_argument( - "--ramp-up-end-rps", - type=int, - default=None, - help="The ending request rate for ramp-up (RPS). " - "Needs to be specified when --ramp-up-strategy is used.", - ) +if __name__ == "__main__": + print("""DEPRECATED: This script has been moved to the vLLM CLI. 
- return parser +Please use the following command instead: + vllm bench serve +For help with the new command, run: + vllm bench serve --help -if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - main(args) +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench serve --help +""") + sys.exit(1) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index e23a5a9e2233..539ab2ed0a4d 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -4,7 +4,7 @@ On the server side, run one of the following commands: (vLLM OpenAI API server) - vllm serve --disable-log-requests + vllm serve On the client side, run: python benchmarks/benchmark_serving_structured_output.py \ @@ -31,20 +31,19 @@ import uuid import warnings from collections.abc import AsyncGenerator +from contextlib import nullcontext from dataclasses import dataclass -from typing import Optional import datasets import numpy as np import pandas as pd -from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase - from backend_request_func import ( ASYNC_REQUEST_FUNCS, RequestFuncInput, RequestFuncOutput, ) +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase try: from vllm.transformers_utils.tokenizer import get_tokenizer @@ -317,7 +316,7 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: list[str], selected_percentiles: list[float], - goodput_config_dict: Optional[dict[str, float]] = None, + goodput_config_dict: dict[str, float] | None = None, ) -> tuple[BenchmarkMetrics, list[int]]: actual_output_lens: list[int] = [] total_input = 0 @@ -437,9 +436,9 @@ async def benchmark( selected_percentile_metrics: list[str], selected_percentiles: list[str], ignore_eos: bool, - max_concurrency: Optional[int], + max_concurrency: int | None, structured_output_ratio: float, - goodput_config_dict: Optional[dict[str, float]] = None, + goodput_config_dict: dict[str, float] | None = None, ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -449,7 +448,8 @@ async def benchmark( def prepare_extra_body(request) -> dict: extra_body = {} # Add the schema to the extra_body - extra_body[request.structure_type] = request.schema + extra_body["structured_outputs"] = {} + extra_body["structured_outputs"][request.structure_type] = request.schema return extra_body print("Starting initial single prompt test run...") @@ -502,15 +502,9 @@ def prepare_extra_body(request) -> dict: pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. 
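# The pattern being cleaned up here: when --max-concurrency is unset, the
# limiter should be a no-op. On Python 3.10+, contextlib.nullcontext() also
# supports `async with`, so the optional semaphore collapses to a single
# expression instead of a None check. A standalone sketch of that idiom
# (names and the example workload are ours):
import asyncio
from contextlib import nullcontext

async def run_all(coros, max_concurrency: int | None):
    # Real semaphore when a limit is configured, otherwise a do-nothing context.
    limiter = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited(coro):
        async with limiter:
            return await coro

    return await asyncio.gather(*(limited(c) for c in coros))

# Example: asyncio.run(run_all([asyncio.sleep(0.1) for _ in range(8)], max_concurrency=2))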
- # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) - semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None + semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext() async def limited_request_func(request_func_input, pbar): - if semaphore is None: - return await request_func(request_func_input=request_func_input, pbar=pbar) async with semaphore: return await request_func(request_func_input=request_func_input, pbar=pbar) @@ -538,20 +532,6 @@ async def limited_request_func(request_func_input, pbar): ) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) - if profile: - print("Stopping profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_request.prompt, - api_url=base_url + "/stop_profile", - prompt_len=test_request.prompt_len, - output_len=test_request.expected_output_len, - extra_body={test_request.structure_type: test_request.schema}, - ) - profile_output = await request_func(request_func_input=profile_input) - if profile_output.success: - print("Profiler stopped") - if pbar is not None: pbar.close() @@ -569,6 +549,10 @@ async def limited_request_func(request_func_input, pbar): print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) + if request_rate != float("inf"): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) @@ -666,6 +650,20 @@ def process_one_metric( print("=" * 50) + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + extra_body={test_request.structure_type: test_request.schema}, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + return result, ret @@ -692,11 +690,11 @@ def _eval_correctness_regex(expected, actual): return re.match(args.regex, actual) is not None def _eval_correctness(expected, actual): - if args.structure_type == "guided_json": + if args.structure_type == "json": return _eval_correctness_json(expected, actual) - elif args.structure_type == "guided_regex": + elif args.structure_type == "regex": return _eval_correctness_regex(expected, actual) - elif args.structure_type == "guided_choice": + elif args.structure_type == "choice": return _eval_correctness_choice(expected, actual) else: return None @@ -776,18 +774,18 @@ def main(args: argparse.Namespace): ) if args.dataset == "grammar": - args.structure_type = "guided_grammar" + args.structure_type = "grammar" elif args.dataset == "regex": - args.structure_type = "guided_regex" + args.structure_type = "regex" elif args.dataset == "choice": - args.structure_type = "guided_choice" + args.structure_type = "choice" else: - args.structure_type = "guided_json" + args.structure_type = "json" if args.no_structured_output: args.structured_output_ratio = 0 if args.save_results: - result_file_name = f"{args.structured_output_ratio}guided" + 
result_file_name = f"{args.structured_output_ratio}so" result_file_name += f"_{backend}" result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.model.split('/')[-1]}" @@ -905,13 +903,13 @@ def create_argument_parser(): parser.add_argument( "--tokenizer", type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", ) parser.add_argument( "--tokenizer-mode", type=str, default="auto", - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", ) parser.add_argument( "--num-prompts", @@ -994,7 +992,7 @@ def create_argument_parser(): "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentiles. " "This argument specifies the metrics to report percentiles. " 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' 'Default value is "ttft,tpot,itl".', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 14461121fece..b6dc0918fd4d 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,736 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Benchmark offline inference throughput.""" +import sys -import argparse -import dataclasses -import json -import os -import random -import time -import warnings -from typing import Any, Optional, Union - -import torch -import uvloop -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase - -from benchmark_dataset import ( - AIMODataset, - BurstGPTDataset, - ConversationDataset, - InstructCoderDataset, - RandomDataset, - SampleRequest, - ShareGPTDataset, - SonnetDataset, - VisionArenaDataset, -) -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args, -) -from vllm.inputs import TextPrompt, TokensPrompt -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput -from vllm.sampling_params import BeamSearchParams -from vllm.utils import FlexibleArgumentParser, merge_async_iterators - - -def run_vllm( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -) -> tuple[float, Optional[list[RequestOutput]]]: - from vllm import LLM, SamplingParams - - llm = LLM(**dataclasses.asdict(engine_args)) - assert all( - llm.llm_engine.model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests." - ) - # Add the requests to the engine. 
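# Why the throughput benchmark builds its SamplingParams the way it does just
# below: ignore_eos=True forces every request to emit its full expected output
# length so token throughput is comparable across models, temperature=1.0 and
# top_p=1.0 leave sampling unrestricted, and detokenize can be switched off to
# exclude detokenization cost from the measurement. A standalone sketch under
# those assumptions (the helper name is ours):
from vllm import SamplingParams

def benchmark_sampling_params(expected_output_len: int, n: int = 1, disable_detokenize: bool = False) -> SamplingParams:
    return SamplingParams(
        n=n,
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,                    # always generate exactly max_tokens
        max_tokens=expected_output_len,     # fixed per-request output budget
        detokenize=not disable_detokenize,  # optionally skip detokenization time
    )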
- prompts: list[Union[TextPrompt, TokensPrompt]] = [] - sampling_params: list[SamplingParams] = [] - for request in requests: - prompts.append( - TokensPrompt( - prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data, - ) - if "prompt_token_ids" in request.prompt - else TextPrompt( - prompt=request.prompt, multi_modal_data=request.multi_modal_data - ) - ) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - lora_requests: Optional[list[LoRARequest]] = None - if engine_args.enable_lora: - lora_requests = [request.lora_request for request in requests] - - use_beam_search = False - - outputs = None - if not use_beam_search: - start = time.perf_counter() - outputs = llm.generate( - prompts, sampling_params, lora_request=lora_requests, use_tqdm=True - ) - end = time.perf_counter() - else: - assert lora_requests is None, "BeamSearch API does not support LoRA" - prompts = [request.prompt for request in requests] - # output_len should be the same for all requests. - output_len = requests[0].expected_output_len - for request in requests: - assert request.expected_output_len == output_len - start = time.perf_counter() - llm.beam_search( - prompts, - BeamSearchParams( - beam_width=n, - max_tokens=output_len, - ignore_eos=True, - ), - ) - end = time.perf_counter() - return end - start, outputs - - -def run_vllm_chat( - requests: list[SampleRequest], - n: int, - engine_args: EngineArgs, - disable_detokenize: bool = False, -) -> tuple[float, list[RequestOutput]]: - """ - Run vLLM chat benchmark. This function is recommended ONLY for benchmarking - multimodal models as it properly handles multimodal inputs and chat - formatting. For non-multimodal models, use run_vllm() instead. - """ - from vllm import LLM, SamplingParams - - llm = LLM(**dataclasses.asdict(engine_args)) - - assert all( - llm.llm_engine.model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of " - "prompt_len and expected_output_len for all requests." - ) - - prompts = [] - sampling_params: list[SamplingParams] = [] - for request in requests: - prompts.append(request.prompt) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - start = time.perf_counter() - outputs = llm.chat(prompts, sampling_params, use_tqdm=True) - end = time.perf_counter() - return end - start, outputs - - -async def run_vllm_async( - requests: list[SampleRequest], - n: int, - engine_args: AsyncEngineArgs, - disable_frontend_multiprocessing: bool = False, - disable_detokenize: bool = False, -) -> float: - from vllm import SamplingParams - - async with build_async_engine_client_from_engine_args( - engine_args, disable_frontend_multiprocessing - ) as llm: - model_config = await llm.get_model_config() - assert all( - model_config.max_model_len - >= (request.prompt_len + request.expected_output_len) - for request in requests - ), ( - "Please ensure that max_model_len is greater than the sum of" - " prompt_len and expected_output_len for all requests." - ) - - # Add the requests to the engine. 
- prompts: list[Union[TextPrompt, TokensPrompt]] = [] - sampling_params: list[SamplingParams] = [] - lora_requests: list[Optional[LoRARequest]] = [] - for request in requests: - prompts.append( - TokensPrompt( - prompt_token_ids=request.prompt["prompt_token_ids"], - multi_modal_data=request.multi_modal_data, - ) - if "prompt_token_ids" in request.prompt - else TextPrompt( - prompt=request.prompt, multi_modal_data=request.multi_modal_data - ) - ) - sampling_params.append( - SamplingParams( - n=n, - temperature=1.0, - top_p=1.0, - ignore_eos=True, - max_tokens=request.expected_output_len, - detokenize=not disable_detokenize, - ) - ) - lora_requests.append(request.lora_request) - - generators = [] - start = time.perf_counter() - for i, (prompt, sp, lr) in enumerate( - zip(prompts, sampling_params, lora_requests) - ): - generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") - generators.append(generator) - all_gens = merge_async_iterators(*generators) - async for i, res in all_gens: - pass - end = time.perf_counter() - return end - start - - -def run_hf( - requests: list[SampleRequest], - model: str, - tokenizer: PreTrainedTokenizerBase, - n: int, - max_batch_size: int, - trust_remote_code: bool, - disable_detokenize: bool = False, -) -> float: - llm = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code - ) - if llm.config.model_type == "llama": - # To enable padding in the HF backend. - tokenizer.pad_token = tokenizer.eos_token - llm = llm.cuda() - - pbar = tqdm(total=len(requests)) - start = time.perf_counter() - batch: list[str] = [] - max_prompt_len = 0 - max_output_len = 0 - for i in range(len(requests)): - prompt = requests[i].prompt - prompt_len = requests[i].prompt_len - output_len = requests[i].expected_output_len - # Add the prompt to the batch. - batch.append(prompt) - max_prompt_len = max(max_prompt_len, prompt_len) - max_output_len = max(max_output_len, output_len) - if len(batch) < max_batch_size and i != len(requests) - 1: - # Check if we can add more requests to the batch. - next_prompt_len = requests[i + 1].prompt_len - next_output_len = requests[i + 1].expected_output_len - if ( - max(max_prompt_len, next_prompt_len) - + max(max_output_len, next_output_len) - ) <= 2048: - # We can add more requests to the batch. - continue - - # Generate the sequences. - input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids - llm_outputs = llm.generate( - input_ids=input_ids.cuda(), - do_sample=True, - num_return_sequences=n, - temperature=1.0, - top_p=1.0, - use_cache=True, - max_new_tokens=max_output_len, - ) - if not disable_detokenize: - # Include the decoding time. - tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) - pbar.update(len(batch)) - - # Clear the batch. 
- batch = [] - max_prompt_len = 0 - max_output_len = 0 - end = time.perf_counter() - return end - start - - -def run_mii( - requests: list[SampleRequest], - model: str, - tensor_parallel_size: int, - output_len: int, -) -> float: - from mii import client, serve - - llm = serve(model, tensor_parallel=tensor_parallel_size) - prompts = [request.prompt for request in requests] - - start = time.perf_counter() - llm.generate(prompts, max_new_tokens=output_len) - end = time.perf_counter() - client = client(model) - client.terminate_server() - return end - start - - -def save_to_pytorch_benchmark_format( - args: argparse.Namespace, results: dict[str, Any] -) -> None: - pt_records = convert_to_pytorch_benchmark_format( - args=args, - metrics={ - "requests_per_second": [results["requests_per_second"]], - "tokens_per_second": [results["tokens_per_second"]], - }, - extra_info={ - k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"] - }, - ) - if pt_records: - # Don't use json suffix here as we don't want CI to pick it up - pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json" - write_to_json(pt_file, pt_records) - - -def get_requests(args, tokenizer): - # Common parameters for all dataset types. - common_kwargs = { - "dataset_path": args.dataset_path, - "random_seed": args.seed, - } - sample_kwargs = { - "tokenizer": tokenizer, - "lora_path": args.lora_path, - "max_loras": args.max_loras, - "num_requests": args.num_prompts, - "input_len": args.input_len, - "output_len": args.output_len, - } - - if args.dataset_path is None or args.dataset_name == "random": - sample_kwargs["range_ratio"] = args.random_range_ratio - sample_kwargs["prefix_len"] = args.prefix_len - dataset_cls = RandomDataset - elif args.dataset_name == "sharegpt": - dataset_cls = ShareGPTDataset - if args.backend == "vllm-chat": - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_name == "sonnet": - assert tokenizer.chat_template or tokenizer.default_chat_template, ( - "Tokenizer/model must have chat template for sonnet dataset." 
- ) - dataset_cls = SonnetDataset - sample_kwargs["prefix_len"] = args.prefix_len - sample_kwargs["return_prompt_formatted"] = True - elif args.dataset_name == "burstgpt": - dataset_cls = BurstGPTDataset - elif args.dataset_name == "hf": - common_kwargs["no_stream"] = args.no_stream - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = VisionArenaDataset - common_kwargs["dataset_subset"] = None - common_kwargs["dataset_split"] = "train" - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = InstructCoderDataset - common_kwargs["dataset_split"] = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: - dataset_cls = ConversationDataset - common_kwargs["dataset_subset"] = args.hf_subset - common_kwargs["dataset_split"] = args.hf_split - sample_kwargs["enable_multimodal_chat"] = True - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: - dataset_cls = AIMODataset - common_kwargs["dataset_subset"] = None - common_kwargs["dataset_split"] = "train" - else: - raise ValueError(f"Unknown dataset name: {args.dataset_name}") - # Remove None values - sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None} - return dataset_cls(**common_kwargs).sample(**sample_kwargs) - - -def main(args: argparse.Namespace): - if args.seed is None: - args.seed = 0 - print(args) - random.seed(args.seed) - # Sample the requests. - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code - ) - requests = get_requests(args, tokenizer) - is_multi_modal = any(request.multi_modal_data is not None for request in requests) - request_outputs: Optional[list[RequestOutput]] = None - if args.backend == "vllm": - if args.async_engine: - elapsed_time = uvloop.run( - run_vllm_async( - requests, - args.n, - AsyncEngineArgs.from_cli_args(args), - args.disable_frontend_multiprocessing, - args.disable_detokenize, - ) - ) - else: - elapsed_time, request_outputs = run_vllm( - requests, - args.n, - EngineArgs.from_cli_args(args), - args.disable_detokenize, - ) - elif args.backend == "hf": - assert args.tensor_parallel_size == 1 - elapsed_time = run_hf( - requests, - args.model, - tokenizer, - args.n, - args.hf_max_batch_size, - args.trust_remote_code, - args.disable_detokenize, - ) - elif args.backend == "mii": - elapsed_time = run_mii( - requests, args.model, args.tensor_parallel_size, args.output_len - ) - elif args.backend == "vllm-chat": - elapsed_time, request_outputs = run_vllm_chat( - requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize - ) - else: - raise ValueError(f"Unknown backend: {args.backend}") - - if request_outputs: - # Note: with the vllm and vllm-chat backends, - # we have request_outputs, which we use to count tokens. 
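# The token totals counted just below feed three simple ratios in the final
# report: requests/s, total tokens/s, and output tokens/s. A small illustrative
# helper (the name and arguments are ours, not the script's):
def throughput_summary(num_requests: int, total_tokens: int, output_tokens: int, elapsed_s: float) -> str:
    return (
        f"Throughput: {num_requests / elapsed_s:.2f} requests/s, "
        f"{total_tokens / elapsed_s:.2f} total tokens/s, "
        f"{output_tokens / elapsed_s:.2f} output tokens/s"
    )

# Example: throughput_summary(1000, 1_200_000, 200_000, 60.0)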
- total_prompt_tokens = 0 - total_output_tokens = 0 - for ro in request_outputs: - if not isinstance(ro, RequestOutput): - continue - total_prompt_tokens += ( - len(ro.prompt_token_ids) if ro.prompt_token_ids else 0 - ) - total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o) - total_num_tokens = total_prompt_tokens + total_output_tokens - else: - total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests) - total_output_tokens = sum(r.expected_output_len for r in requests) - total_prompt_tokens = total_num_tokens - total_output_tokens - - if is_multi_modal and args.backend != "vllm-chat": - print( - "\033[91mWARNING\033[0m: Multi-modal request with " - f"{args.backend} backend detected. The " - "following metrics are not accurate because image tokens are not" - " counted. See vllm-project/vllm/issues/9778 for details." - ) - # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. - # vllm-chat backend counts the image tokens now - - print( - f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " - f"{total_output_tokens / elapsed_time:.2f} output tokens/s" - ) - print(f"Total num prompt tokens: {total_prompt_tokens}") - print(f"Total num output tokens: {total_output_tokens}") - - # Output JSON results if specified - if args.output_json: - results = { - "elapsed_time": elapsed_time, - "num_requests": len(requests), - "total_num_tokens": total_num_tokens, - "requests_per_second": len(requests) / elapsed_time, - "tokens_per_second": total_num_tokens / elapsed_time, - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - save_to_pytorch_benchmark_format(args, results) - - -def validate_args(args): - """ - Validate command-line arguments. - """ - - # === Deprecation and Defaulting === - if args.dataset is not None: - warnings.warn( - "The '--dataset' argument will be deprecated in the next release. " - "Please use '--dataset-name' and '--dataset-path' instead.", - stacklevel=2, - ) - args.dataset_path = args.dataset - - if not getattr(args, "tokenizer", None): - args.tokenizer = args.model - - # === Backend Validation === - valid_backends = {"vllm", "hf", "mii", "vllm-chat"} - if args.backend not in valid_backends: - raise ValueError(f"Unsupported backend: {args.backend}") - - # === Dataset Configuration === - if not args.dataset and not args.dataset_path: - print("When dataset path is not set, it will default to random dataset") - args.dataset_name = "random" - if args.input_len is None: - raise ValueError("input_len must be provided for a random dataset") - - # === Dataset Name Specific Checks === - # --hf-subset and --hf-split: only used - # when dataset_name is 'hf' - if args.dataset_name != "hf" and ( - getattr(args, "hf_subset", None) is not None - or getattr(args, "hf_split", None) is not None - ): - warnings.warn( - "--hf-subset and --hf-split will be ignored \ - since --dataset-name is not 'hf'.", - stacklevel=2, - ) - elif args.dataset_name == "hf": - if args.dataset_path in ( - VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys() - | ConversationDataset.SUPPORTED_DATASET_PATHS - ): - assert args.backend == "vllm-chat", ( - f"{args.dataset_path} needs to use vllm-chat as the backend." - ) # noqa: E501 - elif args.dataset_path in ( - InstructCoderDataset.SUPPORTED_DATASET_PATHS - | AIMODataset.SUPPORTED_DATASET_PATHS - ): - assert args.backend == "vllm", ( - f"{args.dataset_path} needs to use vllm as the backend." 
- ) # noqa: E501 - else: - raise ValueError(f"{args.dataset_path} is not supported by hf dataset.") - - # --random-range-ratio: only used when dataset_name is 'random' - if args.dataset_name != "random" and args.random_range_ratio is not None: - warnings.warn( - "--random-range-ratio will be ignored since \ - --dataset-name is not 'random'.", - stacklevel=2, - ) - - # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not - # set. - if ( - args.dataset_name not in {"random", "sonnet", None} - and args.prefix_len is not None - ): - warnings.warn( - "--prefix-len will be ignored since --dataset-name\ - is not 'random', 'sonnet', or not set.", - stacklevel=2, - ) - - # === LoRA Settings === - if getattr(args, "enable_lora", False) and args.backend != "vllm": - raise ValueError("LoRA benchmarking is only supported for vLLM backend") - if getattr(args, "enable_lora", False) and args.lora_path is None: - raise ValueError("LoRA path must be provided when enable_lora is True") - - # === Backend-specific Validations === - if args.backend == "hf" and args.hf_max_batch_size is None: - raise ValueError("HF max batch size is required for HF backend") - if args.backend != "hf" and args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF backend.") - - if ( - args.backend in {"hf", "mii"} - and getattr(args, "quantization", None) is not None - ): - raise ValueError("Quantization is only for vLLM backend.") - - if args.backend == "mii" and args.dtype != "auto": - raise ValueError("dtype must be auto for MII backend.") - if args.backend == "mii" and args.n != 1: - raise ValueError("n must be 1 for MII backend.") - if args.backend == "mii" and args.tokenizer != args.model: - raise ValueError("Tokenizer must be the same as the model for MII backend.") - - # --data-parallel is not supported currently. - # https://github.com/vllm-project/vllm/issues/16222 - if args.data_parallel_size > 1: - raise ValueError( - "Data parallel is not supported in offline benchmark, \ - please use benchmark serving instead" - ) - - -def create_argument_parser(): - parser = FlexibleArgumentParser(description="Benchmark the throughput.") - parser.add_argument( - "--backend", - type=str, - choices=["vllm", "hf", "mii", "vllm-chat"], - default="vllm", - ) - parser.add_argument( - "--dataset-name", - type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], - help="Name of the dataset to benchmark on.", - default="sharegpt", - ) - parser.add_argument( - "--no-stream", - action="store_true", - help="Do not load the dataset in streaming mode.", - ) - parser.add_argument( - "--dataset", - type=str, - default=None, - help="Path to the ShareGPT dataset, will be deprecated in\ - the next release. The dataset is expected to " - "be a json in form of list[dict[..., conversations: " - "list[dict[..., value: ]]]]", - ) - parser.add_argument( - "--dataset-path", type=str, default=None, help="Path to the dataset" - ) - parser.add_argument( - "--input-len", - type=int, - default=None, - help="Input prompt length for each request", - ) - parser.add_argument( - "--output-len", - type=int, - default=None, - help="Output length for each request. Overrides the " - "output length from the dataset.", - ) - parser.add_argument( - "--n", type=int, default=1, help="Number of generated sequences per prompt." - ) - parser.add_argument( - "--num-prompts", type=int, default=1000, help="Number of prompts to process." 
- ) - parser.add_argument( - "--hf-max-batch-size", - type=int, - default=None, - help="Maximum batch size for HF backend.", - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the throughput results in JSON format.", - ) - parser.add_argument( - "--async-engine", - action="store_true", - default=False, - help="Use vLLM async engine rather than LLM class.", - ) - parser.add_argument( - "--disable-frontend-multiprocessing", - action="store_true", - default=False, - help="Disable decoupled async engine frontend.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=( - "Do not detokenize the response (i.e. do not include " - "detokenization time in the measurement)" - ), - ) - # LoRA - parser.add_argument( - "--lora-path", - type=str, - default=None, - help="Path to the LoRA adapters to use. This can be an absolute path, " - "a relative path, or a Hugging Face model identifier.", - ) - parser.add_argument( - "--prefix-len", - type=int, - default=None, - help=f"Number of prefix tokens to be used in RandomDataset " - "and SonnetDataset. For RandomDataset, the total input " - "length is the sum of prefix-len (default: " - f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length " - "sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]. For SonnetDataset, " - f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) " - "controls how much of the input is fixed lines versus " - "random lines, but the total input length remains approximately " - "input_len tokens.", - ) - # random dataset - parser.add_argument( - "--random-range-ratio", - type=float, - default=None, - help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) " - "for sampling input/output length, " - "used only for RandomDataset. Must be in the range [0, 1) to " - "define a symmetric sampling range " - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) - - # hf dtaset - parser.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." - ) - parser.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." - ) - - parser = AsyncEngineArgs.add_cli_args(parser) +if __name__ == "__main__": + print("""DEPRECATED: This script has been moved to the vLLM CLI. 
- return parser +Please use the following command instead: + vllm bench throughput +For help with the new command, run: + vllm bench throughput --help -if __name__ == "__main__": - parser = create_argument_parser() - args = parser.parse_args() - if args.tokenizer is None: - args.tokenizer = args.model - validate_args(args) - main(args) +Alternatively, you can run the new command directly with: + python -m vllm.entrypoints.cli.main bench throughput --help +""") + sys.exit(1) diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 283f938df50a..f0d661f9d534 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import argparse import json import math import os +import time +from types import TracebackType from typing import Any @@ -72,3 +73,53 @@ def write_to_json(filename: str, records: list) -> None: cls=InfEncoder, default=lambda o: f"<{type(o).__name__} object is not JSON serializable>", ) + + +# Collect time and generate time metrics +# +# Example Usage: +# collector = TimeCollector(TimeCollector.US) +# for _ in range(total_iteration): +# with collector: +# ... +# collector.dump_avg_max() +class TimeCollector: + NS: int = 1 + US: int = NS * 1000 + MS: int = US * 1000 + S: int = MS * 1000 + + def __init__(self, scale: int) -> None: + self.cnt: int = 0 + self._sum: int = 0 + self._max: int | None = None + self.scale = scale + self.start_time: int = time.monotonic_ns() + + def collect(self, v: int) -> None: + self.cnt += 1 + self._sum += v + if self._max is None: + self._max = v + else: + self._max = max(self._max, v) + + def avg(self) -> float | str: + return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" + + def max(self) -> float | str: + return self._max / self.scale if self._max else "N/A" + + def dump_avg_max(self) -> list[float | str]: + return [self.avg(), self.max()] + + def __enter__(self) -> None: + self.start_time = time.monotonic_ns() + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + self.collect(time.monotonic_ns() - self.start_time) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 9ec270bbd2e9..22fc2678fd1c 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -6,8 +6,7 @@ import itertools import pickle as pkl import time -from collections.abc import Iterable -from typing import Callable +from collections.abc import Callable, Iterable import torch import torch.utils.benchmark as TBenchmark diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index a5a5b52f6039..2deebf3ddb7a 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -6,8 +6,7 @@ import itertools import pickle as pkl import time -from collections.abc import Iterable -from typing import Callable, Optional +from collections.abc import Callable, Iterable import torch import torch.utils.benchmark as TBenchmark @@ -17,7 +16,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - w8a8_block_fp8_matmul, + w8a8_triton_block_scaled_mm, ) from vllm.utils import FlexibleArgumentParser, cdiv @@ -53,7 +52,7 @@ 
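# Usage sketch for the TimeCollector helper added above: wrap the region to be
# timed in `with collector:` and read back the average and maximum in the
# chosen unit. The sleep below is a placeholder workload, and the import
# assumes the benchmarks/ directory is on sys.path.
import time
from benchmark_utils import TimeCollector

collector = TimeCollector(TimeCollector.US)  # report in microseconds
for _ in range(100):
    with collector:
        time.sleep(0.001)  # stand-in for the code under measurement

avg_us, max_us = collector.dump_avg_max()
print(f"avg: {avg_us} us, max: {max_us} us")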
def bench_int8( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: """Benchmark INT8-based kernels.""" assert dtype == torch.int8 @@ -108,7 +107,7 @@ def bench_fp8( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: """Benchmark FP8-based kernels.""" assert dtype == torch.float8_e4m3fn @@ -158,7 +157,7 @@ def bench_fp8( "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16) ), - "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul( + "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm( a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128) ), "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm( @@ -183,7 +182,7 @@ def bench( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: if dtype == torch.int8: return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) @@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]): def run( dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]], - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 94999630bae1..d683835db96a 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -3,7 +3,7 @@ # benchmark the overhead of disaggregated prefill. # methodology: # - send all request to prefill vLLM instance. It will buffer KV cache. -# - then send all request to decode instance. +# - then send all request to decode instance. # - The TTFT of decode instance is the overhead. set -ex @@ -12,6 +12,8 @@ kill_gpu_processes() { # kill all processes on GPU. 
pgrep pt_main_thread | xargs -r kill -9 pgrep python3 | xargs -r kill -9 + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 sleep 10 # remove vllm config file @@ -53,61 +55,57 @@ benchmark() { output_len=$2 - CUDA_VISIBLE_DEVICES=0 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=0 vllm serve $model \ --port 8100 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - CUDA_VISIBLE_DEVICES=1 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + + CUDA_VISIBLE_DEVICES=1 vllm serve $model \ --port 8200 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & wait_for_server 8100 wait_for_server 8200 # let the prefill instance finish prefill - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8100 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_tp1.json \ - --request-rate "inf" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1.json \ + --request-rate "inf" # send the request to decode. # The TTFT of this command will be the overhead of disagg prefill impl. - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8200 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_tp1_overhead.json \ - --request-rate "$qps" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp1_overhead.json \ + --request-rate "$qps" kill_gpu_processes } diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index eb5d891d0d4a..35c86cc84522 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -18,6 +18,8 @@ kill_gpu_processes() { # kill all processes on GPU. 
pgrep pt_main_thread | xargs -r kill -9 pgrep python3 | xargs -r kill -9 + # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 + pgrep VLLM | xargs -r kill -9 for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done sleep 1 } @@ -36,16 +38,12 @@ wait_for_server() { launch_chunked_prefill() { model="meta-llama/Meta-Llama-3.1-8B-Instruct" # disagg prefill - CUDA_VISIBLE_DEVICES=0 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=0 vllm serve $model \ --port 8100 \ --max-model-len 10000 \ --enable-chunked-prefill \ --gpu-memory-utilization 0.6 & - CUDA_VISIBLE_DEVICES=1 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=1 vllm serve $model \ --port 8200 \ --max-model-len 10000 \ --enable-chunked-prefill \ @@ -58,25 +56,21 @@ launch_chunked_prefill() { launch_disagg_prefill() { - model="meta-llama/Meta-Llama-3.1-8B-Instruct" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" # disagg prefill - CUDA_VISIBLE_DEVICES=0 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=0 vllm serve $model \ --port 8100 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - CUDA_VISIBLE_DEVICES=1 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=1 vllm serve $model \ --port 8200 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & wait_for_server 8100 wait_for_server 8200 @@ -97,20 +91,20 @@ benchmark() { output_len=$2 tag=$3 - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len "$output_len" \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8000 \ - --save-result \ - --result-dir $results_folder \ - --result-filename "$tag"-qps-"$qps".json \ - --request-rate "$qps" + vllm bench serve \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename "$tag"-qps-"$qps".json \ + --request-rate "$qps" sleep 2 } diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index f62d8102e2d9..904f80534914 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,63 +1,199 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import asyncio +import logging import os import aiohttp -from quart import Quart, make_response, request - -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) - -app = 
Quart(__name__) - - -async def forward_request(url, data): - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: +from quart import Quart, Response, make_response, request +from rate_limiter import RateLimiter +from request_queue import RequestQueue + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + """parse command line arguments""" + parser = argparse.ArgumentParser(description="vLLM P/D disaggregation proxy server") + + # Add args + parser.add_argument( + "--timeout", + type=float, + default=300, + help="Timeout for backend service requests in seconds (default: 300)", + ) + parser.add_argument( + "--max-concurrent", + type=int, + default=100, + help="Maximum concurrent requests to backend services (default: 100)", + ) + parser.add_argument( + "--queue-size", + type=int, + default=500, + help="Maximum number of requests in the queue (default: 500)", + ) + parser.add_argument( + "--rate-limit", + type=int, + default=40, + help="Maximum requests per second (default: 40)", + ) + parser.add_argument( + "--port", + type=int, + default=8000, + help="Port to run the server on (default: 8000)", + ) + parser.add_argument( + "--prefill-url", + type=str, + default="http://localhost:8100/v1/completions", + help="Prefill service endpoint URL", + ) + parser.add_argument( + "--decode-url", + type=str, + default="http://localhost:8200/v1/completions", + help="Decode service endpoint URL", + ) + + return parser.parse_args() + + +def main(): + """parse command line arguments""" + args = parse_args() + + # Initialize configuration using command line parameters + AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout) + MAX_CONCURRENT_REQUESTS = args.max_concurrent + REQUEST_QUEUE_SIZE = args.queue_size + RATE_LIMIT = args.rate_limit + PREFILL_SERVICE_URL = args.prefill_url + DECODE_SERVICE_URL = args.decode_url + PORT = args.port + + app = Quart(__name__) + + # Initialize the rate limiter and request queue + rate_limiter = RateLimiter(RATE_LIMIT) + request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE) + + # Attach the configuration object to the application instance + app.config.update( + { + "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, + "rate_limiter": rate_limiter, + "request_queue": request_queue, + "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, + "DECODE_SERVICE_URL": DECODE_SERVICE_URL, + } + ) + + # Start queue processing on app startup + @app.before_serving + async def startup(): + """Start request processing task when app starts serving""" + asyncio.create_task(request_queue.process()) + + async def forward_request(url, data): + """Forward request to backend service with rate limiting and error handling""" headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - async with session.post(url=url, json=data, headers=headers) as response: - if response.status == 200: - # if response.headers.get('Transfer-Encoding') == 'chunked': - if True: - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes - else: - content = await response.read() - yield content - - -@app.route("/v1/completions", methods=["POST"]) -async def handle_request(): - try: - original_request_data = await request.get_json() - - prefill_request = original_request_data.copy() - # change max_tokens = 1 to let it only do prefill - prefill_request["max_tokens"] = 1 - - # finish prefill - async for _ in forward_request( - "http://localhost:8100/v1/completions", prefill_request - ): - continue 
- # return decode - generator = forward_request( - "http://localhost:8200/v1/completions", original_request_data - ) - response = await make_response(generator) - response.timeout = None - - return response - - except Exception as e: - import sys - import traceback - - exc_info = sys.exc_info() - print("Error occurred in disagg prefill proxy server") - print(e) - print("".join(traceback.format_exception(*exc_info))) + # Use rate limiter as context manager + async with ( + rate_limiter, + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + ): + try: + async with session.post( + url=url, json=data, headers=headers + ) as response: + if response.status == 200: + # Stream response chunks + async for chunk_bytes in response.content.iter_chunked(1024): + yield chunk_bytes + else: + # Handle backend service errors + error_text = await response.text() + logger.error( + "Backend service error: %s - %s", + response.status, + error_text, + ) + yield b'{"error": "Backend service error"}' + except aiohttp.ClientError as e: + # Handle connection errors + logger.error("Connection error to %s: %s", url, str(e)) + yield b'{"error": "Service unavailable"}' + except asyncio.TimeoutError: + # Handle timeout errors + logger.error("Timeout connecting to %s", url) + yield b'{"error": "Service timeout"}' + + async def process_request(): + """Process a single request through prefill and decode stages""" + try: + original_request_data = await request.get_json() + + # Create prefill request (max_tokens=1) + prefill_request = original_request_data.copy() + prefill_request["max_tokens"] = 1 + + # Execute prefill stage + async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request): + continue + + # Execute decode stage and stream response + generator = forward_request(DECODE_SERVICE_URL, original_request_data) + response = await make_response(generator) + response.timeout = None # Disable timeout for streaming response + return response + + except Exception: + logger.exception("Error processing request") + return Response( + response=b'{"error": "Internal server error"}', + status=500, + content_type="application/json", + ) + + @app.route("/v1/completions", methods=["POST"]) + async def handle_request(): + """Handle incoming API requests with concurrency and rate limiting""" + # Create task for request processing + task = asyncio.create_task(process_request()) + + # Enqueue request or reject if queue is full + if not await request_queue.enqueue(task): + return Response( + response=b'{"error": "Server busy, try again later"}', + status=503, + content_type="application/json", + ) + + try: + # Return the response from the processing task + return await task + except asyncio.CancelledError: + # Handle task cancellation (timeout or queue full) + logger.warning("Request cancelled due to timeout or queue full") + return Response( + response=b'{"error": "Request cancelled"}', + status=503, + content_type="application/json", + ) + + # Start the Quart server with host can be set to 0.0.0.0 + app.run(port=PORT) if __name__ == "__main__": - app.run(port=8000) + main() diff --git a/benchmarks/disagg_benchmarks/rate_limiter.py b/benchmarks/disagg_benchmarks/rate_limiter.py new file mode 100644 index 000000000000..87ac8cb6ab1a --- /dev/null +++ b/benchmarks/disagg_benchmarks/rate_limiter.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +import time + + +class RateLimiter: + """Token bucket rate limiter implementation""" 
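+ #
+ # Illustrative usage (mirrors how disagg_prefill_proxy_server.py consumes it):
+ #   limiter = RateLimiter(rate_limit=40)
+ #   async with limiter:   # waits until a token is available
+ #       ...               # at most `rate_limit` requests proceed per one-second window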
+ + def __init__(self, rate_limit): + self.rate_limit = rate_limit # Requests per second + self.num_available_tokens = rate_limit # Available tokens + self.last_refill = time.monotonic() # Last token refill time + self.lock = asyncio.Lock() # Synchronization lock + + async def acquire(self): + """Acquire a token from the rate limiter""" + while True: + async with self.lock: + current_time = time.monotonic() + elapsed = current_time - self.last_refill + + # Refill num_available_tokens if more than 1 second has passed + if elapsed > 1.0: + self.num_available_tokens = self.rate_limit + self.last_refill = current_time + + # Check if num_available_tokens are available + if self.num_available_tokens > 0: + self.num_available_tokens -= 1 + return True + + # Calculate wait time if no num_available_tokens available + wait_time = 1.0 - elapsed + await asyncio.sleep(wait_time) + + async def __aenter__(self): + """Enter async context manager - acquire token""" + await self.acquire() + return self + + async def __aexit__(self, exc_type, exc_value, traceback): + """Exit async context manager - no cleanup needed""" + pass diff --git a/benchmarks/disagg_benchmarks/request_queue.py b/benchmarks/disagg_benchmarks/request_queue.py new file mode 100644 index 000000000000..410bcb956050 --- /dev/null +++ b/benchmarks/disagg_benchmarks/request_queue.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +from collections import deque + + +class RequestQueue: + """Request queue manager with concurrency control""" + + def __init__(self, max_concurrent, max_queue_size): + # Maximum concurrent requests + self.max_concurrent = max_concurrent + self.max_queue_size = max_queue_size # Maximum queue size + # Concurrency control + self.semaphore = asyncio.Semaphore(max_concurrent) + self.queue = deque() # Request queue + self.queue_size = 0 # Current queue size + self.lock = asyncio.Lock() # Sync queue Lock + + async def enqueue(self, task): + """Add a request task to the queue""" + async with self.lock: + if self.queue_size >= self.max_queue_size: + return False + + self.queue.append(task) + self.queue_size += 1 + return True + + async def process(self): + """Process queued requests using semaphore for concurrency control""" + while True: + if self.queue: + async with self.semaphore, self.lock: + task = self.queue.popleft() + self.queue_size -= 1 + await task + await asyncio.sleep(0.01) # Yield control to event loop diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index 901524214469..d809bf1db8cb 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -3,10 +3,9 @@ import pickle as pkl import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]: def unfused_int8_impl( rms_norm_layer: RMSNorm, x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): # Norm @@ -68,7 +67,7 @@ def unfused_int8_impl( def unfused_fp8_impl( rms_norm_layer: RMSNorm, x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): # Norm @@ -85,7 +84,7 
@@ def unfused_fp8_impl( def fused_impl( rms_norm_layer: RMSNorm, # this stores the weights x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): out, _ = ops.rms_norm_dynamic_per_token_quant( diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py new file mode 100644 index 000000000000..f1e504499eaf --- /dev/null +++ b/benchmarks/kernels/bench_block_fp8_gemm.py @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + apply_w8a8_block_fp8_linear, +) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + CUTLASS_BLOCK_FP8_SUPPORTED, +) +from vllm.platforms import current_platform +from vllm.triton_utils import triton as vllm_triton + +assert current_platform.is_cuda(), ( + "Only support benchmarking w8a8 block fp8 kernel on CUDA device." +) + +# DeepSeek-V3 weight shapes +DEEPSEEK_V3_SHAPES = [ + (512 + 64, 7168), + (2112, 7168), + ((128 + 64) * 128, 7168), + (128 * (128 + 128), 512), + (7168, 16384), + (7168, 18432), + (18432 * 2, 7168), + (24576, 1536), + (12288, 7168), + (4096, 7168), + (7168, 2048), +] + + +def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): + """Build runner function for w8a8 block fp8 matmul.""" + factor_for_scale = 1e-2 + + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + # Create random FP8 tensors + A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max + + B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max + B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + # Create scales + block_n, block_k = block_size[0], block_size[1] + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + + Bs = ( + torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device) + * factor_for_scale + ) + + # SM90 CUTLASS requires row-major format for scales + if use_cutlass and current_platform.is_device_capability(90): + Bs = Bs.T.contiguous() + + def run(): + if use_cutlass: + return apply_w8a8_block_fp8_linear( + A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True + ) + else: + return apply_w8a8_block_fp8_linear( + A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False + ) + + return run + + +# Determine available providers +available_providers = ["torch-bf16", "w8a8-block-fp8-triton"] +plot_title = "BF16 vs W8A8 Block FP8 GEMMs" + +if CUTLASS_BLOCK_FP8_SUPPORTED: + available_providers.append("w8a8-block-fp8-cutlass") + + +@vllm_triton.testing.perf_report( + vllm_triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=available_providers, + line_names=available_providers, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs W8A8 Block FP8 GEMMs", + args={}, + ) +) +def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)): + M = batch_size + device = "cuda" + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + a = torch.randn((M, K), device=device, dtype=torch.bfloat16) + b = torch.randn((N, K), device=device, dtype=torch.bfloat16) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + elif 
provider == "w8a8-block-fp8-triton": + run_w8a8_triton = build_w8a8_block_fp8_runner( + M, N, K, block_size, device, use_cutlass=False + ) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: run_w8a8_triton(), quantiles=quantiles + ) + elif provider == "w8a8-block-fp8-cutlass": + run_w8a8_cutlass = build_w8a8_block_fp8_runner( + M, N, K, block_size, device, use_cutlass=True + ) + ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph( + lambda: run_w8a8_cutlass(), quantiles=quantiles + ) + else: + raise ValueError(f"Unknown provider: {provider}") + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +if __name__ == "__main__": + block_size = (128, 128) + + for N, K in DEEPSEEK_V3_SHAPES: + print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}") + + print(f"TFLOP/s comparison (block_size={block_size}):") + benchmark_tflops.run( + print_data=True, + # show_plots=False, + # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}", + N=N, + K=K, + block_size=block_size, + ) + + print("\nBenchmark finished!") diff --git a/benchmarks/kernels/bench_mxfp4_qutlass.py b/benchmarks/kernels/bench_mxfp4_qutlass.py new file mode 100644 index 000000000000..dfc7721876a1 --- /dev/null +++ b/benchmarks/kernels/bench_mxfp4_qutlass.py @@ -0,0 +1,191 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at). +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse +import copy +import itertools + +import torch +from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn +from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked +from vllm.triton_utils import triton + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "mxfp4": dict(no_a_quant=False, enabled=True), + "mxfp4-noquant": dict(no_a_quant=True, enabled=True), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device): + return ( + deterministic_hadamard_matrix(group_size, dtype=dtype, device=device) + * group_size**-0.5 + ) + + +def _quant_weight_mxfp4( + b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str +): + weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx( + b, forward_hadamard_matrix, method="abs_max" + ) + weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton") + return weight_hf_e2m1, weight_hf_scale_block + + +def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device): + weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4( + b, forward_hadamard_matrix, device + ) + alpha = torch.tensor([1.0], device="cuda") + + if cfg["no_a_quant"]: + # Pre-quantize activation + input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx( + a, forward_hadamard_matrix, method="abs_max" + ) + input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton") + + def run(): + return matmul_mxf4_bf16_tn( + input_hf_e2m1, + weight_hf_e2m1, + input_hf_scale_block, + weight_hf_scale_block, + alpha, + ) + + return run + + # Quantize activation on-the-fly + def run(): + input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx( + a, forward_hadamard_matrix, method="abs_max" + ) + input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton") + return matmul_mxf4_bf16_tn( + input_hf_e2m1, + weight_hf_e2m1, + input_hf_scale_block, + weight_hf_scale_block, + alpha, + ) + + return run + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[ + 1, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 1024, + 2048, + 4096, + 8192, + 16384, + 24576, + 32768, + ], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs MXFP4 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K, had_size): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_mxfp4_runner( + cfg, a, b, forward_hadamard_matrix, dtype, device + ) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), rep=200, quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] 
//= tp_size + KN.append(model) + out.append(KN) + return out + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.3-70B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + for had_size in [32, 64, 128]: + print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_mxfp4_res_n{N}_k{K}", + N=N, + K=K, + had_size=had_size, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/bench_nvfp4_gemm.py b/benchmarks/kernels/bench_nvfp4_gemm.py index 9e832c9faa8e..6b19eb113f3e 100644 --- a/benchmarks/kernels/bench_nvfp4_gemm.py +++ b/benchmarks/kernels/bench_nvfp4_gemm.py @@ -3,6 +3,7 @@ import argparse import copy import itertools +import os import torch from weight_shapes import WEIGHT_SHAPES @@ -23,21 +24,45 @@ "torch-bf16": dict(enabled=True), "nvfp4": dict(no_a_quant=False, enabled=True), "nvfp4-noquant": dict(no_a_quant=True, enabled=True), + "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True), + "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True), } +_needs_fbgemm = any( + v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False) +) +if _needs_fbgemm: + try: + from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import ( + triton_scale_nvfp4_quant, + ) + except ImportError: + print( + "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. " + "These providers will be skipped. Please install fbgemm_gpu with: " + "'pip install fbgemm-gpu-genai' to run them." + ) + # Disable FBGEMM providers so the benchmark can run. + for cfg in PROVIDER_CFGS.values(): + if cfg.get("fbgemm"): + cfg["enabled"] = False + _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] -def _quant_weight_nvfp4(b: torch.Tensor, device: str): +def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg): # Compute global scale for weight b_amax = torch.abs(b).max().to(torch.float32) b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax - b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale) + if "fbgemm" in cfg and cfg["fbgemm"]: + b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale) + else: + b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale) return b_fp4, scale_b_fp4, b_global_scale def build_nvfp4_runner(cfg, a, b, dtype, device): - b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device) + b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg) # Compute global scale for activation # NOTE: This is generally provided ahead-of-time by the model checkpoint. 
@@ -46,6 +71,35 @@ def build_nvfp4_runner(cfg, a, b, dtype, device): # Alpha for the GEMM operation alpha = 1.0 / (a_global_scale * b_global_scale) + if "fbgemm" in cfg and cfg["fbgemm"]: + if cfg["no_a_quant"]: + a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale) + + def run(): + return torch.ops.fbgemm.f4f4bf16( + a_fp4, + b_fp4, + scale_a_fp4, + scale_b_fp4, + global_scale=alpha, + use_mx=False, + ) + + return run + else: + + def run(): + a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale) + return torch.ops.fbgemm.f4f4bf16( + a_fp4, + b_fp4, + scale_a_fp4, + scale_b_fp4, + global_scale=alpha, + use_mx=False, + ) + + return run if cfg["no_a_quant"]: # Pre-quantize activation @@ -130,10 +184,13 @@ def prepare_shapes(args): for K, N, model in prepare_shapes(args): print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:") + save_dir = f"bench_nvfp4_res_n{N}_k{K}" + os.makedirs(save_dir, exist_ok=True) + benchmark.run( print_data=True, show_plots=True, - save_path=f"bench_nvfp4_res_n{N}_k{K}", + save_path=save_dir, N=N, K=K, ) diff --git a/benchmarks/kernels/bench_nvfp4_qutlass.py b/benchmarks/kernels/bench_nvfp4_qutlass.py new file mode 100644 index 000000000000..6fecc816f946 --- /dev/null +++ b/benchmarks/kernels/bench_nvfp4_qutlass.py @@ -0,0 +1,207 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at). +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse +import copy +import itertools + +import torch +from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm +from vllm._custom_ops import fusedQuantizeNv +from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked +from vllm.triton_utils import triton + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "nvfp4": dict(no_a_quant=False, enabled=True), + "nvfp4-noquant": dict(no_a_quant=True, enabled=True), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device): + return ( + deterministic_hadamard_matrix(group_size, dtype=dtype, device=device) + * group_size**-0.5 + ) + + +def _quant_weight_nvfp4( + b: torch.Tensor, + forward_hadamard_matrix: torch.Tensor, + global_scale: torch.Tensor, + device: str, + M: int, + N: int, + K: int, +): + weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv( + b, forward_hadamard_matrix, global_scale + ) + weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view( + -1, K // 16 + ) + return weight_hf_e2m1, weight_hf_scale_block + + +def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K): + alpha = torch.tensor([1.0], device="cuda") + global_scale = torch.tensor([1.0], device="cuda") + weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4( + b, forward_hadamard_matrix, global_scale, device, M, N, K + ) + + if cfg["no_a_quant"]: + # Pre-quantize activation + input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv( + a, forward_hadamard_matrix, global_scale + ) + input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view( + -1, K // 16 + ) + + def run(): + return ops.cutlass_scaled_fp4_mm( + input_hf_e2m1, + weight_hf_e2m1, + input_hf_scale_block, + weight_hf_scale_block, + alpha, + torch.bfloat16, + ) + + return run + + # Quantize activation on-the-fly + def run(): + input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv( + a, forward_hadamard_matrix, global_scale + ) + input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view( + -1, K // 16 + ) + return ops.cutlass_scaled_fp4_mm( + input_hf_e2m1, + weight_hf_e2m1, + input_hf_scale_block, + weight_hf_scale_block, + alpha, + torch.bfloat16, + ) + + return run + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[ + 1, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 1024, + 2048, + 4096, + 8192, + 16384, + 24576, + 32768, + ], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs NVFP4 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K, had_size): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_nvfp4_runner( + cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K + ) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), rep=200, quantiles=quantiles 
+ ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.3-70B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + for had_size in [16, 32, 64, 128]: + print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_nvfp4_res_n{N}_k{K}", + N=N, + K=K, + had_size=had_size, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py index 923d678f1f2d..d33b84fc3601 100644 --- a/benchmarks/kernels/bench_per_token_quant_fp8.py +++ b/benchmarks/kernels/bench_per_token_quant_fp8.py @@ -1,15 +1,27 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Callable +from collections.abc import Callable +from unittest.mock import patch +import pandas as pd import torch -from vllm import _custom_ops as ops -from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.triton_utils import triton +from vllm.utils import FlexibleArgumentParser +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE + + +def with_triton_mode(fn): + """Temporarily force the Triton fallback path""" + + def wrapped(*args, **kwargs): + with patch("vllm.platforms.current_platform.is_cuda", return_value=False): + return fn(*args, **kwargs) + + return wrapped # TODO(luka): use standalone_compile utility @@ -21,78 +33,238 @@ def inner(*args): return inner -torch._dynamo.config.recompile_limit = 8888 -compilation_config = CompilationConfig(custom_ops=["none"]) -with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)): - torch_per_token_quant_fp8 = torch.compile( - QuantFP8(False, GroupShape.PER_TOKEN), - fullgraph=True, - dynamic=False, # recompile for different shapes - ) +def bench_compile(fn: Callable): + # recompile for different shapes + fwd = torch.compile(fn, fullgraph=True, dynamic=False) # First dim is explicitly dynamic to simulate vLLM usage - torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0) + return with_dyn_arg(fwd, 0, 0) -def cuda_per_token_quant_fp8( - input: torch.Tensor, -) -> tuple[torch.Tensor, torch.Tensor]: - return ops.scaled_fp8_quant(input) +torch._dynamo.config.recompile_limit = 8888 -def calculate_diff(batch_size: int, seq_len: int): - """Calculate difference between Triton and CUDA implementations.""" +def calculate_diff( + batch_size: int, + hidden_size: int, + group_shape: GroupShape, + dtype: torch.dtype, +): + """Calculate the difference between Inductor and CUDA implementations.""" device = torch.device("cuda") - x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, 
device=device) + x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device) + + quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False) - torch_out, torch_scale = torch_per_token_quant_fp8(x) - cuda_out, cuda_scale = cuda_per_token_quant_fp8(x) + torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x) + torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x) + cuda_out, cuda_scale = quant_fp8.forward_cuda(x) - if torch.allclose( - cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5 - ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5): + try: + torch.testing.assert_close( + cuda_out.to(torch.float32), + torch_out.to(torch.float32), + rtol=1e-3, + atol=1e-5, + ) + torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5) + torch.testing.assert_close( + cuda_out.to(torch.float32), + torch_eager_out.to(torch.float32), + rtol=1e-3, + atol=1e-5, + ) + torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5) print("✅ All implementations match") - else: + except AssertionError as e: print("❌ Implementations differ") + print(e) -batch_size_range = [1, 16, 32, 64, 128] -seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] +configs = [] -configs = list(itertools.product(batch_size_range, seq_len_range)) - -@triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["batch_size", "seq_len"], - x_vals=configs, - line_arg="provider", - line_vals=["torch", "cuda"], - line_names=["Torch", "CUDA"], - styles=[("blue", "-"), ("green", "-")], - ylabel="us", - plot_name="per-token-dynamic-quant-fp8-performance", - args={}, - ) -) -def benchmark_quantization(batch_size, seq_len, provider): - dtype = torch.float16 +def benchmark_quantization( + batch_size, + hidden_size, + provider, + group_shape: GroupShape, + col_major: bool, + dtype: torch.dtype, +): device = torch.device("cuda") - x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype) + x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype) quantiles = [0.5, 0.2, 0.8] + quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major) if provider == "torch": - fn = lambda: torch_per_token_quant_fp8(x.clone()) + fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone()) elif provider == "cuda": - fn = lambda: cuda_per_token_quant_fp8(x.clone()) + fn = lambda: quant_fp8.forward_cuda(x.clone()) + elif provider == "triton": + if not group_shape.is_per_group(): + # Triton only supported for per-group + return 0, 0, 0 + + fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone()) ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles) return 1000 * ms, 1000 * max_ms, 1000 * min_ms +# TODO(luka) extract to utils +def compute_geomean_speedups( + df: pd.DataFrame, + baseline_col: str, + speedup_cols: list[str], + groupby_cols: list[str] | None = None, +) -> pd.DataFrame: + """ + Compute geometric mean speedups over a baseline column. + + Args: + df: Input dataframe + baseline_col: Column to use as baseline + speedup_cols: Columns to compute speedups for + groupby_cols: Columns to group by. If None, compute over entire df. 
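+
+ Example (illustrative): if one config shows a 2x speedup over the baseline
+ and another shows 4x, the reported geometric mean is sqrt(2 * 4) ≈ 2.83x.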
+ + Returns: + pd.DataFrame with geometric mean speedups + """ + from scipy.stats import gmean + + def geo_speedup(group: pd.DataFrame) -> pd.Series: + ratios = { + col: (group[baseline_col] / group[col]).values for col in speedup_cols + } + return pd.Series({col: gmean(vals) for col, vals in ratios.items()}) + + if groupby_cols is None: + result = geo_speedup(df).to_frame().T + else: + result = ( + df.groupby(groupby_cols) + .apply(geo_speedup, include_groups=False) + .reset_index() + ) + + return result + + if __name__ == "__main__": - calculate_diff(batch_size=4, seq_len=4096) - benchmark_quantization.run(print_data=True) + parser = FlexibleArgumentParser( + description="Benchmark the various implementations of QuantFP8 (dynamic-only)" + ) + parser.add_argument("-c", "--check", action="store_true") + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16" + ) + parser.add_argument( + "--hidden-sizes", + type=int, + nargs="+", + default=[896, 1024, 2048, 4096, 7168], + help="Hidden sizes to benchmark", + ) + parser.add_argument( + "--batch-sizes", + type=int, + nargs="+", + default=[1, 16, 128, 512, 1024], + help="Batch sizes to benchmark", + ) + parser.add_argument( + "--group-sizes", + type=int, + nargs="+", + default=None, + help="Group sizes for GroupShape(1,N) to benchmark. " + "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)", + ) + parser.add_argument( + "--no-column-major", + action="store_true", + help="Disable column-major scales testing", + ) + + args = parser.parse_args() + assert args + + dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype] + + hidden_sizes = args.hidden_sizes + batch_sizes = args.batch_sizes + + if args.group_sizes is not None: + group_shapes = [] + for size in args.group_sizes: + if size == 0: + group_shapes.append(GroupShape.PER_TENSOR) + elif size == -1: + group_shapes.append(GroupShape.PER_TOKEN) + else: + group_shapes.append(GroupShape(1, size)) + else: + group_shapes = [ + GroupShape.PER_TENSOR, + GroupShape.PER_TOKEN, + GroupShape(1, 64), + GroupShape(1, 128), + ] + + column_major_scales = [False] if args.no_column_major else [True, False] + + config_gen = itertools.product( + group_shapes, + column_major_scales, + batch_sizes, + hidden_sizes, + ) + + # filter out column-major scales for non-group, reverse order + configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1])) + + print(f"Running {len(configs)} configurations:") + print(f" Hidden sizes: {hidden_sizes}") + print(f" Batch sizes: {batch_sizes}") + print(f" Group shapes: {[str(g) for g in group_shapes]}") + print(f" Column major scales: {column_major_scales}") + print() + + if args.check: + for group_shape in group_shapes: + group_size = group_shape[1] + print(f"{group_size=}") + calculate_diff( + batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype + ) + + benchmark = triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["hidden_size", "batch_size", "col_major", "group_shape"], + x_vals=configs, + line_arg="provider", + line_vals=["torch", "cuda", "triton"], + line_names=["Torch (Compiled)", "CUDA", "Triton"], + styles=[("blue", "-"), ("green", "-"), ("black", "-")], + ylabel="us", + plot_name="QuantFP8 performance", + args={}, + ) + )(benchmark_quantization) + + df = benchmark.run(print_data=True, dtype=dtype, return_df=True) + + # Print geomean speedups + geo_table_grouped = compute_geomean_speedups( + df, + baseline_col="Torch (Compiled)", + speedup_cols=["CUDA", "Triton"], + 
groupby_cols=["col_major", "group_shape"], + ) + + print("Speedup over Torch (Compiled)") + print(geo_table_grouped.to_string(index=False)) diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py new file mode 100644 index 000000000000..7662655b5efa --- /dev/null +++ b/benchmarks/kernels/benchmark_activation.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# benchmark custom activation op performance +import itertools + +import torch + +import vllm.model_executor.layers.activation # noqa F401 +from vllm.model_executor.custom_op import CustomOp +from vllm.platforms import current_platform +from vllm.triton_utils import triton +from vllm.utils import FlexibleArgumentParser +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE + +batch_size_range = [1, 16, 32, 64, 128] +seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] +intermediate_size = [3072, 9728, 12288] +configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) + + +def benchmark_activation( + batch_size: int, + seq_len: int, + intermediate_size: int, + provider: str, + func_name: str, + dtype: torch.dtype, +): + device = "cuda" + num_tokens = batch_size * seq_len + dim = intermediate_size + current_platform.seed_everything(42) + torch.set_default_device(device) + + if func_name == "gelu_and_mul": + layer = CustomOp.op_registry[func_name](approximate="none") + elif func_name == "gelu_and_mul_tanh": + layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh") + elif func_name == "fatrelu_and_mul": + threshold = 0.5 + layer = CustomOp.op_registry[func_name](threshold) + else: + layer = CustomOp.op_registry[func_name]() + + x = torch.randn(num_tokens, dim, dtype=dtype, device=device) + compiled_layer = torch.compile(layer.forward_native) + + if provider == "custom": + fn = lambda: layer(x) + elif provider == "compiled": + fn = lambda: compiled_layer(x) + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + fn, quantiles=[0.5, 0.2, 0.8] + ) + return ms, max_ms, min_ms + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the custom activation op.") + parser.add_argument( + "--func-name", + type=str, + choices=[ + "mul_and_silu", + "silu_and_mul", + "gelu_and_mul", + "gelu_and_mul_tanh", + "fatrelu_and_mul", + "swigluoai_and_mul", + "gelu_new", + "gelu_fast", + "quick_gelu", + ], + default="silu_and_mul", + ) + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16" + ) + args = parser.parse_args() + assert args + + func_name = args.func_name + dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype] + + perf_report = triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size", "seq_len", "intermediate_size"], + x_vals=configs, + line_arg="provider", + line_vals=["custom", "compiled"], + line_names=["Custom OP", "Compiled"], + styles=[("blue", "-"), ("green", "-")], + ylabel="ms", + plot_name=f"{func_name}-op-performance", + args={}, + ) + ) + + perf_report( + lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation( + batch_size, seq_len, intermediate_size, provider, func_name, dtype + ) + ).run(print_data=True) diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py deleted file mode 100644 index 42de062b08e4..000000000000 --- a/benchmarks/kernels/benchmark_aqlm.py +++ /dev/null @@ -1,345 +0,0 @@ -# 
SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -import sys -from typing import Optional - -import torch -import torch.nn.functional as F - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.aqlm import ( - dequantize_weight, - generic_dequantize_gemm, - get_int_dtype, - optimized_dequantize_gemm, -) -from vllm.utils import FlexibleArgumentParser - -os.environ["CUDA_VISIBLE_DEVICES"] = "0" - - -def torch_mult( - # [..., in_features] - input: torch.Tensor, - weights: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, -) -> torch.Tensor: - output = F.linear(input, weights) - return output - - -def dequant_out_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - if bias is None: - output = F.linear(input, weights, bias) - orig_shape = output.shape - flattened_output = output.view(-1, output.size(-1)) - f_scales = scales.view(-1, scales.shape[0]) - b_scales = f_scales.expand(flattened_output.shape[0], -1) - flattened_output *= b_scales - return flattened_output.view(orig_shape) - else: - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_weight_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1]) - weights *= b_scales - return F.linear(input, weights, bias) - - -def dequant_no_scale( - # [..., in_features] - input: torch.Tensor, - # [num_out_groups, num_in_groups, num_codebooks] - codes: torch.IntTensor, - # [num_codebooks, codebook_size, out_group_size, in_group_size] - codebooks: torch.Tensor, - # [num_out_groups, 1, 1, 1] - scales: torch.Tensor, - output_partition_sizes: torch.IntTensor, - bias: Optional[torch.Tensor], -) -> torch.Tensor: - weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes) - - return F.linear(input, weights, bias) - - -# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against -# the generic pytorch version. -# Just visual comparison. 
-def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - count = 0 - for index in range(16): - for i in range(8): - for book in range(nbooks): - codebooks[book, index, 0, i] = count * (10**book) - count += 1 - - print("codes shape", codes.shape) - - for i in range(16): - for book in range(nbooks): - codes[0, i, book] = i - codes[0, -i, book] = i - - weights = dequantize_weight(codes, codebooks, None) - weights2 = ops.aqlm_dequant(codes, codebooks, parts) - - print("weights shape:", weights.shape) - print("weights2 shape:", weights2.shape) - - print("weights are:", weights) - print("weights2 are:", weights2) - - print("first 128 weights are", weights[0, 0:128].to(torch.int32)) - print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32)) - - print("last 128 weights are", weights[0, -128:]) - print("last 128 weights2 are:", weights2[0, -128:]) - - -def main(): - parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") - - # Add arguments - parser.add_argument( - "--nbooks", type=int, default=1, help="Number of codebooks (default: 1)" - ) - parser.add_argument( - "--bits", - type=int, - default=16, - help="Number of bits per code element (default: 16)", - ) - parser.add_argument( - "--test", - type=bool, - default=False, - help="Run the decompression/dequant tester rather than benchmarking " - "(default: False)", - ) - - # Parse the arguments - args = parser.parse_args() - - # Extract values - nbooks = args.nbooks - bits = args.bits - - if args.test: - dequant_test(4096, torch.tensor((4096,)), nbooks, bits) - return - - # Otherwise, benchmark. - methods = [ - ops.aqlm_gemm, - dequant_out_scale, - generic_dequantize_gemm, - optimized_dequantize_gemm, - dequant_weight_scale, - torch_mult, - dequant_no_scale, - ] - - filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv" - print(f"writing benchmarks to file {filename}") - with open(filename, "w") as f: - sys.stdout = f - - print("m | k | n | n parts", end="") - for method in methods: - print(f" | {method.__name__.replace('_', ' ')} (µs)", end="") - print("") - - # These are reasonable prefill sizes. - ksandpartions = ( - (4096, (4096, 4096, 4096)), - (4096, (4096,)), - (4096, (11008, 11008)), - (11008, (4096,)), - ) - - # reasonable ranges for m. - for m in [ - 1, - 2, - 4, - 8, - 10, - 12, - 14, - 16, - 24, - 32, - 48, - 52, - 56, - 64, - 96, - 112, - 128, - 256, - 512, - 1024, - 1536, - 2048, - 3072, - 4096, - ]: - print(f"{m}", file=sys.__stdout__) - for ksp in ksandpartions: - run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods) - - sys.stdout = sys.__stdout__ - - -def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): - # I didn't see visible improvements from increasing these, but feel free :) - num_warmup_trials = 1 - num_trials = 1 - - num_calls = 100 - - # warmup. 
- for method in methods: - for _ in range(num_warmup_trials): - run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - n = parts.sum().item() - print(f"{m} | {k} | {n} | {parts.tolist()}", end="") - - for method in methods: - best_time_us = 1e20 - for _ in range(num_trials): - kernel_dur_ms = run_timing( - num_calls=num_calls, - m=m, - k=k, - parts=parts, - nbooks=nbooks, - bits=bits, - method=method, - ) - - kernel_dur_us = 1000 * kernel_dur_ms - - if kernel_dur_us < best_time_us: - best_time_us = kernel_dur_us - - print(f" | {kernel_dur_us:.0f}", end="") - - print("") - - -def run_timing( - num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method -) -> float: - n = int(parts.sum().item()) - - device = torch.device("cuda:0") - - input = torch.randn((1, m, k), dtype=torch.float16, device=device) - - code_range = (1 << bits) // 2 - ingroups = 8 - - codes = torch.randint( - -code_range, - code_range, - size=(n, k // ingroups, nbooks), - dtype=get_int_dtype(bits), - device=device, - ) - - codebooks = torch.randn( - size=(parts.shape[0] * nbooks, 1 << bits, 1, 8), - dtype=torch.float16, - device=device, - ) - - scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device) - - # for comparison to just a pytorch mult. - weights = torch.randn((n, k), dtype=torch.float16, device=device) - - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - start_event.record() - - if method is torch_mult: - for i in range(num_calls): - torch_mult(input, weights, scales) - else: - for i in range(num_calls): - method(input, codes, codebooks, scales, parts, None) - - end_event.record() - end_event.synchronize() - - dur_ms = start_event.elapsed_time(end_event) / num_calls - return dur_ms - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index 97ee06034137..66b44c27d6ee 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -3,6 +3,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +from packaging import version + from vllm.model_executor.layers.quantization.utils.bitblas_utils import ( MINIMUM_BITBLAS_VERSION, ) @@ -10,7 +12,7 @@ try: import bitblas - if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION): raise ImportError( "bitblas version is wrong. 
Please " f"install bitblas>={MINIMUM_BITBLAS_VERSION}" diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index 35c20ee41b9a..726a2a371d10 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -13,6 +13,10 @@ from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import ( + fp8_w8a8_moe_quant_config, + nvfp4_moe_quant_config, +) from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.scalar_type import scalar_types @@ -140,6 +144,12 @@ def run_triton_moe( a_fp8_scale: torch.Tensor, num_repeats: int, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) + for _ in range(num_repeats): fused_experts( a, @@ -147,10 +157,7 @@ def run_triton_moe( w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_fp8_scale, + quant_config=quant_config, ) def run_cutlass_moe_fp4( @@ -172,25 +179,27 @@ def run_cutlass_moe_fp4( device: torch.device, num_repeats: int, ): + quant_config = nvfp4_moe_quant_config( + a1_gscale=a1_gs, + a2_gscale=a2_gs, + w1_scale=w1_blockscale, + w2_scale=w2_blockscale, + g1_alphas=w1_gs, + g2_alphas=w2_gs, + ) for _ in range(num_repeats): with nvtx.annotate("cutlass_moe_fp4", color="green"): cutlass_moe_fp4( a=a, - a1_gscale=a1_gs, - a2_gscale=a2_gs, w1_fp4=w1_fp4, - w1_blockscale=w1_blockscale, - w1_alphas=w1_gs, w2_fp4=w2_fp4, - w2_blockscale=w2_blockscale, - w2_alphas=w2_gs, topk_weights=topk_weights, topk_ids=topk_ids, m=m, n=n, k=k, e=num_experts, - device=device, + quant_config=quant_config, ) def run_cutlass_from_graph( @@ -211,26 +220,29 @@ def run_cutlass_from_graph( e: int, device: torch.device, ): + quant_config = nvfp4_moe_quant_config( + a1_gscale=a1_gs, + a2_gscale=a2_gs, + w1_scale=w1_blockscale, + w2_scale=w2_blockscale, + g1_alphas=w1_gs, + g2_alphas=w2_gs, + ) + with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): return cutlass_moe_fp4( a=a, - a1_gscale=a1_gs, w1_fp4=w1_fp4, - w1_blockscale=w1_blockscale, - w1_alphas=w1_alphas, - a2_gscale=a2_gs, w2_fp4=w2_fp4, - w2_blockscale=w2_blockscale, - w2_alphas=w2_alphas, topk_weights=topk_weights, topk_ids=topk_ids, m=m, n=n, k=k, e=num_experts, - device=device, + quant_config=quant_config, ) def run_triton_from_graph( @@ -246,16 +258,18 @@ def run_triton_from_graph( with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_fp8_scale, + ) return fused_experts( a, w1, w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_fp8_scale, + quant_config=quant_config, ) def replay_graph(graph, num_repeats): diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py new file mode 100644 index 000000000000..b419b2fa0e3e --- /dev/null +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -0,0 +1,406 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe 
+kernel. Both kernels take in fp8 quantized weights and 16-bit activations, +but use different quantization strategies and backends. +""" + +import nvtx +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 +from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + +# Weight shapes for different models: [num_experts, topk, hidden_size, +# intermediate_size] +WEIGHT_SHAPES_MOE = { + "mixtral-8x7b": [ + [8, 2, 4096, 14336], + ], + "deepseek-v2": [ + [160, 6, 5120, 12288], + ], + "custom-small": [ + [8, 2, 2048, 7168], + ], + "glm45-fp8": [ + [128, 8, 4096, 1408], + ], + "Llama-4-Maverick-17B-128E-Instruct-FP8": [ + [128, 1, 5120, 8192], + ], +} + +DEFAULT_MODELS = [ + "mixtral-8x7b", +] + +DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] +DEFAULT_TP_SIZES = [1] + +PER_ACT_TOKEN_OPTS = [False, True] +PER_OUT_CH_OPTS = [False, True] + +FP8_DTYPE = current_platform.fp8_dtype() + + +def bench_run( + results: list, + model: str, + num_experts: int, + topk: int, + per_act_token: bool, + per_out_ch: bool, + mkn: tuple[int, int, int], +): + (m, k, n) = mkn + + dtype = torch.half + device = "cuda" + + # Create input activations + a = torch.randn((m, k), device=device, dtype=dtype) / 10 + + # Create weights + w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10 + w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10 + + # Create FP8 quantized weights and scales for both kernels + w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE) + w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE) + + # Create scales based on quantization strategy + if per_out_ch: + # Per-channel quantization + w1_scale = torch.empty( + (num_experts, 2 * n, 1), device=device, dtype=torch.float32 + ) + w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32) + else: + # Per-tensor quantization + w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32) + + # Quantize weights + for expert in range(num_experts): + if per_out_ch: + # Per-channel quantization - not yet implemented properly + # For now, fall back to per-tensor quantization + w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert]) + w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert]) + # Expand scalar scales to the expected per-channel shape + w1_scale[expert] = w1_scale_temp.expand(2 * n, 1) + w2_scale[expert] = w2_scale_temp.expand(k, 1) + else: + # Per-tensor quantization + w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert]) + w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert]) + # Store scalar scales in [1, 1] tensors + w1_scale[expert, 0, 0] = w1_scale_temp + w2_scale[expert, 0, 0] = w2_scale_temp + + # Prepare weights for CUTLASS (no transpose needed) + w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K] + w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N] + + # Create router scores and get topk + score = torch.randn((m, num_experts), device=device, dtype=dtype) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) + + # WORKAROUND: CUTLASS MoE FP8 has issues with per-token 
quantization + # Force per-tensor quantization for all cases to match working e2e setup + a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32) + a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32) + + # Force per-tensor quantization for all cases + per_act_token = False + + # Create stride tensors for CUTLASS + ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device) + ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device) + c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device) + c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device) + + def run_triton_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor, + a2_scale: torch.Tensor, + num_repeats: int, + ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + ) + + for _ in range(num_repeats): + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + quant_config=quant_config, + ) + + def run_cutlass_moe_fp8( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor, + a2_scale: torch.Tensor, + num_repeats: int, + ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + ) + + for _ in range(num_repeats): + with nvtx.annotate("cutlass_moe_fp8", color="blue"): + cutlass_moe_fp8( + a=a, + w1_q=w1, + w2_q=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, + quant_config=quant_config, + activation="silu", + global_num_experts=num_experts, + ) + + # Pre-create quantization config to avoid creating it inside CUDA graph + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + per_act_token_quant=per_act_token, + per_out_ch_quant=per_out_ch, + ) + + # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly) + cutlass_stream = torch.cuda.Stream() + cutlass_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cutlass_graph, stream=cutlass_stream): + # Capture 10 invocations like benchmark_moe.py + for _ in range(10): + cutlass_moe_fp8( + a=a, + w1_q=w1_fp8q_cutlass, + w2_q=w2_fp8q_cutlass, + topk_weights=topk_weights, + topk_ids=topk_ids, + ab_strides1=ab_strides1, + ab_strides2=ab_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, + quant_config=quant_config, + activation="silu", + global_num_experts=num_experts, + ) + torch.cuda.synchronize() + + # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly) + triton_stream = torch.cuda.Stream() + triton_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(triton_graph, stream=triton_stream): + # Capture 10 invocations like benchmark_moe.py + for _ in range(10): + fused_experts( + a, + w1_fp8q, + w2_fp8q, + topk_weights, + topk_ids, + quant_config=quant_config, + ) + torch.cuda.synchronize() 
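+
+    # Timing approach: each captured graph above contains 10 MoE invocations,
+    # so a single graph.replay() amortizes launch overhead across 10 calls.
+    # The helper below times replays with CUDA events (GPU-side timing) and
+    # divides by 10 to report the average latency per call.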
+ + def bench_cuda_graph(graph, num_warmup=5, num_iters=100): + """Benchmark CUDA graph using events like benchmark_moe.py""" + # Warmup + for _ in range(num_warmup): + graph.replay() + torch.cuda.synchronize() + + # Timing + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies = [] + for _ in range(num_iters): + torch.cuda.synchronize() + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + + # Divide by 10 since graph contains 10 calls + return sum(latencies) / (num_iters * 10) + + # Benchmark parameters + num_warmup = 5 + num_iters = 100 + + # Benchmark only CUDA graphs (more reliable and faster) + # Benchmark Triton MoE with CUDA graphs + triton_graph_time = bench_cuda_graph( + triton_graph, num_warmup=num_warmup, num_iters=num_iters + ) + + # Benchmark CUTLASS MoE with CUDA graphs + cutlass_graph_time = bench_cuda_graph( + cutlass_graph, num_warmup=num_warmup, num_iters=num_iters + ) + + # Convert ms to us and return results + triton_time_us = triton_graph_time * 1000 + cutlass_time_us = cutlass_graph_time * 1000 + + return { + "batch_size": m, + "triton_time_us": triton_time_us, + "cutlass_time_us": cutlass_time_us, + } + + +def main(args): + print("Benchmarking models:") + for i, model in enumerate(args.models): + print(f"[{i}] {model}") + + all_results = [] + + for model in args.models: + for tp in args.tp_sizes: + for layer in WEIGHT_SHAPES_MOE[model]: + num_experts = layer[0] + topk = layer[1] + size_k = layer[2] + size_n = layer[3] // tp + + if len(args.limit_k) > 0 and size_k not in args.limit_k: + continue + + if len(args.limit_n) > 0 and size_n not in args.limit_n: + continue + + for per_act_token in args.per_act_token_opts: + for per_out_ch in args.per_out_ch_opts: + print( + f"\n=== {model}, experts={num_experts}, topk={topk}," + f"per_act={per_act_token}, per_out_ch={per_out_ch} ===" + ) + + config_results = [] + for size_m in args.batch_sizes: + mkn = (size_m, size_k, size_n) + result = bench_run( + [], # Not used anymore + model, + num_experts, + topk, + per_act_token, + per_out_ch, + mkn, + ) + if result: + config_results.append(result) + + # Print results table for this configuration + if config_results: + print( + f"\n{'Batch Size':<12}" + f"{'Triton (us)':<15}" + f"{'CUTLASS (us)':<15}" + ) + print("-" * 45) + for result in config_results: + print( + f"{result['batch_size']:<12}" + f"{result['triton_time_us']:<15.2f}" + f"{result['cutlass_time_us']:<15.2f}" + ) + + all_results.extend(config_results) + + print(f"\nTotal benchmarks completed: {len(all_results)}") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE + across specified models/shapes/batches + + Example usage: + python benchmark_cutlass_moe_fp8.py \ + --model "Llama-4-Maverick-17B-128E-Instruct-FP8" \ + --tp-sizes 8 \ + --batch-size 2 4 8 \ + --per-act-token-opts false \ + --per-out-ch-opts false + + """ + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES_MOE.keys(), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES) + parser.add_argument( + "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES + ) + parser.add_argument("--limit-k", nargs="+", type=int, default=[]) + parser.add_argument("--limit-n", nargs="+", type=int, default=[]) + parser.add_argument( + 
"--per-act-token-opts", + nargs="+", + type=lambda x: x.lower() == "true", + default=[False, True], + help="Per-activation token quantization options (true/false)", + ) + parser.add_argument( + "--per-out-ch-opts", + nargs="+", + type=lambda x: x.lower() == "true", + default=[False, True], + help="Per-output channel quantization options (true/false)", + ) + + args = parser.parse_args() + main(args) diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py new file mode 100644 index 000000000000..df06a940e6d4 --- /dev/null +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -0,0 +1,508 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Benchmark script for device communicators: +CustomAllreduce (oneshot, twoshot), PyNcclCommunicator, +and SymmMemCommunicator (multimem, two-shot). + +for NCCL symmetric memory you need to set the environment variables +NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does +not use fast NVLS implementation for all reduce. + +Usage: + torchrun --nproc_per_node= benchmark_device_communicators.py [options] + +Example: + torchrun --nproc_per_node=2 benchmark_device_communicators.py + --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100 +""" + +import json +import os +import time +from collections.abc import Callable +from contextlib import nullcontext + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce +from vllm.distributed.device_communicators.pynccl import ( + PyNcclCommunicator, + register_nccl_symmetric_ops, +) +from vllm.distributed.device_communicators.pynccl_allocator import ( + set_graph_pool_id, +) +from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator +from vllm.logger import init_logger +from vllm.utils import FlexibleArgumentParser + +logger = init_logger(__name__) + +# Default sequence lengths to benchmark +DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192] + +# Fixed hidden size and dtype for all benchmarks +HIDDEN_SIZE = 8192 +BENCHMARK_DTYPE = torch.bfloat16 + +# CUDA graph settings +CUDA_GRAPH_CAPTURE_CYCLES = 10 + + +class CommunicatorBenchmark: + """Benchmark class for testing device communicators.""" + + def __init__( + self, + rank: int, + world_size: int, + device: torch.device, + cpu_group: ProcessGroup, + sequence_lengths: list[int], + ): + self.rank = rank + self.world_size = world_size + self.device = device + self.cpu_group = cpu_group + + # Calculate max_size_override based on largest sequence length + max_seq_len = max(sequence_lengths) + max_tensor_elements = max_seq_len * HIDDEN_SIZE + self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1 + + # Initialize communicators + self.custom_allreduce = None + self.pynccl_comm = None + self.symm_mem_comm = None + self.symm_mem_comm_multimem = None + self.symm_mem_comm_two_shot = None + + self._init_communicators() + + def _init_communicators(self): + """Initialize all available communicators.""" + try: + self.custom_allreduce = CustomAllreduce( + group=self.cpu_group, + device=self.device, + max_size=self.max_size_override, + ) + if not self.custom_allreduce.disabled: + logger.info("Rank %s: CustomAllreduce initialized", self.rank) + else: + logger.info("Rank %s: CustomAllreduce disabled", self.rank) + 
except Exception as e: + logger.warning( + "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e + ) + self.custom_allreduce = None + + try: + self.pynccl_comm = PyNcclCommunicator( + group=self.cpu_group, device=self.device + ) + if not self.pynccl_comm.disabled: + logger.info("Rank %s: PyNcclCommunicator initialized", self.rank) + register_nccl_symmetric_ops(self.pynccl_comm) + else: + logger.info("Rank %s: PyNcclCommunicator disabled", self.rank) + self.pynccl_comm = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e + ) + self.pynccl_comm = None + + # Initialize variants for SymmMemCommunicator + try: + self.symm_mem_comm_multimem = SymmMemCommunicator( + group=self.cpu_group, + device=self.device, + force_multimem=True, + max_size_override=self.max_size_override, + ) + if not self.symm_mem_comm_multimem.disabled: + logger.info( + "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank + ) + else: + self.symm_mem_comm_multimem = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s", + self.rank, + e, + ) + self.symm_mem_comm_multimem = None + + try: + self.symm_mem_comm_two_shot = SymmMemCommunicator( + group=self.cpu_group, + device=self.device, + force_multimem=False, + max_size_override=self.max_size_override, + ) + if not self.symm_mem_comm_two_shot.disabled: + logger.info( + "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank + ) + else: + self.symm_mem_comm_two_shot = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s", + self.rank, + e, + ) + self.symm_mem_comm_two_shot = None + + def benchmark_allreduce( + self, sequence_length: int, num_warmup: int, num_trials: int + ) -> dict[str, float]: + """Benchmark allreduce operations for all available communicators.""" + + results = {} + + # Define communicators with their benchmark functions + communicators = [] + + if self.custom_allreduce is not None: + comm = self.custom_allreduce + # CustomAllreduce one-shot + communicators.append( + ( + "ca_1stage", + lambda t, c=comm: c.custom_all_reduce(t), + lambda t, c=comm: c.should_custom_ar(t), + comm.capture(), + "1stage", # env variable value + ) + ) + # CustomAllreduce two-shot + communicators.append( + ( + "ca_2stage", + lambda t, c=comm: c.custom_all_reduce(t), + lambda t, c=comm: c.should_custom_ar(t), + comm.capture(), + "2stage", # env variable value + ) + ) + + if self.pynccl_comm is not None: + comm = self.pynccl_comm + communicators.append( + ( + "pynccl", + lambda t, c=comm: c.all_reduce(t), + lambda t: True, # Always available if initialized + nullcontext(), + None, # no env variable needed + ) + ) + communicators.append( + ( + "pynccl-symm", + lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t), + lambda t: True, # Always available if initialized + nullcontext(), + None, # no env variable needed + ) + ) + + if self.symm_mem_comm_multimem is not None: + comm = self.symm_mem_comm_multimem + communicators.append( + ( + "symm_mem_multimem", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_symm_mem(t), + nullcontext(), + None, # no env variable needed + ) + ) + + if self.symm_mem_comm_two_shot is not None: + comm = self.symm_mem_comm_two_shot + communicators.append( + ( + "symm_mem_two_shot", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_symm_mem(t), + nullcontext(), + None, # no env variable needed + ) 
+ ) + + # Benchmark each communicator + for name, allreduce_fn, should_use_fn, context, env_var in communicators: + # Set environment variable if needed + if env_var is not None: + os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var + else: + # Clear the environment variable to avoid interference + os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None) + + latency = self.benchmark_allreduce_single( + sequence_length, + allreduce_fn, + should_use_fn, + context, + num_warmup, + num_trials, + ) + if latency is not None: + results[name] = latency + + return results + + def benchmark_allreduce_single( + self, + sequence_length: int, + allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None], + should_use_fn: Callable[[torch.Tensor], bool], + context, + num_warmup: int, + num_trials: int, + ) -> float | None: + """Benchmark method with CUDA graph optimization.""" + try: + # Create test tensor (2D: sequence_length x hidden_size) + tensor = torch.randn( + sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device + ) + if not should_use_fn(tensor): + return None + + torch.cuda.synchronize() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + graph_input = tensor.clone() + + # Warmup before capture + for _ in range(3): + allreduce_fn(graph_input) + + # Capture the graph using context manager + with context: + graph = torch.cuda.CUDAGraph() + graph_pool = torch.cuda.graph_pool_handle() + set_graph_pool_id(graph_pool) + with torch.cuda.graph(graph, pool=graph_pool): + for _ in range(CUDA_GRAPH_CAPTURE_CYCLES): + allreduce_fn(graph_input) + + torch.cuda.synchronize() + for _ in range(num_warmup): + graph.replay() + torch.cuda.synchronize() + + torch.cuda.synchronize() + start_time = time.perf_counter() + + for _ in range(num_trials): + graph.replay() + torch.cuda.synchronize() + + end_time = time.perf_counter() + + # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES + return ( + (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000 + ) + + except Exception as e: + logger.error("CUDA graph benchmark failed: %s", e) + raise RuntimeError( + f"CUDA graph benchmark failed for communicator: {e}" + ) from e + + +def _calculate_speedup_info(comm_results: dict[str, float]) -> str: + """Calculate speedup information for a single tensor size.""" + if not comm_results: + return "N/A" + + # Find the fastest communicator + fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k]) + fastest_time = comm_results[fastest_comm] + + # Calculate speedup vs PyNccl if available + if "pynccl" in comm_results: + pynccl_time = comm_results["pynccl"] + speedup = pynccl_time / fastest_time + return f"{fastest_comm} ({speedup:.2f}x)" + else: + return f"{fastest_comm} (N/A)" + + +def print_results( + results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int +): + """Print benchmark results in a formatted table.""" + + print(f"\n{'=' * 130}") + print("Device Communicator Benchmark Results") + print( + f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, " + f"Hidden Size: {HIDDEN_SIZE}" + ) + print(f"{'=' * 130}") + + # Get all communicator names + all_comms = set() + for size_results in results.values(): + all_comms.update(size_results.keys()) + + all_comms = sorted(list(all_comms)) + + # Print header + header = f"{'Tensor Shape':<20}{'Tensor Size':<15}" + for comm in all_comms: + header += f"{comm:<20}" + header += f"{'Best (Speedup vs PyNccl)':<30}" + print(header) + print("-" * len(header)) + + # Print results for each sequence length + 
for seq_len in sequence_lengths: + if seq_len in results: + # Calculate tensor size in elements and bytes + tensor_elements = seq_len * HIDDEN_SIZE + tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize + + # Format tensor size (MB) + tensor_size_mb = tensor_bytes / (1024 * 1024) + tensor_size_str = f"{tensor_size_mb:.2f} MB" + + # Format tensor shape + tensor_shape = f"({seq_len}, {HIDDEN_SIZE})" + + row = f"{tensor_shape:<20}{tensor_size_str:<15}" + for comm in all_comms: + if comm in results[seq_len]: + row += f"{results[seq_len][comm]:<20.3f}" + else: + row += f"{'N/A':<20}" + + # Calculate speedup information + speedup_info = _calculate_speedup_info(results[seq_len]) + row += f"{speedup_info:<30}" + + print(row) + + print(f"{'=' * 130}") + print("All times are in milliseconds (ms) per allreduce operation") + print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)") + + +def main(): + parser = FlexibleArgumentParser(description="Benchmark device communicators") + + parser.add_argument( + "--sequence-lengths", + type=int, + nargs="+", + default=DEFAULT_SEQUENCE_LENGTHS, + help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)", + ) + + parser.add_argument( + "--num-warmup", type=int, default=5, help="Number of warmup iterations" + ) + + parser.add_argument( + "--num-trials", type=int, default=50, help="Number of benchmark trials" + ) + + parser.add_argument("--output-json", type=str, help="Output results to JSON file") + + args = parser.parse_args() + + # Initialize distributed + if not dist.is_initialized(): + dist.init_process_group(backend="gloo") + rank = dist.get_rank() + world_size = dist.get_world_size() + + # Set device + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + + # Get CPU process group + cpu_group = dist.new_group(backend="gloo") + + # Disable USE_SYMM_MEM to avoid affecting the max_sizes + # in symm_mem and custom_all_reduce for benchmark + os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0" + + # Initialize benchmark + benchmark = CommunicatorBenchmark( + rank, world_size, device, cpu_group, args.sequence_lengths + ) + + # Run benchmarks + all_results = {} + + for seq_len in args.sequence_lengths: + if rank == 0: + logger.info( + "Benchmarking sequence length: %s (tensor shape: %s x %s)", + seq_len, + seq_len, + HIDDEN_SIZE, + ) + + results = benchmark.benchmark_allreduce( + sequence_length=seq_len, + num_warmup=args.num_warmup, + num_trials=args.num_trials, + ) + + all_results[seq_len] = results + + # Synchronize between ranks + dist.barrier() + + # Print results (only rank 0) + if rank == 0: + print_results(all_results, args.sequence_lengths, world_size) + + # Save to JSON if requested + if args.output_json: + # Add speedup information to results + enhanced_results = {} + for seq_len, comm_results in all_results.items(): + enhanced_results[seq_len] = { + "timings": comm_results, + "speedup_info": _calculate_speedup_info(comm_results), + } + + output_data = { + "world_size": world_size, + "dtype": str(BENCHMARK_DTYPE), + "hidden_size": HIDDEN_SIZE, + "sequence_lengths": args.sequence_lengths, + "num_warmup": args.num_warmup, + "num_trials": args.num_trials, + "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES, + "results": enhanced_results, + } + + with open(args.output_json, "w") as f: + json.dump(output_data, f, indent=2) + + logger.info("Results saved to %s", args.output_json) + + # Cleanup + if cpu_group != dist.group.WORLD: + dist.destroy_process_group(cpu_group) + + +if __name__ == "__main__": + main() 
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 1d4e730f99ae..14330ae6f03c 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -7,6 +7,7 @@ from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts, @@ -80,6 +81,11 @@ def bench_run( a, score, topk, renormalize=False ) + ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) + c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) + c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) + def run_triton_moe( a: torch.Tensor, w1: torch.Tensor, @@ -91,6 +97,11 @@ def run_triton_moe( a_scale: torch.Tensor, num_repeats: int, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) for _ in range(num_repeats): fused_experts( a, @@ -98,10 +109,7 @@ def run_triton_moe( w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_scale, + quant_config=quant_config, ) def run_cutlass_moe( @@ -111,11 +119,21 @@ def run_cutlass_moe( w2: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, per_act_token: bool, num_repeats: int, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + per_act_token_quant=per_act_token, + ) + for _ in range(num_repeats): cutlass_moe_fp8( a, @@ -123,10 +141,11 @@ def run_cutlass_moe( w2, topk_weights, topk_ids, - w1_scale, - w2_scale, - per_act_token, - a1_scale=None, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, + quant_config=quant_config, ) def run_cutlass_from_graph( @@ -136,9 +155,19 @@ def run_cutlass_from_graph( w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + ab_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + per_act_token_quant=per_act_token, + ) + with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): @@ -148,10 +177,11 @@ def run_cutlass_from_graph( w2_q, topk_weights, topk_ids, - w1_scale, - w2_scale, - per_act_token, - a1_scale=None, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, + quant_config=quant_config, ) def run_triton_from_graph( @@ -164,6 +194,11 @@ def run_triton_from_graph( w2_scale: torch.Tensor, a_scale: torch.Tensor, ): + quant_config = fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a_scale, + ) with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): @@ -173,10 +208,7 @@ def run_triton_from_graph( w2, topk_weights, topk_ids, - use_fp8_w8a8=True, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a_scale, + quant_config=quant_config, ) def 
replay_graph(graph, num_repeats): @@ -194,6 +226,10 @@ def replay_graph(graph, num_repeats): w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, ) @@ -231,6 +267,10 @@ def replay_graph(graph, num_repeats): "w1_scale": w1_scale, "w2_scale": w2_scale, "per_act_token": per_act_token, + "ab_strides1": ab_strides1, + "ab_strides2": ab_strides2, + "c_strides1": c_strides1, + "c_strides2": c_strides2, # cuda graph params "cutlass_graph": cutlass_graph, "triton_graph": triton_graph, @@ -289,6 +329,10 @@ def replay_graph(graph, num_repeats): w2_q, w1_scale, w2_scale, + ab_strides1, + ab_strides2, + c_strides1, + c_strides2, topk_weights, topk_ids, per_act_token, @@ -297,7 +341,7 @@ def replay_graph(graph, num_repeats): results.append( benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 69978ec6b23e..bcfa64c3f425 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -7,7 +7,8 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.platforms import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE @torch.inference_mode() diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 3d38d4b3534e..39338f338761 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -6,11 +6,12 @@ import json import pickle import time +from collections.abc import Callable from dataclasses import dataclass from enum import Enum, auto from itertools import product from pathlib import Path -from typing import Any, Callable, Optional +from typing import Any import torch import torch.utils.benchmark as TBenchmark @@ -79,9 +80,9 @@ def make_rand_lora_weight_tensor( def make_rand_tensors( - a_shape: tuple[int], - b_shape: tuple[int], - c_shape: tuple[int], + a_shape: tuple[int, ...], + b_shape: tuple[int, ...], + c_shape: tuple[int, ...], a_dtype: torch.dtype, b_dtype: torch.dtype, c_dtype: torch.dtype, @@ -158,7 +159,7 @@ def ref_group_gemm( seq_lens_cpu: torch.Tensor, prompt_lora_mapping_cpu: torch.Tensor, scaling: float, - add_inputs: Optional[bool], + add_inputs: bool | None, ): """ Torch group gemm reference implementation to test correctness of @@ -243,7 +244,7 @@ def matmul_shapes( lora_rank: int, num_loras: int, num_slices: int, - ) -> tuple[tuple[int], tuple[int], tuple[int]]: + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: """ Given num_slices, return the shapes of the A, B, and C matrices in A x B = C, for the op_type @@ -316,8 +317,8 @@ class BenchmarkContext: lora_rank: int sort_by_lora_id: bool dtype: torch.dtype - seq_length: Optional[int] = None - num_slices: Optional[int] = None # num_slices for slice based ops + seq_length: int | None = None + num_slices: int | None = None # num_slices for slice based ops def with_seq_length(self, seq_length: int) -> "BenchmarkContext": ctx = copy.copy(self) @@ -464,7 +465,11 @@ def 
to_device(tensor: torch.Tensor): for field_name in LoRAKernelMeta.__dataclass_fields__: field = getattr(self.lora_kernel_meta, field_name) assert isinstance(field, torch.Tensor) - setattr(self.lora_kernel_meta, field_name, to_device(field)) + setattr( + self.lora_kernel_meta, + field_name, + to_device(field) if field_name != "no_lora_flag_cpu" else field, + ) def metadata(self) -> tuple[int, int, int]: """ @@ -512,6 +517,7 @@ def as_lora_shrink_kwargs(self) -> dict[str, Any]: "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc, "lora_ids": self.lora_kernel_meta.active_lora_ids, "scaling": 1.0, + "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, } def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: @@ -552,10 +558,11 @@ def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: "lora_ids": self.lora_kernel_meta.active_lora_ids, "offset_start": 0, "add_inputs": add_inputs, + "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, } def bench_fn_kwargs( - self, op_type: OpType, add_inputs: Optional[bool] = None + self, op_type: OpType, add_inputs: bool | None = None ) -> dict[str, Any]: if op_type.is_shrink_fn(): assert add_inputs is None @@ -569,7 +576,7 @@ def bench_fn_kwargs( raise ValueError(f"Unrecognized optype {self}") def test_correctness( - self, op_type: OpType, expand_fn_add_inputs: Optional[bool] + self, op_type: OpType, expand_fn_add_inputs: bool | None ) -> bool: """ Test correctness of op_type implementation against a grouped gemm @@ -605,8 +612,8 @@ def bench_optype( ctx: BenchmarkContext, arg_pool_size: int, op_type: OpType, - cuda_graph_nops: Optional[int] = None, - expand_fn_add_inputs: Optional[bool] = None, + cuda_graph_nops: int | None = None, + expand_fn_add_inputs: bool | None = None, test_correctness: bool = False, ) -> TMeasurement: assert arg_pool_size >= 1 @@ -637,7 +644,7 @@ def bench_optype( # Clear LoRA optimization hash-maps. _LORA_A_PTR_DICT.clear() _LORA_B_PTR_DICT.clear() - # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup + # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up for kwargs in kwargs_list: op_type.bench_fn()(**kwargs) torch.cuda.synchronize() @@ -673,7 +680,7 @@ def bench_torch_mm( ctx: BenchmarkContext, arg_pool_size: int, op_type: OpType, - cuda_graph_nops: Optional[int] = None, + cuda_graph_nops: int | None = None, ) -> TMeasurement: """ Benchmark basic torch.mm as a roofline. 
@@ -738,7 +745,7 @@ def use_cuda_graph_recommendation() -> str: """ -def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): +def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None): compare = TBenchmark.Compare(timers) compare.print() diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index f73d0511e01f..e1d5239f5cc9 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -8,10 +8,9 @@ import os import pickle as pkl import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Optional import pandas as pd import torch @@ -63,23 +62,23 @@ class BenchmarkTensors: a: torch.Tensor w_q: torch.Tensor - group_size: Optional[int] + group_size: int | None wtype: ScalarType w_g_s: torch.Tensor - w_g_zp: Optional[torch.Tensor] - w_ch_s: Optional[torch.Tensor] - w_tok_s: Optional[torch.Tensor] + w_g_zp: torch.Tensor | None + w_ch_s: torch.Tensor | None + w_tok_s: torch.Tensor | None @dataclass class TypeConfig: act_type: torch.dtype weight_type: ScalarType - output_type: Optional[torch.dtype] - group_scale_type: Optional[torch.dtype] - group_zero_type: Optional[torch.dtype] - channel_scale_type: Optional[torch.dtype] - token_scale_type: Optional[torch.dtype] + output_type: torch.dtype | None + group_scale_type: torch.dtype | None + group_zero_type: torch.dtype | None + channel_scale_type: torch.dtype | None + token_scale_type: torch.dtype | None def rand_data(shape, dtype=torch.float16, scale=1): @@ -93,8 +92,8 @@ def quantize_and_pack( atype: torch.dtype, w: torch.Tensor, wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], + stype: torch.dtype | None, + group_size: int | None, zero_points: bool = False, ): assert wtype.is_integer(), "TODO: support floating point weights" @@ -113,7 +112,7 @@ def quantize_and_pack( def create_bench_tensors( - shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] + shape: tuple[int, int, int], types: TypeConfig, group_size: int | None ) -> list[BenchmarkTensors]: m, n, k = shape @@ -236,6 +235,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: a=bt.a, c=None, b_q_weight=w_q, + b_bias=None, b_scales=w_s, global_scale=None, b_zeros=w_zp, @@ -252,28 +252,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: else: assert bt.a.dtype == torch.int8 assert bt.wtype == scalar_types.uint4b8 - - if bt.w_ch_s is not None: - s_ch = bt.w_ch_s.to(torch.float32) - else: - s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device) - - if bt.w_tok_s is not None: - s_tok = bt.w_tok_s.to(torch.float32) - else: - s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device) - - fn = lambda: ops.marlin_qqq_gemm( - a=bt.a, - b_q_weight=w_q, - s_group=w_s, - s_tok=s_tok, - s_ch=s_ch, - workspace=workspace.scratch, - size_m=bt.a.shape[0], - size_n=bt.w_ref.shape[1], - size_k=bt.w_ref.shape[0], - ) + raise NotImplementedError("QQQ is not supported anymore") return fn @@ -304,6 +283,25 @@ def machete_create_bench_fn( ) +def cutlass_w4a8_create_bench_fn( + bt: BenchmarkTensors, out_type=torch.dtype, schedule=None +) -> Callable: + w_q = bt.w_q.t().contiguous().t() # make col major + w_q = ops.cutlass_encode_and_reorder_int4b(w_q) + # expects fp8 scales + w_s = ops.cutlass_pack_scale_fp8(bt.w_g_s.to(torch.float8_e4m3fn)) + 
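+    # Return a closure that runs the CUTLASS W4A8 GEMM on the pre-encoded int4
+    # weights and fp8-packed group scales prepared above.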
+ return lambda: ops.cutlass_w4a8_mm( + a=bt.a, + b_q=w_q, + b_group_scales=w_s, + b_group_size=bt.group_size, + b_channel_scales=bt.w_ch_s, + a_token_scales=bt.w_tok_s, + maybe_schedule=schedule, + ) + + # impl # bench @@ -332,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable]) return res -_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None -_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None +_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None +_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None def bench( @@ -405,6 +403,20 @@ def bench( ) ) + # cutlass w4a8 + if types.act_type == torch.float8_e4m3fn and group_size == 128: + timers.append( + bench_fns( + label, + sub_label, + f"cutlass w4a8 ({name_type_string})", + [ + cutlass_w4a8_create_bench_fn(bt, out_type=types.output_type) + for bt in benchmark_tensors + ], + ) + ) + if sweep_schedules: global _SWEEP_SCHEDULES_RESULTS diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c350aaf5d3ad..9298d3b58dfb 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -3,6 +3,7 @@ import argparse import json +import os import time from contextlib import nullcontext from datetime import datetime @@ -13,6 +14,10 @@ import torch from ray.experimental.tqdm_ray import tqdm +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + _get_config_dtype_str, +) from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config @@ -22,6 +27,13 @@ FP8_DTYPE = current_platform.fp8_dtype() +def ensure_divisibility(numerator, denominator, text): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, "{} {} is not divisible by tp {}.".format( + text, numerator, denominator + ) + + class BenchmarkConfig(TypedDict): BLOCK_SIZE_M: int BLOCK_SIZE_N: int @@ -126,43 +138,36 @@ def prepare(i: int): def run(): from vllm.model_executor.layers.fused_moe import override_config + if use_fp8_w8a8: + quant_dtype = torch.float8_e4m3fn + elif use_int8_w8a16: + quant_dtype = torch.int8 + else: + quant_dtype = None + + quant_config = FusedMoEQuantConfig.make( + quant_dtype=quant_dtype, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_quant_shape, + ) + with override_config(config): - if use_deep_gemm: - topk_weights, topk_ids, token_expert_indices = fused_topk( - x, input_gating, topk, False - ) - return fused_experts( - x, - w1, - w2, - topk_weights, - topk_ids, - inplace=True, - use_fp8_w8a8=use_fp8_w8a8, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_quant_shape, - allow_deep_gemm=True, - ) - else: - fused_moe( - x, - w1, - w2, - input_gating, - topk, - renormalize=True, - inplace=True, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - block_shape=block_quant_shape, - ) + topk_weights, topk_ids, token_expert_indices = fused_topk( + x, input_gating, topk, renormalize=not use_deep_gemm + ) + return fused_experts( + x, + w1, + w2, + topk_weights, + topk_ids, + inplace=True, + quant_config=quant_config, + allow_deep_gemm=use_deep_gemm, + ) # JIT compilation & warmup run() @@ -406,13 +411,15 @@ def benchmark( use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: 
current_platform.seed_everything(self.seed) - dtype_str = get_config_dtype_str( + dtype_str = _get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. + block_n = block_quant_shape[0] if block_quant_shape else None + block_k = block_quant_shape[1] if block_quant_shape else None op_config = get_moe_configs( - num_experts, shard_intermediate_size // 2, dtype_str + num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k ) if op_config is None: config = get_default_config( @@ -422,7 +429,7 @@ def benchmark( hidden_size, topk, dtype_str, - is_marlin=False, + block_quant_shape, ) else: config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))] @@ -535,8 +542,9 @@ def save_configs( use_fp8_w8a8: bool, use_int8_w8a16: bool, block_quant_shape: list[int], + save_dir: str, ) -> None: - dtype_str = get_config_dtype_str( + dtype_str = _get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 ) @@ -545,10 +553,11 @@ def save_configs( filename = get_config_file_name( num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape ) - + os.makedirs(save_dir, exist_ok=True) + filename = os.path.join(save_dir, filename) print(f"Writing best config to {filename}...") with open(filename, "w") as f: - json.dump(configs, f, indent=4) + json.dump({"triton_version": triton.__version__, **configs}, f, indent=4) f.write("\n") @@ -570,31 +579,42 @@ def main(args: argparse.Namespace): E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k intermediate_size = config.ffn_config.ffn_hidden_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size + hidden_size = config.hidden_size elif config.architectures[0] == "JambaForCausalLM": E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size + hidden_size = config.hidden_size elif config.architectures[0] in ( - "DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM", + "DeepseekV3ForCausalLM", + "DeepseekV32ForCausalLM", "Glm4MoeForCausalLM", ): E = config.n_routed_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size - elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"): + hidden_size = config.hidden_size + elif config.architectures[0] in ( + "Qwen2MoeForCausalLM", + "Qwen3MoeForCausalLM", + "Qwen3NextForCausalLM", + ): E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size - shard_intermediate_size = 2 * intermediate_size // args.tp_size + hidden_size = config.hidden_size + elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration": + text_config = config.get_text_config() + E = text_config.num_experts + topk = text_config.num_experts_per_tok + intermediate_size = text_config.moe_intermediate_size + hidden_size = text_config.hidden_size elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"): E = config.num_experts topk = config.moe_topk[0] intermediate_size = config.moe_intermediate_size[0] - shard_intermediate_size = 2 * intermediate_size // args.tp_size + hidden_size = config.hidden_size else: # Support for llama4 config = config.get_text_config() @@ -602,10 +622,16 @@ def main(args: argparse.Namespace): E = 
config.num_local_experts topk = config.num_experts_per_tok intermediate_size = config.intermediate_size + hidden_size = config.hidden_size + enable_ep = bool(args.enable_expert_parallel) + if enable_ep: + ensure_divisibility(E, args.tp_size, "Number of experts") + E = E // args.tp_size + shard_intermediate_size = 2 * intermediate_size + else: + ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") shard_intermediate_size = 2 * intermediate_size // args.tp_size - - hidden_size = config.hidden_size - dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" block_quant_shape = get_weight_block_size_safety(config) @@ -665,7 +691,11 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) search_space = get_configs_compute_bound(is_fp16, block_quant_shape) print(f"Start tuning over {len(search_space)} configurations...") - + if use_deep_gemm: + raise ValueError( + "Tuning with --use-deep-gemm is not supported as it only tunes Triton " + "kernels. Please remove the flag." + ) start = time.time() configs = _distribute( "tune", @@ -699,6 +729,7 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: use_fp8_w8a8, use_int8_w8a16, block_quant_shape, + args.save_dir, ) end = time.time() print(f"Tuning took {end - start:.2f} seconds") @@ -735,10 +766,14 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]: parser.add_argument( "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2 ) + parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true") parser.add_argument( "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" ) parser.add_argument("--use-deep-gemm", action="store_true") + parser.add_argument( + "--save-dir", type=str, default="./", help="Directory to save tuned results" + ) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--batch-size", type=int, nargs="+", required=False) parser.add_argument("--tune", action="store_true") diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py index 1af5a21caf46..f540cff6261a 100644 --- a/benchmarks/kernels/benchmark_moe_align_block_size.py +++ b/benchmarks/kernels/benchmark_moe_align_block_size.py @@ -5,9 +5,8 @@ import torch -from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( - moe_align_block_size_triton, + moe_align_block_size, ) from vllm.triton_utils import triton @@ -21,60 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor: ) -def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8): - """ - Verifies vllm vs. Triton - """ - topk_ids = get_topk_ids(num_tokens, num_experts, topk) - - # 1. 
malloc space for triton and vllm - # malloc enough space (max_num_tokens_padded) for the sorted ids - max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) - sorted_ids_triton = torch.empty( - (max_num_tokens_padded,), dtype=torch.int32, device="cuda" - ) - expert_ids_triton = torch.empty( - (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda" - ) - num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda") - - sorted_ids_vllm = torch.empty_like(sorted_ids_triton) - expert_ids_vllm = torch.empty_like(expert_ids_triton) - num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton) - - # 2. run implementations - moe_align_block_size_triton( - topk_ids, - num_experts, - block_size, - sorted_ids_triton, - expert_ids_triton, - num_tokens_post_pad_triton, - ) - - ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids_vllm, - expert_ids_vllm, - num_tokens_post_pad_vllm, - ) - print(f"✅ VLLM implementation works with {num_experts} experts!") - - # 3. compare results - if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose( - num_tokens_post_pad_triton, num_tokens_post_pad_vllm - ): - print("✅ Triton and VLLM implementations match.") - else: - print("❌ Triton and VLLM implementations DO NOT match.") - print("Triton expert_ids:", expert_ids_triton) - print("VLLM expert_ids:", expert_ids_vllm) - print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton) - print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm) - - # test configurations num_tokens_range = [1, 16, 256, 4096] num_experts_range = [16, 64, 224, 256, 280, 512] @@ -87,8 +32,8 @@ def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8): x_names=["num_tokens", "num_experts", "topk"], x_vals=configs, line_arg="provider", - line_vals=["vllm", "triton"], # "triton" - line_names=["VLLM", "Triton"], # "Triton" + line_vals=["vllm"], + line_names=["vLLM"], plot_name="moe-align-block-size-performance", args={}, ) @@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider): block_size = 256 topk_ids = get_topk_ids(num_tokens, num_experts, topk) - max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) - sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda") - max_num_m_blocks = max_num_tokens_padded // block_size - expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda") - num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda") - quantiles = [0.5, 0.2, 0.8] if provider == "vllm": ms, min_ms, max_ms = triton.testing.do_bench( - lambda: ops.moe_align_block_size( - topk_ids, - num_experts, - block_size, - sorted_ids.clone(), - expert_ids.clone(), - num_tokens_post_pad.clone(), - ), - quantiles=quantiles, - ) - elif provider == "triton": - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: moe_align_block_size_triton( - topk_ids, - num_experts, - block_size, - sorted_ids.clone(), - expert_ids.clone(), - num_tokens_post_pad.clone(), - ), + lambda: moe_align_block_size(topk_ids, block_size, num_experts), quantiles=quantiles, ) @@ -151,6 +71,4 @@ def benchmark(num_tokens, num_experts, topk, provider): ) args = parser.parse_args() - print("Running correctness check...") - check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk) benchmark.run(print_data=True, show_plots=True) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py 
b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 4ed690090144..459eafa6d907 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -8,12 +8,13 @@ import torch from transformers import AutoConfig -from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( _moe_permute, _moe_unpermute_and_reduce, + moe_permute, + moe_unpermute, ) -from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import * from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize from vllm.platforms import current_platform from vllm.utils import FlexibleArgumentParser @@ -63,18 +64,19 @@ def prepare(i: int): def run(): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( - moe_permute( - qhidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - token_expert_indices=token_expert_indices, - topk=topk, - n_expert=num_experts, - n_local_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) + ( + permuted_hidden_states, + a1q_scale, + first_token_off, + inv_perm_idx, + m_indices, + ) = moe_permute( + qhidden_states, + a1q_scale=None, + topk_ids=topk_ids, + n_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, ) else: ( @@ -150,18 +152,19 @@ def benchmark_unpermute( def prepare(): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = ( - moe_permute( - qhidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - token_expert_indices=token_expert_indices, - topk=topk, - n_expert=num_experts, - n_local_expert=num_experts, - expert_map=None, - align_block_size=align_block_size, - ) + ( + permuted_hidden_states, + a1q_scale, + first_token_off, + inv_perm_idx, + m_indices, + ) = moe_permute( + qhidden_states, + a1q_scale=None, + topk_ids=topk_ids, + n_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, ) # convert to fp16/bf16 as gemm output return ( @@ -191,16 +194,19 @@ def prepare(): def run(input: tuple): if use_customized_permute: - (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input + ( + permuted_hidden_states, + first_token_off, + inv_perm_idx, + m_indices, + ) = input + output = torch.empty_like(hidden_states) moe_unpermute( + output, permuted_hidden_states, topk_weights, - topk_ids, inv_perm_idx, first_token_off, - topk, - num_experts, - num_experts, ) else: ( @@ -211,7 +217,11 @@ def run(input: tuple): inv_perm, ) = input _moe_unpermute_and_reduce( - output_hidden_states, permuted_hidden_states, inv_perm, topk_weights + output_hidden_states, + permuted_hidden_states, + inv_perm, + topk_weights, + True, ) # JIT compilation & warmup @@ -334,7 +344,7 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok hidden_size = config.hidden_size - dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" use_customized_permute = args.use_customized_permute diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py new file mode 100644 index 000000000000..b9147361708f --- /dev/null +++ 
b/benchmarks/kernels/benchmark_mrope.py @@ -0,0 +1,328 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# This script benchmarks the mrope kernel (mainly for Qwen2VL and Qwen2.5VL models). +# It generates test data, runs benchmarks, and saves results to a CSV file. +# +# The CSV file (named with current date/time) contains these columns: +# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position, +# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, +# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, +# speedup +# +# == Usage Examples == +# +# Single model benchmark: +# python3 benchmark_mrope.py --model-name Qwen/Qwen2-VL-7B-Instruct --tp-size 1 \ +# --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models benchmark: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models with different TP sizes: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 2 4 8 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 +# +# All models with different token counts: +# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \ +# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 4096 16384 +import csv +import os +import time +from datetime import datetime +from typing import Any + +import numpy as np +import torch + +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config +from vllm.utils import FlexibleArgumentParser + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def generate_test_data( + num_tokens: int, + num_q_heads: int, + num_kv_heads: int, + head_size: int, + max_position_embeddings: int, + dtype: torch.dtype, + device: torch.device, +): + """Generate test data for given configuration.""" + # Create 2D positions (3, num_tokens) for multimodal case + positions = torch.randint( + 0, max_position_embeddings // 4, (3, num_tokens), device=device + ) + + # Create query and key tensors + query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device) + key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device) + + return positions, query, key + + +def calculate_stats(times: list[float]) -> dict[str, float]: + """Calculate statistics from a list of times.""" + times_array = np.array(times) + return { + "mean": np.mean(times_array), + "median": np.median(times_array), + "p99": np.percentile(times_array, 99), + "min": np.min(times_array), + "max": np.max(times_array), + } + + +def benchmark_mrope( + model_name: str, + num_tokens: int, + head_dim: int, + tp_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 8192, + rope_theta: float = 10000, + is_neox_style: bool = True, + rope_scaling: dict[str, Any] = None, + dtype: torch.dtype = torch.bfloat16, + seed: int = 0, + warmup_iter: int = 10, + benchmark_iter: int = 100, + csv_writer=None, +): + current_platform.seed_everything(seed) + torch.set_default_device(device) + # the parameters to compute the q k v size based on tp_size + mrope_helper_class = get_rope( + head_size=head_dim, + rotary_dim=head_dim, + max_position=max_position, + base=rope_theta, + is_neox_style=is_neox_style, + 
rope_scaling=rope_scaling, + dtype=dtype, + ).to(device=device) + + print(80 * "=") + print( + f"Evaluating model: {model_name} " + f"with tp_size: {tp_size} " + f"and num_tokens: {num_tokens}, " + f"dtype: {dtype}" + ) + + # create q k v input tensors + # create rotary pos emb input tensors + positions, query, key = generate_test_data( + num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device + ) + + # Warm up + for _ in range(warmup_iter): + mrope_helper_class.forward_native( + positions, + query.clone(), + key.clone(), + ) + + mrope_helper_class.forward_cuda( + positions, + query.clone(), + key.clone(), + ) + + torch.cuda.synchronize() + + # Time reference implementation + torch_times = [] + for _ in range(benchmark_iter): + query_clone = query.clone() + key_clone = key.clone() + torch.cuda.synchronize() + start_time = time.time() + + mrope_helper_class.forward_native( + positions, + query_clone, + key_clone, + ) + + torch.cuda.synchronize() + torch_times.append(time.time() - start_time) + + # Time triton kernel implementation + triton_times = [] + for _ in range(benchmark_iter): + query_clone = query.clone() + key_clone = key.clone() + torch.cuda.synchronize() + start_time = time.time() + mrope_helper_class.forward_cuda( + positions, + query_clone, + key_clone, + ) + torch.cuda.synchronize() + triton_times.append(time.time() - start_time) + + # Calculate statistics + torch_stats = calculate_stats(torch_times) + triton_stats = calculate_stats(triton_times) + print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):") + + print( + f"Torch implementation: " + f"mean={torch_stats['mean']:.8f}s, " + f"median={torch_stats['median']:.8f}s, " + f"p99={torch_stats['p99']:.8f}s" + ) + + print( + f"Triton implementation: " + f"mean={triton_stats['mean']:.8f}s, " + f"median={triton_stats['median']:.8f}s, " + f"p99={triton_stats['p99']:.8f}s" + ) + + print( + f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x" + ) + + # Write to CSV + if csv_writer: + row = [ + model_name, + tp_size, + num_tokens, + num_heads, + num_kv_heads, + head_dim, + max_position, + rope_theta, + is_neox_style, + str(rope_scaling), + str(dtype).split(".")[-1], + torch_stats["mean"], + torch_stats["median"], + torch_stats["p99"], + torch_stats["min"], + torch_stats["max"], + triton_stats["mean"], + triton_stats["median"], + triton_stats["p99"], + triton_stats["min"], + triton_stats["max"], + torch_stats["mean"] / triton_stats["mean"], # speedup + ] + csv_writer.writerow(row) + + return torch_stats, triton_stats + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the rotary embedding kernels." 
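+        # Editorial note (not part of the original patch): with --model-name "" the
+        # script sweeps the predefined Qwen2-VL / Qwen2.5-VL model set defined below,
+        # and omitting --num-tokens sweeps token counts from 2**0 through 2**17.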
+ ) + parser.add_argument("--model-name", type=str, default="") + parser.add_argument("--tp-size", type=int, default=1) + parser.add_argument("--warmup-iter", type=int, default=10) + parser.add_argument("--benchmark-iter", type=int, default=100) + parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--num-tokens", type=int, nargs="+", required=False) + parser.add_argument("--trust-remote-code", action="store_true") + parser.add_argument("--output-csv", type=str, default="mrope_benchmark_results.csv") + args = parser.parse_args() + print(args) + + # Create CSV file for results + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + csv_filename = f"{os.path.splitext(args.output_csv)[0]}_{timestamp}.csv" + + with open(csv_filename, "w", newline="") as csvfile: + csv_writer = csv.writer(csvfile) + # Write header + header = [ + "model_name", + "tp_size", + "num_tokens", + "num_heads", + "num_kv_heads", + "head_dim", + "max_position", + "rope_theta", + "is_neox_style", + "rope_scaling", + "dtype", + "torch_mean", + "torch_median", + "torch_p99", + "torch_min", + "torch_max", + "triton_mean", + "triton_median", + "triton_p99", + "triton_min", + "triton_max", + "speedup", + ] + csv_writer.writerow(header) + + model_tp_dict = {} + if args.model_name == "": + model_tp_dict = { + "Qwen/Qwen2-VL-2B-Instruct": [1], + "Qwen/Qwen2-VL-7B-Instruct": [1], + "Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8], + "Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8], + "Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8], + "Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8], + } + else: + model_tp_dict[args.model_name] = [args.tp_size] + + if args.num_tokens is None: + num_tokens_list = [2**i for i in range(0, 18)] + else: + num_tokens_list = args.num_tokens + + for model_name, tp_list in model_tp_dict.items(): + config = get_config(model_name, trust_remote_code=args.trust_remote_code) + for tp_size in tp_list: + # get the model config + total_num_kv_heads = config.num_key_value_heads + total_num_heads = config.num_attention_heads + num_heads = total_num_heads // tp_size + num_kv_heads = max(1, total_num_kv_heads // tp_size) + head_dim = config.hidden_size // total_num_heads + q_size = num_heads * head_dim + kv_size = num_kv_heads * head_dim + is_neox_style = True + rope_theta = config.rope_theta + max_position = config.max_position_embeddings + + for num_tokens in num_tokens_list: + benchmark_mrope( + model_name=model_name, + num_tokens=num_tokens, + head_dim=head_dim, + tp_size=tp_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + max_position=max_position, + rope_theta=rope_theta, + is_neox_style=is_neox_style, + rope_scaling=config.rope_scaling, + dtype=getattr(torch, args.dtype), + seed=args.seed, + warmup_iter=args.warmup_iter, + benchmark_iter=args.benchmark_iter, + csv_writer=csv_writer, + ) + + print(f"Benchmark results saved to {csv_filename}") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 7e0376c18ecc..1b1e71adeec4 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -3,16 +3,15 @@ import random import time -from typing import Optional import torch from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import ( +from vllm.utils import FlexibleArgumentParser +from vllm.utils.torch_utils import ( 
STR_DTYPE_TO_TORCH_DTYPE, - FlexibleArgumentParser, create_kv_caches_with_random, ) @@ -37,7 +36,7 @@ def main( seed: int, do_profile: bool, device: str = "cuda", - kv_cache_dtype: Optional[str] = None, + kv_cache_dtype: str | None = None, ) -> None: current_platform.seed_everything(seed) diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py new file mode 100644 index 000000000000..bdc1eb733084 --- /dev/null +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import math +from collections.abc import Callable +from contextlib import contextmanager +from unittest.mock import patch + +import torch + +from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils +from vllm.platforms import current_platform + + +@contextmanager +def _triton_mode(): + """Temporarily force the Triton fallback path""" + with patch("vllm.platforms.current_platform.is_cuda", return_value=False): + yield + + +def _time_cuda( + fn: Callable[[], tuple[torch.Tensor, torch.Tensor]], + warmup_iters: int, + bench_iters: int, +) -> float: + # warmup + for _ in range(warmup_iters): + fn() + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + for _ in range(bench_iters): + fn() + end.record() + torch.cuda.synchronize() + + return start.elapsed_time(end) / bench_iters # ms/iter + + +def _run_single( + shape: tuple[int, int], + group_size: int, + dtype: str, + *, + column_major: bool = False, + scale_ue8m0: bool = False, + warmup_iters: int, + bench_iters: int, +) -> None: + num_tokens, hidden_dim = shape + + device = torch.device("cuda") + torch.manual_seed(42) + x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16) * 8 + + if dtype == "fp8": + + def cuda_impl(): + return fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + + def triton_impl(): + with _triton_mode(): + return fp8_utils.per_token_group_quant_fp8( + x, + group_size, + column_major_scales=column_major, + use_ue8m0=scale_ue8m0, + ) + elif dtype == "int8": + + def cuda_impl(): + return int8_utils.per_token_group_quant_int8(x, group_size) + + def triton_impl(): + with _triton_mode(): + return int8_utils.per_token_group_quant_int8(x, group_size) + else: + raise ValueError("dtype must be 'fp8' or 'int8'") + + cuda_ms = _time_cuda(cuda_impl, warmup_iters, bench_iters) + triton_ms = _time_cuda(triton_impl, warmup_iters, bench_iters) + + speedup = triton_ms / cuda_ms if cuda_ms else math.inf + + cfg_desc = ( + f"shape={shape} gs={group_size:<3} col_major={column_major:<5} " + f"ue8m0={scale_ue8m0:<5} dtype={dtype}" + ) + print( + f"{cfg_desc:55} | CUDA {cuda_ms:7.3f} ms | Triton {triton_ms:7.3f} ms | " + f"speed-up ×{speedup:5.2f}" + ) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--warmup-iters", type=int, default=10) + parser.add_argument("--bench-iters", type=int, default=100) + parser.add_argument("--dtype", choices=["fp8", "int8", "both"], default="both") + return parser.parse_args() + + +if __name__ == "__main__": + if not current_platform.is_cuda(): + raise RuntimeError("CUDA device is required to run this benchmark.") + + args = parse_args() + warmup_iters, bench_iters = args.warmup_iters, args.bench_iters + + 
shapes = [(32, 128), (64, 256), (16, 512)] + group_sizes = [64, 128] + + dtypes = ["fp8", "int8"] if args.dtype == "both" else [args.dtype] + + header = ( + "Configuration".ljust(55) + + " | " + + "CUDA (ms)".center(12) + + " | " + + "Triton (ms)".center(13) + + " | " + + "Speed-up" + ) + print(header) + print("-" * len(header)) + + for dtype in dtypes: + for shape in shapes: + for gs in group_sizes: + if dtype == "fp8": + for col_major in (False, True): + for ue8m0 in (False, True): + _run_single( + shape, + gs, + dtype, + column_major=col_major, + scale_ue8m0=ue8m0, + warmup_iters=warmup_iters, + bench_iters=bench_iters, + ) + else: # INT8 has no col-major / ue8m0 switches + _run_single( + shape, + gs, + dtype, + warmup_iters=warmup_iters, + bench_iters=bench_iters, + ) diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 6ab26f5f1adf..61427a77b4e3 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -7,7 +7,8 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE @torch.inference_mode() diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py new file mode 100644 index 000000000000..e0ff09d4b397 --- /dev/null +++ b/benchmarks/kernels/benchmark_reshape_and_cache.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random +import time + +import torch +from tabulate import tabulate + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser +from vllm.utils.torch_utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + create_kv_caches_with_random, +) + +logger = init_logger(__name__) + + +@torch.inference_mode() +def run_benchmark( + num_tokens: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + kv_cache_dtype: str, + num_iters: int, + benchmark_mode: str, + device: str = "cuda", +) -> float: + """Return latency (seconds) for given num_tokens.""" + + if kv_cache_dtype == "fp8" and head_size % 16: + raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") + + current_platform.seed_everything(42) + torch.set_default_device(device) + + # create random key / value tensors [T, H, D]. + key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device) + value = torch.randn_like(key) + + # prepare the slot mapping. + # each token is assigned a unique slot in the KV-cache. + num_slots = block_size * num_blocks + if num_tokens > num_slots: + raise ValueError("num_tokens cannot exceed the total number of cache slots") + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) + + key_caches, value_caches = create_kv_caches_with_random( + num_blocks, + block_size, + 1, # num_layers + num_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + ) + key_cache, value_cache = key_caches[0], value_caches[0] + # to free unused memory + del key_caches, value_caches + + # compute per-kernel scaling factors for fp8 conversion (if used). 
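+    # Editorial note (illustrative, not part of the original patch): the scale value
+    # affects numerics only, not timing; amax / 64.0 simply caps key / k_scale at 64,
+    # comfortably inside the fp8 e4m3 range (max 448).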
+ k_scale = (key.amax() / 64.0).to(torch.float32) + v_scale = (value.amax() / 64.0).to(torch.float32) + + function_under_test = lambda: ops.reshape_and_cache( + key, # noqa: F821 + value, # noqa: F821 + key_cache, # noqa: F821 + value_cache, # noqa: F821 + slot_mapping, # noqa: F821 + kv_cache_dtype, + k_scale, + v_scale, + ) + + if benchmark_mode == "cudagraph": + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + function_under_test() + torch.cuda.synchronize() + function_under_test = lambda: g.replay() + + def run_cuda_benchmark(n_iters: int) -> float: + nonlocal key, value, key_cache, value_cache, slot_mapping + torch.cuda.synchronize() + start = time.perf_counter() + for _ in range(n_iters): + function_under_test() + torch.cuda.synchronize() + end = time.perf_counter() + return (end - start) / n_iters + + # warm-up + run_cuda_benchmark(3) + + lat = run_cuda_benchmark(num_iters) + + # free tensors to mitigate OOM when sweeping + del key, value, key_cache, value_cache, slot_mapping + torch.cuda.empty_cache() + + return lat + + +def main(args): + rows = [] + for exp in range(1, 17): + n_tok = 2**exp + lat = run_benchmark( + num_tokens=n_tok, + num_heads=args.num_heads, + head_size=args.head_size, + block_size=args.block_size, + num_blocks=args.num_blocks, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + kv_cache_dtype=args.kv_cache_dtype, + num_iters=args.iters, + benchmark_mode=args.mode, + device="cuda", + ) + rows.append([n_tok, lat * 1e6]) # convert to microseconds + + print(f"Benchmark results for implementation cuda (measuring with {args.mode}):") + print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f")) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + + parser.add_argument("--num-heads", type=int, default=128) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--num-blocks", type=int, default=128 * 128) + + parser.add_argument( + "--dtype", + type=str, + choices=["half", "bfloat16", "float"], + default="bfloat16", + ) + + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8"], + default="auto", + ) + + parser.add_argument("--iters", type=int, default=200) + + parser.add_argument( + "--mode", + type=str, + choices=["cudagraph", "no_graph"], + default="cudagraph", + ) + + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py new file mode 100644 index 000000000000..29f1b2ccdcf6 --- /dev/null +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random +import time + +import torch +from tabulate import tabulate + +from vllm import _custom_ops as ops +from vllm.attention.ops.triton_reshape_and_cache_flash import ( + triton_reshape_and_cache_flash, +) +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser +from vllm.utils.torch_utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + create_kv_caches_with_random_flash, +) + +logger = init_logger(__name__) + + +@torch.inference_mode() +def run_benchmark( + num_tokens: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + 
kv_cache_dtype: str, + kv_cache_layout: str, + num_iters: int, + implementation: str, + benchmark_mode: str, + device: str = "cuda", +) -> float: + """Return latency (seconds) for given num_tokens.""" + + if kv_cache_dtype == "fp8" and head_size % 16: + raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") + + if implementation not in ("cuda", "triton"): + raise ValueError( + f"Unsupported implementation: {implementation}. " + "Only 'cuda' and 'triton' are supported." + ) + if implementation == "triton" and kv_cache_layout == "HND": + return float("nan") # Triton does not support HND layout yet. + + current_platform.seed_everything(42) + torch.set_default_device(device) + + # create random key / value tensors [T, H, D]. + key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device) + value = torch.randn_like(key) + + # prepare the slot mapping. + # each token is assigned a unique slot in the KV-cache. + num_slots = block_size * num_blocks + if num_tokens > num_slots: + raise ValueError("num_tokens cannot exceed the total number of cache slots") + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) + + key_caches, value_caches = create_kv_caches_with_random_flash( + num_blocks, + block_size, + 1, # num_layers + num_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + cache_layout=kv_cache_layout, + ) + key_cache, value_cache = key_caches[0], value_caches[0] + # to free unused memory + del key_caches, value_caches + + # compute per-kernel scaling factors for fp8 conversion (if used). + k_scale = (key.amax() / 64.0).to(torch.float32) + v_scale = (value.amax() / 64.0).to(torch.float32) + + if implementation == "cuda": + function_under_test = lambda: ops.reshape_and_cache_flash( + key, # noqa: F821 + value, # noqa: F821 + key_cache, # noqa: F821 + value_cache, # noqa: F821 + slot_mapping, # noqa: F821 + kv_cache_dtype, + k_scale, + v_scale, + ) + else: + function_under_test = lambda: triton_reshape_and_cache_flash( + key, # noqa: F821 + value, # noqa: F821 + key_cache, # noqa: F821 + value_cache, # noqa: F821 + slot_mapping, # noqa: F821 + kv_cache_dtype, + k_scale, + v_scale, + ) + if benchmark_mode == "cudagraph": + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + function_under_test() + torch.cuda.synchronize() + function_under_test = lambda: g.replay() + + def run_cuda_benchmark(n_iters: int) -> float: + nonlocal key, value, key_cache, value_cache, slot_mapping + torch.cuda.synchronize() + start = time.perf_counter() + for _ in range(n_iters): + function_under_test() + torch.cuda.synchronize() + end = time.perf_counter() + return (end - start) / n_iters + + # warm-up + run_cuda_benchmark(3) + + lat = run_cuda_benchmark(num_iters) + + # free tensors to mitigate OOM when sweeping + del key, value, key_cache, value_cache, slot_mapping + torch.cuda.empty_cache() + + return lat + + +def main(args): + rows = [] + for layout in ["NHD", "HND"]: + for exp in range(1, 17): + n_tok = 2**exp + lat = run_benchmark( + num_tokens=n_tok, + num_heads=args.num_heads, + head_size=args.head_size, + block_size=args.block_size, + num_blocks=args.num_blocks, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + kv_cache_dtype=args.kv_cache_dtype, + kv_cache_layout=layout, + num_iters=args.iters, + implementation=args.implementation, + benchmark_mode=args.mode, + device="cuda", + ) + rows.append([n_tok, layout, f"{lat * 1e6:.3f}"]) + + print( + f"Benchmark results for 
implementation {args.implementation}" + f" (measuring with {args.mode}):" + ) + print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"])) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + + parser.add_argument("--num-heads", type=int, default=128) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--num-blocks", type=int, default=128 * 512) + + parser.add_argument( + "--dtype", + type=str, + choices=["half", "bfloat16", "float"], + default="bfloat16", + ) + + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8"], + default="auto", + ) + + parser.add_argument("--iters", type=int, default=100) + + parser.add_argument( + "--implementation", + type=str, + choices=["cuda", "triton"], + default="cuda", + ) + + parser.add_argument( + "--mode", + type=str, + choices=["cudagraph", "no_graph"], + default="cudagraph", + ) + + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index 4cf633a81358..d8d7f5bcf9da 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Optional, Union import torch from flashinfer.norm import fused_add_rmsnorm, rmsnorm @@ -21,8 +20,8 @@ def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: def forward( self, x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + residual: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: orig_dtype = x.dtype x = x.to(torch.float32) if residual is not None: @@ -41,7 +40,7 @@ def forward( def rmsnorm_naive( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) @@ -65,7 +64,7 @@ def rmsnorm_naive( def rmsnorm_flashinfer( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): orig_shape = x.shape @@ -89,7 +88,7 @@ def rmsnorm_flashinfer( def rmsnorm_vllm( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): orig_shape = x.shape diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index b81baf17a8c6..24869c91a8d7 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import accumulate -from typing import Optional import nvtx import torch @@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora( seq_len: int, num_heads: int, head_size: int, - rotary_dim: Optional[int], + rotary_dim: int | None, dtype: torch.dtype, seed: int, device: str, diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py new file mode 100644 index 000000000000..a5887aafd30d --- /dev/null +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -0,0 +1,720 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project + +""" +Comprehensive 3-way SiLU Benchmark Suite + +This benchmark compares three SiLU implementations: +1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation +2. Triton Kernel - Triton-based implementation + +The suite generates detailed performance comparisons including: +- Memory bandwidth utilization +- Speedup ratios (baseline vs optimized implementations) +- Performance across different expert configurations and token distributions +""" + +from collections.abc import Callable + +import matplotlib.pyplot as plt +import numpy as np +import torch + +from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( + persistent_masked_m_silu_mul_quant, +) +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used + + +@triton.jit +def _silu_mul_fp8_quant_deep_gemm( + # Pointers ------------------------------------------------------------ + input_ptr, # 16-bit activations (E, T, 2*H) + y_q_ptr, # fp8 quantized activations (E, T, H) + y_s_ptr, # 16-bit scales (E, T, G) + counts_ptr, # int32 num tokens per expert (E) + # Sizes --------------------------------------------------------------- + H: tl.constexpr, # hidden dimension (per output) + GROUP_SIZE: tl.constexpr, # elements per group (usually 128) + # Strides for input (elements) --------------------------------------- + stride_i_e, + stride_i_t, + stride_i_h, + # Strides for y_q (elements) ----------------------------------------- + stride_yq_e, + stride_yq_t, + stride_yq_h, + # Strides for y_s (elements) ----------------------------------------- + stride_ys_e, + stride_ys_t, + stride_ys_g, + # Stride for counts (elements) + stride_counts_e, + # Numeric params ------------------------------------------------------ + eps: tl.constexpr, + fp8_min: tl.constexpr, + fp8_max: tl.constexpr, + use_ue8m0: tl.constexpr, + # Meta --------------------------------------------------------------- + BLOCK: tl.constexpr, + NUM_STAGES: tl.constexpr, +): + G = H // GROUP_SIZE + + # map program id -> (e, g) + pid = tl.program_id(0) + e = pid // G + g = pid % G + + e = e.to(tl.int64) + g = g.to(tl.int64) + + # number of valid tokens for this expert + n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64) + + cols = tl.arange(0, BLOCK).to(tl.int64) + mask = cols < BLOCK + + base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h + base_gate_offset = base_input_offset + cols * stride_i_h + base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h + base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h + base_ys_offset = e * stride_ys_e + g * stride_ys_g + + for t in tl.range(0, n_tokens, num_stages=NUM_STAGES): + gate = tl.load( + input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0 + ).to(tl.float32) + up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0) + + gate = gate * (1.0 / (1.0 + tl.exp(-gate))) + y = gate * up + + y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max + if use_ue8m0: + y_s = tl.exp2(tl.ceil(tl.log2(y_s))) + + y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty) + + tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask) + tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s) + + +def silu_mul_fp8_quant_deep_gemm_triton( + y: torch.Tensor, # (E, T, 2*H) + tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert + num_parallel_tokens, + group_size: int = 
128, + eps: float = 1e-10, + expert_offsets: torch.Tensor = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales + + y has shape (E, T, 2*H). The first half of the last dimension is + silu-activated, multiplied by the second half, then quantized into FP8. + + Returns `(y_q, y_s)` where + * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H] + * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T) + """ + assert y.ndim == 3, "y must be (E, T, 2*H)" + E, T, H2 = y.shape + assert H2 % 2 == 0, "last dim of y must be even (2*H)" + H = H2 // 2 + G = (H + group_size - 1) // group_size + assert H % group_size == 0, "H must be divisible by group_size" + assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, ( + "tokens_per_expert must be shape (E,)" + ) + tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32) + + # allocate outputs + fp8_dtype = torch.float8_e4m3fn + y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device) + + # strides (elements) + stride_i_e, stride_i_t, stride_i_h = y.stride() + stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride() + + # desired scale strides (elements): (T*G, 1, T) + stride_ys_e = T * G + stride_ys_t = 1 + stride_ys_g = T + y_s = torch.empty_strided( + (E, T, G), + (stride_ys_e, stride_ys_t, stride_ys_g), + dtype=torch.float32, + device=y.device, + ) + + stride_cnt_e = tokens_per_expert.stride()[0] + + # Static grid over experts and H-groups. + # A loop inside the kernel handles the token dim + grid = (E * G,) + + f_info = torch.finfo(fp8_dtype) + fp8_max = f_info.max + fp8_min = f_info.min + + _silu_mul_fp8_quant_deep_gemm[grid]( + y, + y_q, + y_s, + tokens_per_expert, + H, + group_size, + stride_i_e, + stride_i_t, + stride_i_h, + stride_yq_e, + stride_yq_t, + stride_yq_h, + stride_ys_e, + stride_ys_t, + stride_ys_g, + stride_cnt_e, + eps, + fp8_min, + fp8_max, + is_deep_gemm_e8m0_used(), + BLOCK=group_size, + NUM_STAGES=4, + num_warps=1, + ) + + return y_q, y_s + + +# Parse generation strategies +strategies = ["random_imbalanced", "uniform", "max_t"] + + +def benchmark( + kernel: Callable, + E: int, + T: int, + H: int, + total_tokens: int, + num_parallel_tokens: int = 64, + G: int = 128, + runs: int = 200, + num_warmups: int = 20, + gen_strategy: str = "default", + iterations_per_run: int = 20, +): + def generate_data(seed_offset=0): + """Generate input data with given seed offset""" + current_platform.seed_everything(42 + seed_offset) + y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous() + + if gen_strategy == "random_imbalanced": + + def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"): + mean = total_tokens // n_e + min_max = mean // ratio + e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean + e[0] = min_max + r = torch.rand(size=(E - 1,)) + r /= r.sum() + r *= total_tokens - min_max + r = r.round().long() + e[1:] = r.to(device=device) + return e + + tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda") + elif gen_strategy == "uniform": + r = torch.rand(size=(E,)) + r /= r.sum() + r *= total_tokens + r = r.round().long() + tokens_per_expert = r + elif gen_strategy == "max_t": + tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda") + tokens_per_expert.fill_(total_tokens / E) + elif gen_strategy == "first_t": + tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda") + tokens_per_expert[0] = min(T, 
total_tokens) + else: + raise ValueError(f"Unknown generation strategy: {gen_strategy}") + return y, tokens_per_expert + + dataset_count = 4 + # Pre-generate different input matrices for each iteration to avoid cache effects + data_sets = [generate_data(i) for i in range(dataset_count)] + + # Warmup + y, tokens_per_expert = data_sets[0] + for _ in range(num_warmups): + kernel( + y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G + ) + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + # Benchmark + latencies: list[float] = [] + for _ in range(runs): + torch.cuda.synchronize() + + start_event.record() + for i in range(iterations_per_run): + y, tokens_per_expert = data_sets[i % dataset_count] + kernel( + y, + tokens_per_expert, + num_parallel_tokens=num_parallel_tokens, + group_size=G, + ) + end_event.record() + end_event.synchronize() + + total_time_ms = start_event.elapsed_time(end_event) + per_iter_time_ms = total_time_ms / iterations_per_run + latencies.append(per_iter_time_ms) + + # Use median instead of average for better outlier handling + median_time_ms = np.median(latencies) + median_time_s = median_time_ms / 1000 + + # Calculate actual work done (using first dataset for consistency) + _, tokens_per_expert = data_sets[0] + actual_tokens = tokens_per_expert.sum().item() + actual_elements = actual_tokens * H + + # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops + ops_per_element = 8 + total_ops = actual_elements * ops_per_element + gflops = total_ops / median_time_s / 1e9 + + # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes) + input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs + output_bytes = actual_tokens * H * 1 # H fp8 outputs + scale_bytes = actual_tokens * (H // G) * 4 # scales in float32 + total_bytes = input_bytes + output_bytes + scale_bytes + memory_bw = total_bytes / median_time_s / 1e9 + + HOPPER_BANDWIDTH_TBPS = 3.35 + return ( + median_time_ms, + gflops, + memory_bw, + (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100, + ) + + +def create_comparison_plot( + ratios, silu_v2_times, triton_times, config_labels, strategy_name, id +): + fig, ax = plt.subplots(1, 1, figsize=(18, 6)) + + # Configure x-axis positions + x = np.arange(len(config_labels)) + width = 0.25 + + # Execution Time plot (lower is better) + ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue") + ax.bar( + x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green" + ) + + # Add speedup labels over each bar trio + for i in range(len(x)): + triton_v2_speedup = ratios[i][1] # triton/v2 + max_height = max(silu_v2_times[i], triton_times[i]) + + # Triton/V2 speedup + ax.text( + x[i] + width / 2, + max_height + max_height * 0.02, + f"{triton_v2_speedup:.2f}x", + ha="center", + va="bottom", + fontweight="bold", + fontsize=8, + ) + + ax.set_xlabel("Configuration") + ax.set_ylabel("% Utilization") + ax.set_title( + f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)" + ) + ax.set_xticks(x) + ax.set_xticklabels(config_labels, rotation=45, ha="right") + ax.legend() + ax.grid(True, alpha=0.3) + + plt.tight_layout() + return fig, ax + + +def create_combined_plot(all_results): + num_strategies = len(all_results) + fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies)) + + if num_strategies == 1: + axes = [axes] + + for idx, ( + strategy_name, + 
all_ratios, + all_silu_v2_results, + all_triton_results, + config_labels, + config_x_axis, + ) in enumerate(all_results): + ax = axes[idx] + + # Flatten the nested results to get bandwidth percentages for plotting + silu_v2_bandwidths = [] + triton_bandwidths = [] + flat_ratios = [] + + for config_results in all_silu_v2_results: + for result in config_results: + silu_v2_bandwidths.append(result[3]) # bandwidth percentage + + for config_results in all_triton_results: + for result in config_results: + triton_bandwidths.append(result[3]) # bandwidth percentage + + for config_ratios in all_ratios: + for ratio in config_ratios: + flat_ratios.append(ratio) + + # Configure x-axis positions + x = np.arange(len(config_labels)) + width = 0.25 + + # Bandwidth utilization plot (higher is better) + ax.bar( + x, + silu_v2_bandwidths, + width, + label="SiLU V2 (CUDA)", + alpha=0.8, + color="blue", + ) + ax.bar( + x + width, + triton_bandwidths, + width, + label="Triton Kernel", + alpha=0.8, + color="green", + ) + + # Add speedup labels over each bar trio + for i in range(len(x)): + triton_v2_speedup = flat_ratios[i] # triton/v2 + max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i]) + + # Triton/V2 speedup + ax.text( + x[i] + width / 2, + max_height + max_height * 0.02, + f"{triton_v2_speedup:.2f}x", + ha="center", + va="bottom", + fontweight="bold", + fontsize=8, + ) + + ax.set_xlabel("Configuration") + ax.set_ylabel("% Utilization") + ax.set_title( + f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)" + ) + ax.set_xticks(x) + ax.set_xticklabels(config_labels, rotation=45, ha="right") + ax.legend() + ax.grid(True, alpha=0.3) + + plt.tight_layout() + filename = "silu_benchmark_combined_3way.png" + plt.savefig(filename, dpi=300, bbox_inches="tight") + plt.show() + + return filename + + +outer_dim = 7168 +configs = [ + # DeepSeekV3 Configs + # (1, 56, 7168), + (8, 1024, 7168), + # (32, 56, 7168), + # DeepSeekV3 Configs + (32, 1024, 7168), + # DeepSeekV3 Configs + (256, 1024, 7168), +] + +runs = 100 +num_warmups = 20 + +strategy_descriptions = { + "uniform": "Uniform Random", + "random_imbalanced": "Imbalanced Random", + "max_t": "Even Assignment", + "first_t": "experts[0] = T, experts[1:] = 0", +} + +print(f"GPU: {torch.cuda.get_device_name()}") +print(f"Testing strategies: {', '.join(strategies)}") +print(f"Configurations: {len(configs)} configs") + +all_results = [] + +# Run benchmarks for each strategy +for id, strategy in enumerate(strategies): + print(f"\n{'=' * 60}") + print(f"Testing strategy: {strategy_descriptions[strategy]}") + print(f"{'=' * 60}") + + # Collect benchmark data for all three algorithms + config_labels = [] + config_x_axis = [] + all_silu_v2_results = [] + all_triton_results = [] + all_ratios = [] + + for E, T, H in configs: + total_tokens_config = [] + for i in [8, 16, 32, 64, 128, 256, 512]: + if i <= T: + total_tokens_config.append(i * E) + config_x_axis.append(total_tokens_config) + + silu_v2_results = [] + triton_results = [] + ratios = [] + + for total_tokens in total_tokens_config: + config_label = f"E={E},T={T},H={H},TT={total_tokens}" + config_labels.append(config_label) + + # SiLU V2 (CUDA kernel) results + time_ms_silu_v2, gflops, gbps, perc = benchmark( + persistent_masked_m_silu_mul_quant, + E, + T, + H, + total_tokens, + runs=runs, + num_warmups=num_warmups, + gen_strategy=strategy, + ) + silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc)) + + # Triton kernel results + time_ms_triton, gflops, gbps, perc = benchmark( + 
silu_mul_fp8_quant_deep_gemm_triton, + E, + T, + H, + total_tokens, + runs=runs, + num_warmups=num_warmups, + gen_strategy=strategy, + ) + triton_results.append((time_ms_triton, gflops, gbps, perc)) + + # Calculate speedup ratios (triton baseline / implementation) + triton_v2_ratio = time_ms_triton / time_ms_silu_v2 + ratios.append(triton_v2_ratio) + + print( + f"Completed: {config_label}:" + f" V2: {time_ms_silu_v2:.3f}ms," + f" Triton: {time_ms_triton:.3f}ms" + ) + + all_silu_v2_results.append(silu_v2_results) + all_triton_results.append(triton_results) + all_ratios.append(ratios) + + # Store results for combined plotting + all_results.append( + ( + strategy_descriptions[strategy], + all_ratios, + all_silu_v2_results, + all_triton_results, + config_labels, + config_x_axis, + ) + ) + + # Print summary table for this strategy + print(f"\nSummary Table - {strategy_descriptions[strategy]}:") + print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}") + print("-" * 90) + + for i, (E, T, H) in enumerate(configs): + # Get the first result for each config (simplifying for summary) + v2_time = silu_v2_results[i][0] + triton_time = triton_results[i][0] + triton_v2_speedup = triton_time / v2_time + config_label = f"E={E:3d},T={T:4d},H={H:4d}" + print( + f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} " + f"{triton_v2_speedup:8.2f}x" + ) + + +def create_total_tokens_plot(all_results): + num_strategies = len(all_results) + num_configs = len(configs) + + fig, axs = plt.subplots( + num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies) + ) + + # Add main title to the entire figure + fig.suptitle( + "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)", + fontsize=18, + fontweight="bold", + y=0.98, + ) + + # Handle single strategy case + if num_strategies == 1: + axs = axs.reshape(1, -1) + + # Handle single config case + if num_configs == 1: + axs = axs.reshape(-1, 2) + + for strategy_idx, result in enumerate(all_results): + ( + strategy_name, + all_ratios, + all_silu_v2_results, + all_triton_results, + config_labels, + config_x_axis, + ) = result + + for config_idx in range(num_configs): + # Speedup plot (left column) + ax_speedup = axs[strategy_idx, config_idx * 2] + # Bandwidth plot (right column) + ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1] + + E, T, H = configs[config_idx] + ratios = all_ratios[config_idx] + total_tokens_values = config_x_axis[config_idx] + + # Extract speedup ratios + triton_v2_ratios = [ratio for ratio in ratios] + + # Extract bandwidth percentages for all implementations + v2_bandwidth_percentages = [ + result[3] for result in all_silu_v2_results[config_idx] + ] + triton_bandwidth_percentages = [ + result[3] for result in all_triton_results[config_idx] + ] + + # Plot speedup ratios vs total tokens (left plot) + ax_speedup.plot( + total_tokens_values, + triton_v2_ratios, + "go-", + linewidth=3, + markersize=8, + label="Triton/V2 Speedup", + ) + ax_speedup.set_title( + f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}", + fontsize=12, + fontweight="bold", + ) + ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) + ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11) + ax_speedup.legend(prop={"weight": "bold"}) + ax_speedup.grid(True, alpha=0.3) + + # Plot bandwidth utilization (right plot) + ax_bandwidth.plot( + total_tokens_values, + v2_bandwidth_percentages, + "o-", + linewidth=3, + markersize=8, + label="SiLU V2", + color="blue", + ) + ax_bandwidth.plot( + 
total_tokens_values, + triton_bandwidth_percentages, + "o-", + linewidth=3, + markersize=8, + label="Triton", + color="green", + ) + ax_bandwidth.set_title( + f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}", + fontsize=12, + fontweight="bold", + ) + ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) + ax_bandwidth.set_ylabel( + "% of Peak Bandwidth", fontweight="bold", fontsize=11 + ) + ax_bandwidth.legend(prop={"weight": "bold"}) + ax_bandwidth.grid(True, alpha=0.3) + + # Format x-axis labels for both plots + for ax in [ax_speedup, ax_bandwidth]: + ax.set_xticks(total_tokens_values) + ax.set_xticklabels( + [ + f"{tt // 1000}K" if tt >= 1000 else str(tt) + for tt in total_tokens_values + ], + fontweight="bold", + ) + # Make tick labels bold + for label in ax.get_xticklabels() + ax.get_yticklabels(): + label.set_fontweight("bold") + + # Add value labels on Triton/V2 speedup points + for x, y in zip(total_tokens_values, triton_v2_ratios): + ax_speedup.annotate( + f"{y:.2f}x", + (x, y), + textcoords="offset points", + xytext=(0, -15), + ha="center", + fontsize=9, + fontweight="bold", + bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3), + ) + + plt.tight_layout() + plt.subplots_adjust(top=0.93) # Make room for main title + filename = "silu_benchmark_total_tokens_3way.png" + plt.savefig(filename, dpi=300, bbox_inches="tight") + plt.show() + + return filename + + +# Create comprehensive 3-way comparison plots +combined_plot_filename = create_combined_plot(all_results) +total_tokens_plot_filename = create_total_tokens_plot(all_results) + +print(f"\n{'=' * 80}") +print("3-Way Benchmark Suite Complete!") +print(f"Generated combined comparison plot: {combined_plot_filename}") +print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}") +print("Compared: SiLU V2 (CUDA), and Triton implementations") +print(f"{'=' * 80}") diff --git a/benchmarks/kernels/benchmark_trtllm_attention.py b/benchmarks/kernels/benchmark_trtllm_attention.py deleted file mode 100644 index 8c980f930366..000000000000 --- a/benchmarks/kernels/benchmark_trtllm_attention.py +++ /dev/null @@ -1,240 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import csv -import os -import random -from datetime import datetime - -import flashinfer -import torch - -FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 - -# KV Cache Layout for TRT-LLM -# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim) - - -def to_float8(x, dtype=torch.float8_e4m3fn): - finfo = torch.finfo(dtype) - min_val, max_val = x.aminmax() - amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) - scale = finfo.max / amax * 0.1 - x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) - return x_scl_sat.to(dtype), scale.float().reciprocal() - - -@torch.no_grad() -def benchmark_decode( - num_seqs, - max_seq_len, - page_size=16, - dtype=torch.bfloat16, - kv_layout="HND", - num_kv_heads=8, - kv_cache_dtype="auto", - head_dim=128, - warmup=10, - trials=20, -): - torch.set_default_device("cuda") - device = "cuda" - torch.manual_seed(0) - - # Currently only HEAD_GRP_SIZE == 8 is supported - HEAD_GRP_SIZE = 8 - MAX_SEQ_LEN = max_seq_len - - # large number to reduce kv_cache reuse - NUM_BLOCKS = int(256000 / page_size) - - workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8, device=device) - - # For decode, batch_size is num_decode_token - num_qo_heads = num_kv_heads * HEAD_GRP_SIZE - 
sm_scale = float(1.0 / (head_dim**0.5)) - q = torch.randn(num_seqs, num_qo_heads, head_dim, device=device, dtype=dtype) - kv_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)] - - max_kv_len = max(kv_lens) - kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int, device=device) - max_num_blocks_per_seq = (max_kv_len + page_size - 1) // page_size - - block_tables = torch.randint( - 0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 - ) - - kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim) - kv_cache = torch.randn(size=kv_cache_shape, device=device, dtype=dtype) - k_scale = v_scale = 1.0 - - if kv_cache_dtype.startswith("fp8"): - kv_cache, _ = to_float8(kv_cache) - - # Benchmark TRT decode - def trt_decode(): - return flashinfer.decode.trtllm_batch_decode_with_kv_cache( - q, - kv_cache, - workspace_buffer, - num_qo_heads, - num_kv_heads, - sm_scale, - block_tables, - kv_lens_tensor, - page_size, - max_kv_len, - kv_cache_dtype, - k_scale, - v_scale, - ) - - def time_fn(fn, warmup=10, trials=20): - torch.cuda.synchronize() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - times = [] - for i in range(warmup): - fn() - for i in range(trials): - start.record() - fn() - end.record() - torch.cuda.synchronize() - times.append(start.elapsed_time(end)) # ms - return sum(times) / len(times), torch.std(torch.tensor(times)) - - # TRT Decode - trt_mean, trt_std = time_fn(trt_decode) - - kv_indptr = [0] - kv_indices = [] - kv_last_page_lens = [] - for i in range(num_seqs): - seq_len = kv_lens[i] - assert seq_len > 0 - num_blocks = (seq_len + page_size - 1) // page_size - kv_indices.extend(block_tables[i, :num_blocks]) - kv_indptr.append(kv_indptr[-1] + num_blocks) - kv_last_page_len = seq_len % page_size - if kv_last_page_len == 0: - kv_last_page_len = page_size - kv_last_page_lens.append(kv_last_page_len) - - kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) - kv_indices = torch.tensor(kv_indices, dtype=torch.int32) - kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) - - wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( - workspace_buffer, - kv_layout, - use_tensor_cores=((num_qo_heads // num_kv_heads) > 4), - ) - - wrapper.plan( - kv_indptr, - kv_indices, - kv_last_page_lens, - num_qo_heads, - num_kv_heads, - head_dim, - page_size, - "NONE", - q_data_type=dtype, - kv_data_type=torch.float8_e4m3fn if kv_cache_dtype.startswith("fp8") else dtype, - ) - - def baseline_decode(): - return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale) - - baseline_mean, baseline_std = time_fn(baseline_decode) - - # Calculate percentage speedup (positive means TRT is faster) - speedup_percent = (baseline_mean - trt_mean) / baseline_mean - - print( - f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.3f}\t{trt_std.item():.3f}" - f"\t{baseline_mean:.3f}\t{baseline_std.item():.3f}\t{speedup_percent:.3f}" - ) - - # Return results for CSV writing - return { - "num_seqs": num_seqs, - "trt_mean": trt_mean, - "trt_std": trt_std.item(), - "baseline_mean": baseline_mean, - "baseline_std": baseline_std.item(), - "speedup_percent": speedup_percent, - "q_dtype": str(dtype), - "kv_cache_dtype": kv_cache_dtype, - "page_size": page_size, - "num_kv_heads": num_kv_heads, - "head_dim": head_dim, - "max_seq_len": max_seq_len, - } - - -def write_results_to_csv(results, filename=None): - """Write benchmark results to CSV file.""" - if filename is None: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = 
f"flashinfer_trtllm_benchmark_{timestamp}.csv" - - fieldnames = [ - "num_seqs", - "trt_mean", - "trt_std", - "baseline_mean", - "baseline_std", - "speedup_percent", - "q_dtype", - "kv_cache_dtype", - "page_size", - "num_kv_heads", - "head_dim", - "max_seq_len", - ] - - file_exists = os.path.exists(filename) - - with open(filename, "a", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - - if not file_exists: - writer.writeheader() - - for result in results: - writer.writerow(result) - - print(f"Results written to {filename}") - - -if __name__ == "__main__": - num_seqs = [1, 4, 8, 16, 32, 64, 128, 256] - max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] - all_results = [] - - print("Running benchmark for kv_cache_dtype: bfloat16") - print( - "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent" - ) - for max_seq_len in max_seq_lens: - for bs in num_seqs: - result = benchmark_decode( - bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="auto" - ) - all_results.append(result) - - print("Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8") - print( - "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent" - ) - for max_seq_len in max_seq_lens: - for bs in num_seqs: - result = benchmark_decode( - bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="fp8" - ) - all_results.append(result) - - # Write all results to CSV - write_results_to_csv(all_results) diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py new file mode 100644 index 000000000000..f7cdc25794ca --- /dev/null +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -0,0 +1,290 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import csv +import os +from datetime import datetime + +import flashinfer +import torch + +from vllm.utils import round_up + +FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 +FP8_DTYPE = torch.float8_e4m3fn +FP4_DTYPE = torch.uint8 + + +def to_float8(x, dtype=torch.float8_e4m3fn): + finfo = torch.finfo(dtype) + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) + scale = finfo.max / amax * 0.1 + x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) + return x_scl_sat.to(dtype), scale.float().reciprocal() + + +@torch.no_grad() +def benchmark_decode( + dtype: torch.dtype, + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], + batch_size: int, + max_seq_len: int, + num_heads: tuple[int, int] = (64, 8), + head_size: int = 128, + kv_layout: str = "HND", + block_size: int = 16, + warmup: int = 10, + trials: int = 20, +): + torch.set_default_device("cuda") + torch.manual_seed(0) + + q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes + q_quant_dtype = q_quant_dtype or dtype + kv_quant_dtype = kv_quant_dtype or dtype + o_quant_dtype = o_quant_dtype or dtype + + num_qo_heads, num_kv_heads = num_heads + assert num_qo_heads % num_kv_heads == 0 + + sm_scale = float(1.0 / (head_size**0.5)) + + # large number to reduce kv_cache reuse + NUM_BLOCKS = int(256000 / block_size) + + kv_cache_shape = None + if kv_layout == "NHD": + kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size) + elif kv_layout == "HND": + kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size) + else: + raise ValueError(f"Invalid kv_layout: 
{kv_layout}") + + # Always using 1.0 scale to reflect the real perf in benchmarking + q_scale = 1.0 + ref_query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype) + if q_quant_dtype == FP8_DTYPE: + query, _ = to_float8(ref_query) + else: + query = ref_query + + kv_lens = torch.randint(1, max_seq_len, (batch_size,), dtype=torch.int32) + kv_lens[-1] = max_seq_len + + seq_lens = kv_lens + max_seq_len = torch.max(seq_lens).item() + + # Always using 1.0 scale to reflect the real perf in benchmarking + k_scale = v_scale = 1.0 + ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype) + if kv_quant_dtype == FP8_DTYPE: + kv_cache, _ = to_float8(ref_kv_cache) + else: + kv_cache = ref_kv_cache + + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + block_tables = torch.randint( + 0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32 + ) + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(batch_size): + seq_len = seq_lens[i] + assert seq_len > 0 + num_blocks = (seq_len + block_size - 1) // block_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % block_size + if kv_last_page_len == 0: + kv_last_page_len = block_size + kv_last_page_lens.append(kv_last_page_len) + + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8) + + wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, + kv_layout, + use_tensor_cores=True, + ) + wrapper.plan( + kv_indptr, + kv_indices, + kv_last_page_lens, + num_qo_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + sm_scale=sm_scale, + q_data_type=dtype, + kv_data_type=dtype, + ) + + def time_fn(fn, warmup=10, trials=20): + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + times = [] + for i in range(warmup): + fn() + for i in range(trials): + start.record() + fn() + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) # ms + return sum(times) / len(times), torch.std(torch.tensor(times)) + + o_scale = 1.0 + o_sf_scale = None + output_baseline = torch.empty(ref_query.shape, dtype=dtype) + if o_quant_dtype == FP4_DTYPE: + o_sf_scale = 500.0 + output_trtllm = flashinfer.utils.FP4Tensor( + torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8), + torch.empty( + ( + round_up(query.shape[0], 128), + round_up(query.shape[1] * query.shape[2] // 16, 4), + ), + dtype=torch.float8_e4m3fn, + ), + ) + else: + output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype) + + def baseline_decode(): + return wrapper.run( + ref_query, + ref_kv_cache, + k_scale=k_scale, + v_scale=v_scale, + out=output_baseline, + ) + + def trtllm_decode(): + return flashinfer.decode.trtllm_batch_decode_with_kv_cache( + query=query, + kv_cache=kv_cache, + workspace_buffer=workspace_buffer, + block_tables=block_tables, + seq_lens=seq_lens, + max_seq_len=max_seq_len, + bmm1_scale=q_scale * k_scale * sm_scale, + bmm2_scale=v_scale / o_scale, + o_sf_scale=o_sf_scale, + out=output_trtllm, + ) + + baseline_mean, baseline_std = time_fn(baseline_decode) + trtllm_mean, trtllm_std = time_fn(trtllm_decode) + + # Calculate percentage speedup (positive means TRT is faster) + speedup_percent = (baseline_mean - trtllm_mean) / 
baseline_mean + + print( + f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:.3f}\t{trtllm_std.item():.3f}" + f"\t{baseline_mean:.3f}\t{baseline_std.item():.3f}\t{speedup_percent:.3f}" + ) + + # Return results for CSV writing + return { + "batch_size": batch_size, + "trtllm_mean": trtllm_mean, + "trtllm_std": trtllm_std.item(), + "baseline_mean": baseline_mean, + "baseline_std": baseline_std.item(), + "speedup_percent": speedup_percent, + "q_dtype": str(q_quant_dtype), + "kv_cache_dtype": str(kv_quant_dtype), + "output_dtype": str(o_quant_dtype), + "block_size": block_size, + "num_kv_heads": num_kv_heads, + "head_size": head_size, + "max_seq_len": max_seq_len, + } + + +def write_results_to_csv(results, filename=None): + """Write benchmark results to CSV file.""" + if filename is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv" + + fieldnames = [ + "batch_size", + "trtllm_mean", + "trtllm_std", + "baseline_mean", + "baseline_std", + "speedup_percent", + "q_dtype", + "kv_cache_dtype", + "output_dtype", + "block_size", + "num_kv_heads", + "head_size", + "max_seq_len", + ] + + file_exists = os.path.exists(filename) + + with open(filename, "a", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + if not file_exists: + writer.writeheader() + + for result in results: + writer.writerow(result) + + print(f"Results written to {filename}") + + +if __name__ == "__main__": + batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256] + max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] + all_results = [] + + dtype = torch.bfloat16 + quant_dtypes = [ + # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) + (None, None, None), + (None, FP8_DTYPE, None), + (FP8_DTYPE, FP8_DTYPE, None), + (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), + (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), + ] + + for quant_dtype in quant_dtypes: + q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype + q_quant_dtype = q_quant_dtype or dtype + kv_quant_dtype = kv_quant_dtype or dtype + o_quant_dtype = o_quant_dtype or dtype + + print( + f"Running benchmark for q_dtype = {q_quant_dtype}, " + f"kv_cache_dtype: {kv_quant_dtype}, " + f"output_dtype: {o_quant_dtype}" + ) + print( + "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" + ) + for max_seq_len in max_seq_lens: + for bs in batch_sizes: + result = benchmark_decode( + dtype=dtype, + quant_dtypes=quant_dtype, + batch_size=bs, + max_seq_len=max_seq_len, + ) + all_results.append(result) + + # Write all results to CSV + write_results_to_csv(all_results) diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py new file mode 100644 index 000000000000..7993354475fc --- /dev/null +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -0,0 +1,305 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import csv +import os +from datetime import datetime + +import flashinfer +import torch + +from vllm.utils import round_up + +FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 +FP8_DTYPE = torch.float8_e4m3fn +FP4_DTYPE = torch.uint8 + + +def to_float8(x, dtype=torch.float8_e4m3fn): + finfo = torch.finfo(dtype) + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) + scale = finfo.max / amax * 0.1 + x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max) + return 
x_scl_sat.to(dtype), scale.float().reciprocal() + + +@torch.no_grad() +def benchmark_prefill( + dtype: torch.dtype, + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], + batch_size: int, + max_seq_len: int, + num_heads: tuple[int, int] = (64, 8), + head_size: int = 128, + kv_layout: str = "HND", + block_size: int = 16, + warmup: int = 10, + trials: int = 20, +): + torch.set_default_device("cuda") + torch.manual_seed(0) + + q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes + q_quant_dtype = q_quant_dtype or dtype + kv_quant_dtype = kv_quant_dtype or dtype + o_quant_dtype = o_quant_dtype or dtype + + max_q_len = max_kv_len = max_seq_len + + num_qo_heads, num_kv_heads = num_heads + assert num_qo_heads % num_kv_heads == 0 + + sm_scale = float(1.0 / (head_size**0.5)) + + # large number to reduce kv_cache reuse + NUM_BLOCKS = int(256000 / block_size) + + kv_cache_shape = None + if kv_layout == "NHD": + kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size) + elif kv_layout == "HND": + kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size) + else: + raise ValueError(f"Invalid kv_layout: {kv_layout}") + + q_lens = torch.randint(1, max_q_len, (batch_size,), dtype=torch.int32) + q_lens[-1] = max_q_len + q_indptr = torch.cat( + [ + torch.tensor([0], dtype=torch.int32), + torch.cumsum(q_lens, dim=0, dtype=torch.int32), + ] + ) + + # Always using 1.0 scale to reflect the real perf in benchmarking + q_scale = 1.0 + ref_query = torch.randn( + torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype + ) + if q_quant_dtype == FP8_DTYPE: + query, _ = to_float8(ref_query) + else: + query = ref_query + + kv_lens = torch.randint(0, max_kv_len, (batch_size,), dtype=torch.int32) + kv_lens[-1] = max_kv_len + + seq_lens = kv_lens + q_lens + max_seq_len = torch.max(seq_lens).item() + + # Always using 1.0 scale to reflect the real perf in benchmarking + k_scale = v_scale = 1.0 + ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype) + if kv_quant_dtype == FP8_DTYPE: + kv_cache, _ = to_float8(ref_kv_cache) + else: + kv_cache = ref_kv_cache + + max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size + block_tables = torch.randint( + 0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32 + ) + kv_indptr = [0] + kv_indices = [] + kv_last_page_lens = [] + for i in range(batch_size): + seq_len = seq_lens[i] + assert seq_len > 0 + num_blocks = (seq_len + block_size - 1) // block_size + kv_indices.extend(block_tables[i, :num_blocks]) + kv_indptr.append(kv_indptr[-1] + num_blocks) + kv_last_page_len = seq_len % block_size + if kv_last_page_len == 0: + kv_last_page_len = block_size + kv_last_page_lens.append(kv_last_page_len) + + kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) + kv_indices = torch.tensor(kv_indices, dtype=torch.int32) + kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32) + workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8) + + wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( + workspace_buffer, kv_layout + ) + wrapper.plan( + q_indptr, + kv_indptr, + kv_indices, + kv_last_page_lens, + num_qo_heads, + num_kv_heads, + head_size, + block_size, + causal=True, + sm_scale=sm_scale, + q_data_type=dtype, + kv_data_type=dtype, + ) + + def time_fn(fn, warmup=10, trials=20): + torch.cuda.synchronize() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + times = [] + for i in range(warmup): + fn() + for i in range(trials): 
+ start.record() + fn() + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) # ms + return sum(times) / len(times), torch.std(torch.tensor(times)) + + o_scale = 1.0 + o_sf_scale = None + output_baseline = torch.empty(ref_query.shape, dtype=dtype) + if o_quant_dtype == FP4_DTYPE: + o_sf_scale = 500.0 + output_trtllm = flashinfer.utils.FP4Tensor( + torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8), + torch.empty( + ( + round_up(query.shape[0], 128), + round_up(query.shape[1] * query.shape[2] // 16, 4), + ), + dtype=torch.float8_e4m3fn, + ), + ) + else: + output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype) + + def baseline_prefill(): + return wrapper.run( + ref_query, + ref_kv_cache, + k_scale=k_scale, + v_scale=v_scale, + out=output_baseline, + ) + + def trtllm_prefill(): + return flashinfer.prefill.trtllm_batch_context_with_kv_cache( + query=query, + kv_cache=kv_cache, + workspace_buffer=workspace_buffer, + block_tables=block_tables, + seq_lens=seq_lens, + max_q_len=max_q_len, + max_kv_len=max_seq_len, + bmm1_scale=q_scale * k_scale * sm_scale, + bmm2_scale=v_scale / o_scale, + batch_size=batch_size, + cum_seq_lens_q=q_indptr, + cum_seq_lens_kv=kv_indptr, + o_sf_scale=o_sf_scale, + out=output_trtllm, + ) + + baseline_mean, baseline_std = time_fn(baseline_prefill) + trtllm_mean, trtllm_std = time_fn(trtllm_prefill) + + # Calculate percentage speedup (positive means TRT is faster) + speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean + + print( + f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:8.3f}\t{trtllm_std.item():8.3f}" + f"\t{baseline_mean:8.3f}\t{baseline_std.item():8.3f}\t{speedup_percent:8.3f}" + ) + + # Return results for CSV writing + return { + "batch_size": batch_size, + "trtllm_mean": trtllm_mean, + "trtllm_std": trtllm_std.item(), + "baseline_mean": baseline_mean, + "baseline_std": baseline_std.item(), + "speedup_percent": speedup_percent, + "q_dtype": str(q_quant_dtype), + "kv_cache_dtype": str(kv_quant_dtype), + "output_dtype": str(o_quant_dtype), + "block_size": block_size, + "num_kv_heads": num_kv_heads, + "head_size": head_size, + "max_seq_len": max_seq_len, + } + + +def write_results_to_csv(results, filename=None): + """Write benchmark results to CSV file.""" + if filename is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv" + + fieldnames = [ + "batch_size", + "trtllm_mean", + "trtllm_std", + "baseline_mean", + "baseline_std", + "speedup_percent", + "q_dtype", + "kv_cache_dtype", + "output_dtype", + "block_size", + "num_kv_heads", + "head_size", + "max_seq_len", + ] + + file_exists = os.path.exists(filename) + + with open(filename, "a", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + if not file_exists: + writer.writeheader() + + for result in results: + writer.writerow(result) + + print(f"Results written to {filename}") + + +if __name__ == "__main__": + batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256] + max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] + all_results = [] + + dtype = torch.bfloat16 + quant_dtypes = [ + # (q_quant_dtype, kv_quant_dtype, o_quant_dtype) + (None, None, None), + (FP8_DTYPE, FP8_DTYPE, None), + (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE), + (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE), + ] + + for quant_dtype in quant_dtypes: + q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype + q_quant_dtype = q_quant_dtype or dtype + kv_quant_dtype = kv_quant_dtype or dtype 
+ o_quant_dtype = o_quant_dtype or dtype + + print( + f"Running benchmark for q_dtype = {q_quant_dtype}, " + f"kv_cache_dtype: {kv_quant_dtype}, " + f"output_dtype: {o_quant_dtype}" + ) + print( + "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t" + "baseline_std\tspeedup_percent" + ) + for max_seq_len in max_seq_lens: + for bs in batch_sizes: + result = benchmark_prefill( + dtype=dtype, + quant_dtypes=quant_dtype, + batch_size=bs, + max_seq_len=max_seq_len, + ) + all_results.append(result) + + # Write all results to CSV + write_results_to_csv(all_results) diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 4fcdbadd65ec..602fad181074 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -11,13 +11,13 @@ from typing import Any import torch -import tqdm -import triton +from tqdm import tqdm from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - _w8a8_block_fp8_matmul, + _w8a8_triton_block_scaled_mm, ) from vllm.platforms import current_platform +from vllm.triton_utils import triton from vllm.utils import FlexibleArgumentParser mp.set_start_method("spawn", force=True) @@ -56,7 +56,7 @@ def w8a8_block_matmul( Bs: The per-block quantization scale for `B`. block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128]. - output_dytpe: The dtype of the returned tensor. + output_dtype: The dtype of the returned tensor. Returns: torch.Tensor: The result of matmul. @@ -83,7 +83,7 @@ def grid(META): ) if A.dtype == torch.float8_e4m3fn: - kernel = _w8a8_block_fp8_matmul + kernel = _w8a8_triton_block_scaled_mm else: raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") @@ -141,6 +141,7 @@ def get_weight_shapes(tp_size): # cannot TP total = [ (512 + 64, 7168), + (2112, 7168), ((128 + 64) * 128, 7168), (128 * (128 + 128), 512), (7168, 16384), diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md index 917e814010f8..41e68e047be8 100644 --- a/benchmarks/kernels/deepgemm/README.md +++ b/benchmarks/kernels/deepgemm/README.md @@ -8,7 +8,7 @@ Currently this just includes dense GEMMs and only works on Hopper GPUs. You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory: -``` +```bash git clone --recursive https://github.com/deepseek-ai/DeepGEMM cd DeepGEMM python setup.py install @@ -17,7 +17,7 @@ uv pip install -e . ## Usage -``` +```console python benchmark_fp8_block_dense_gemm.py INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda. 
===== STARTING FP8 GEMM BENCHMARK ===== diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index 43c54d56ca8c..ba31bc563829 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -1,67 +1,39 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# fmt: off # ruff: noqa: E501 import time -# Import DeepGEMM functions -import deep_gemm import torch -from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor -# Import vLLM functions from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, - w8a8_block_fp8_matmul, + w8a8_triton_block_scaled_mm, ) from vllm.triton_utils import triton +from vllm.utils.deep_gemm import ( + calc_diff, + fp8_gemm_nt, + get_col_major_tma_aligned_tensor, + per_block_cast_to_fp8, +) -# Copied from -# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 -def per_token_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Convert tensor to FP8 format with per-token scaling.""" - assert x.dim() == 2 and x.size(1) % 128 == 0 - m, n = x.shape - x_view = x.view(m, -1, 128) - x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - return (x_view * (448.0 / x_amax.unsqueeze(2))).to( - torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) - - -# Copied from -# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 -def per_block_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Convert tensor to FP8 format with per-block scaling.""" - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), - dtype=x.dtype, - device=x.device) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - x_amax / 448.0).view(x_view.size(0), x_view.size(2)) - - -def benchmark_shape(m: int, - n: int, - k: int, - warmup: int = 100, - repeat: int = 10000, - verbose: bool = False) -> dict: +def benchmark_shape( + m: int, + n: int, + k: int, + warmup: int = 100, + repeat: int = 10000, + verbose: bool = False, +) -> dict: """Benchmark all implementations for a specific (m, n, k) shape.""" if verbose: print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") # Create test tensors - A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) - B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) + A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) + B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16) # Reference result in BF16 torch.cuda.synchronize() @@ -69,43 +41,48 @@ def benchmark_shape(m: int, # Pre-quantize B for all implementations # (weights can be pre-quantized offline) - B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) - B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) # Block size configuration block_size = [128, 128] # 
Pre-quantize A for all implementations - A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) - C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( - A, block_size[1], column_major_scales=True) + A, block_size[1], column_major_scales=True + ) # === DeepGEMM Implementation === def deepgemm_gemm(): - deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), - (B_deepgemm, B_scale_deepgemm), - C_deepgemm) + fp8_gemm_nt( + (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm + ) return C_deepgemm # === vLLM Triton Implementation === def vllm_triton_gemm(): - return w8a8_block_fp8_matmul(A_vllm, - B_vllm, - A_scale_vllm, - B_scale_vllm, - block_size, - output_dtype=torch.bfloat16) + return w8a8_triton_block_scaled_mm( + A_vllm, + B_vllm, + A_scale_vllm, + B_scale_vllm, + block_size, + output_dtype=torch.bfloat16, + ) # === vLLM CUTLASS Implementation === def vllm_cutlass_gemm(): - return ops.cutlass_scaled_mm(A_vllm_cutlass, - B_vllm.T, - scale_a=A_scale_vllm_cutlass, - scale_b=B_scale_vllm.T, - out_dtype=torch.bfloat16) + return ops.cutlass_scaled_mm( + A_vllm_cutlass, + B_vllm.T, + scale_a=A_scale_vllm_cutlass, + scale_b=B_scale_vllm.T, + out_dtype=torch.bfloat16, + ) # Run correctness check first if verbose: @@ -122,26 +99,23 @@ def vllm_cutlass_gemm(): print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") - print("vLLM Triton vs DeepGEMM difference: " - f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}") - print("vLLM CUTLASS vs DeepGEMM difference: " - f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}") + print( + "vLLM Triton vs DeepGEMM difference: " + f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}" + ) + print( + "vLLM CUTLASS vs DeepGEMM difference: " + f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}" + ) # Benchmark implementations implementations = { "DeepGEMM": deepgemm_gemm, "vLLM Triton": vllm_triton_gemm, - "vLLM CUTLASS": vllm_cutlass_gemm + "vLLM CUTLASS": vllm_cutlass_gemm, } - benchmark_results = { - "shape": { - "m": m, - "n": n, - "k": k - }, - "implementations": {} - } + benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}} for name, func in implementations.items(): # Warmup @@ -169,38 +143,36 @@ def vllm_cutlass_gemm(): "tflops": tflops, "gb_s": gb_s, "diff": { - "DeepGEMM": - 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm), - "Reference": - deepgemm_diff if name == "DeepGEMM" else - (vllm_triton_diff - if name == "vLLM Triton" else vllm_cutlass_diff) - } + "DeepGEMM": 0.0 + if name == "DeepGEMM" + else calc_diff(func(), C_deepgemm), + "Reference": deepgemm_diff + if name == "DeepGEMM" + else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff), + }, } if verbose: - print( - f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s" - ) + print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s") # Calculate speedups baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] for name, data in benchmark_results["implementations"].items(): if name != 
"DeepGEMM": speedup = baseline / data["time_ms"] - benchmark_results["implementations"][name][ - "speedup_vs_deepgemm"] = speedup + benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup if verbose: - print(f"DeepGEMM is {1/speedup:.2f}x " - f"{'faster' if 1/speedup > 1 else 'slower'} than {name}") + print( + f"DeepGEMM is {1 / speedup:.2f}x " + f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}" + ) - vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][ - "time_ms"] - vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][ - "time_ms"] + vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"] + vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"] cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time - benchmark_results["implementations"]["vLLM CUTLASS"][ - "speedup_vs_triton"] = cutlass_vs_triton + benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = ( + cutlass_vs_triton + ) if verbose: print( f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " @@ -212,8 +184,7 @@ def vllm_cutlass_gemm(): def format_table_row(values, widths): """Format a row with specified column widths.""" - return "| " + " | ".join(f"{val:{w}}" - for val, w in zip(values, widths)) + " |" + return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |" def print_table(headers, rows, title=None): @@ -321,38 +292,50 @@ def run_benchmarks(verbose: bool = False): for result in all_results: shape = result["shape"] impl_data = result["implementations"]["DeepGEMM"] - deepgemm_rows.append([ - shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", - f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}" - ]) + deepgemm_rows.append( + [ + shape["m"], + shape["n"], + shape["k"], + f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", + f"{impl_data['gb_s']:.1f}", + ] + ) - print_table(deepgemm_headers, - deepgemm_rows, - title="DeepGEMM Implementation:") + print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:") # Print vLLM Triton table - triton_headers = [ - "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM" - ] + triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"] triton_rows = [] for result in all_results: shape = result["shape"] impl_data = result["implementations"]["vLLM Triton"] speedup = impl_data.get("speedup_vs_deepgemm", 1.0) - triton_rows.append([ - shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", - f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", - format_speedup(speedup) - ]) + triton_rows.append( + [ + shape["m"], + shape["n"], + shape["k"], + f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", + f"{impl_data['gb_s']:.1f}", + format_speedup(speedup), + ] + ) - print_table(triton_headers, - triton_rows, - title="vLLM Triton Implementation:") + print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:") # Print vLLM CUTLASS table cutlass_headers = [ - "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM", - "vs Triton" + "m", + "n", + "k", + "Time (μs)", + "TFLOPS", + "GB/s", + "vs DeepGEMM", + "vs Triton", ] cutlass_rows = [] for result in all_results: @@ -360,28 +343,27 @@ def run_benchmarks(verbose: bool = False): impl_data = result["implementations"]["vLLM CUTLASS"] vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) vs_triton = impl_data.get("speedup_vs_triton", 1.0) - cutlass_rows.append([ - shape["m"], 
shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", - f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", - format_speedup(vs_deepgemm), - format_speedup(vs_triton) - ]) + cutlass_rows.append( + [ + shape["m"], + shape["n"], + shape["k"], + f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", + f"{impl_data['gb_s']:.1f}", + format_speedup(vs_deepgemm), + format_speedup(vs_triton), + ] + ) - print_table(cutlass_headers, - cutlass_rows, - title="vLLM CUTLASS Implementation:") + print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:") # Calculate and print averages print("\n===== AVERAGE PERFORMANCE =====") implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] avg_metrics = { - impl: { - "tflops": 0, - "gb_s": 0, - "time_ms": 0 - } - for impl in implementations + impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations } for result in all_results: @@ -399,9 +381,9 @@ def run_benchmarks(verbose: bool = False): avg_tflops = avg_metrics[impl]["tflops"] / num_shapes avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes avg_time = avg_metrics[impl]["time_ms"] / num_shapes - avg_rows.append([ - impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}" - ]) + avg_rows.append( + [impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"] + ) print_table(avg_headers, avg_rows) @@ -409,21 +391,19 @@ def run_benchmarks(verbose: bool = False): avg_speedups = { "DeepGEMM vs vLLM Triton": 0, "DeepGEMM vs vLLM CUTLASS": 0, - "vLLM CUTLASS vs vLLM Triton": 0 + "vLLM CUTLASS vs vLLM Triton": 0, } for result in all_results: deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] - vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][ - "time_ms"] + vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"] - avg_speedups[ - "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time - avg_speedups[ - "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time - avg_speedups[ - "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time + avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time + avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time + avg_speedups["vLLM CUTLASS vs vLLM Triton"] += ( + vllm_triton_time / vllm_cutlass_time + ) print("\n===== AVERAGE SPEEDUPS =====") speedup_headers = ["Comparison", "Speedup"] @@ -441,8 +421,7 @@ def run_benchmarks(verbose: bool = False): for result in all_results: for impl in implementations: - avg_diff[impl] += result["implementations"][impl]["diff"][ - "Reference"] + avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"] diff_headers = ["Implementation", "Avg Diff vs Reference"] diff_rows = [] diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 4bbb36bb4359..a9af811bbe9c 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses -from collections.abc import Iterable -from typing import Any, Callable, Optional +from collections.abc import Callable, Iterable +from typing import Any import torch import torch.utils.benchmark as TBenchmark @@ -55,7 +55,7 @@ def n_args(self): def __init__( self, - cuda_graph_params: Optional[CudaGraphBenchParams], + cuda_graph_params: CudaGraphBenchParams | None, label: str, sub_label: str, description: str, diff --git 
a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index a27f02394afb..9a057990bda5 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -95,4 +95,10 @@ ([2048, 2816], 1), ([1408, 2048], 0), ], + "CohereLabs/c4ai-command-a-03-2025": [ + ([12288, 14336], 1), + ([12288, 12288], 0), + ([12288, 73728], 1), + ([36864, 12288], 0), + ], } diff --git a/benchmarks/kv_cache/benchmark_block_pool.py b/benchmarks/kv_cache/benchmark_block_pool.py deleted file mode 100644 index 134551bb6128..000000000000 --- a/benchmarks/kv_cache/benchmark_block_pool.py +++ /dev/null @@ -1,108 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import gc -import time -from typing import Optional - -from tabulate import tabulate - -from vllm.utils import FlexibleArgumentParser -from vllm.v1.core.block_pool import BlockPool - - -class Metric: - def __init__(self) -> None: - self.cnt: int = 0 - self.sum_v: int = 0 - self.max_v: Optional[int] = None - - def update(self, v: int) -> None: - self.cnt += 1 - self.sum_v += v - if self.max_v is None: - self.max_v = v - else: - self.max_v = max(self.max_v, v) - - def avg_v(self) -> float: - return self.sum_v * 1.0 / self.cnt - - -def main(args): - rows = [] - for allocate_block in args.allocate_blocks: - # Enforce a GC collect ahead to minimize the impact among runs - gc.collect() - block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True) - - get_blocks_metric: Metric = Metric() - free_blocks_metric: Metric = Metric() - for _ in range(args.num_iteration): - t1 = time.monotonic_ns() - blocks = block_pool.get_new_blocks(allocate_block) - t2 = time.monotonic_ns() - block_pool.free_blocks(blocks) - t3 = time.monotonic_ns() - get_blocks_metric.update(t2 - t1) - free_blocks_metric.update(t3 - t2) - - if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None: - rows.append( - [ - get_blocks_metric.cnt, - args.num_gpu_blocks, - allocate_block, - get_blocks_metric.avg_v() / 1000000, - get_blocks_metric.max_v / 1000000.0, - free_blocks_metric.avg_v() / 1000000, - free_blocks_metric.max_v / 1000000.0, - ] - ) - else: - print( - "No valid metrics found." - f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}" - ) - - print( - tabulate( - rows, - headers=[ - "Iterations", - "Total\nBlocks", - "Allocated\nBlocks", - "Get Blocks\nAvg (ms)", - "Get Blocks\nMax (ms)", - "Free Blocks\nAvg (ms)", - "Free Blocks\nMax (ms)", - ], - tablefmt="grid", - floatfmt=".6f", - ) - ) - - -def invoke_main() -> None: - parser = FlexibleArgumentParser( - description="Benchmark the performance of BlockPool for KV Cache." 
- ) - parser.add_argument("--num-gpu-blocks", type=int, default=100000) - parser.add_argument( - "--num-iteration", - type=int, - default=1000, - help="Number of iterations to run to stablize final data readings", - ) - parser.add_argument( - "--allocate-blocks", - type=int, - nargs="*", - default=[10, 50, 100, 500, 1000], - help="Number of blocks to allocate", - ) - args = parser.parse_args() - main(args) - - -if __name__ == "__main__": - invoke_main() # pragma: no cover diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md new file mode 100644 index 000000000000..f5b5c6c97d48 --- /dev/null +++ b/benchmarks/multi_turn/README.md @@ -0,0 +1,174 @@ +# Benchmark KV Cache Offloading with Multi-Turn Conversations + +The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `requirements.txt` + +First start serving your model + +```bash +export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ + +vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests +``` + +The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface). + +## Synthetic Multi-Turn Conversations + +Download the following text file (used for generation of synthetic conversations) + +```bash +wget https://www.gutenberg.org/ebooks/1184.txt.utf-8 +mv 1184.txt.utf-8 pg1184.txt +``` + +The filename `pg1184.txt` is used in `generate_multi_turn.json` (see `"text_files"`). + +But you may use other text files if you prefer (using this specific file is not required). + +Then run the benchmarking script + +```bash +export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ + +python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \ +--input-file generate_multi_turn.json --num-clients 2 --max-active-conversations 6 +``` + +You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.). + +If successful, you will see the following output + +```bash +---------------------------------------------------------------------------------------------------- +Statistics summary: +runtime_sec = 215.810 +requests_per_sec = 0.769 +---------------------------------------------------------------------------------------------------- + count mean std min 25% 50% 75% 90% 99% max +ttft_ms 166.0 78.22 67.63 45.91 59.94 62.26 64.43 69.66 353.18 567.54 +tpot_ms 166.0 25.37 0.57 24.40 25.07 25.31 25.50 25.84 27.50 28.05 +latency_ms 166.0 2591.07 326.90 1998.53 2341.62 2573.01 2860.10 3003.50 3268.46 3862.94 +input_num_turns 166.0 7.43 4.57 1.00 3.00 7.00 11.00 13.00 17.00 17.00 +input_num_tokens 166.0 2006.20 893.56 522.00 1247.75 2019.00 2718.00 3233.00 3736.45 3899.00 +output_num_tokens 166.0 100.01 11.80 80.00 91.00 99.00 109.75 116.00 120.00 120.00 +output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 115.00 119.00 119.00 +---------------------------------------------------------------------------------------------------- +``` + +### JSON configuration file for synthetic conversations generation + +The input flag `--input-file` is used to determine the input conversations for the benchmark.
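+A minimal sketch of a synthetic-generation input file is shown below. The field names follow the sections documented in the rest of this section (and the required fields checked in `bench_dataset.py`); the values are placeholders only and are not the contents of `generate_multi_turn.json`:
+
+```json
+{
+    "filetype": "generate_conversations",
+    "num_conversations": 12,
+    "text_files": ["pg1184.txt"],
+    "print_stats": false,
+    "prompt_input": {
+        "num_turns": { "distribution": "uniform", "min": 12, "max": 18 },
+        "prefix_num_tokens": { "distribution": "lognormal", "average": 1000, "max": 5000 },
+        "num_tokens": { "distribution": "uniform", "min": 120, "max": 160 }
+    },
+    "prompt_output": {
+        "num_tokens": { "distribution": "constant", "value": 100 }
+    }
+}
+```
+
+Each section and the available distribution objects are explained below.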
+When the input is a JSON file with the field `"filetype": "generate_conversations"`, the tool will generate synthetic multi-turn conversations (questions and answers).
+
+The file `generate_multi_turn.json` is an example file.
+
+The file must contain the sections `prompt_input` and `prompt_output`.
+
+The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
+
+* `num_turns` - Number of total turns in the conversation (both user & assistant).
+The final value will always be rounded up to an even number, so that every user turn has a reply.
+* `prefix_num_tokens` - Tokens added at the start of the **first user turn** of each conversation (unique per conversation).
+* `num_tokens` - Total token length of each **user** message (one turn).
+
+The `prompt_output` section must contain `num_tokens`:
+
+* `num_tokens` - Total token length of each **assistant** message (one turn).
+
+### Random distributions for synthetic conversations generation
+
+When creating an input JSON file (such as `generate_multi_turn.json`),
+every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.
+The distribution determines how to randomly sample values for the field. + +The available distributions are listed below. + +**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.
+Can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`. + +#### constant + +```json +{ + "distribution": "constant", + "value": 500 +} +``` + +* `value` - the fixed integer value (always returns the same number). + +#### uniform + +```json +{ + "distribution": "uniform", + "min": 12, + "max": 18 +} +``` + +* `min` - minimum value (inclusive). +* `max` - maximum value (inclusive), should be equal or larger than min. + +#### lognormal + +```json +{ + "distribution": "lognormal", + "average": 1000, + "max": 5000 +} +``` + +You can parameterize the lognormal distribution in one of two ways: + +Using the average and optional median ratio: + +* `average` - target average value of the distribution. +* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1). + +Using the parameters of the underlying normal distribution: + +* `mean` - mean of the underlying normal distribution. +* `sigma` - standard deviation of the underlying normal distribution. + +#### zipf + +```json +{ + "distribution": "zipf", + "alpha": 1.2, + "max": 100 +} +``` + +* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers. + +#### poisson + +```json +{ + "distribution": "poisson", + "alpha": 10, + "max": 50 +} +``` + +* `alpha` - expected value (λ). Also the variance of the distribution. + +## ShareGPT Conversations + +To run with the ShareGPT data, download the following ShareGPT dataset: +`https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json` + +Use the `convert_sharegpt_to_openai.py` script to convert the dataset to a format supported by `benchmark_serving_multi_turn.py` + +```bash +python convert_sharegpt_to_openai.py sharegpt_20230401_clean_lang_split.json sharegpt_conv_128.json --seed=99 --max-items=128 +``` + +The script will convert the ShareGPT dataset to a dataset with the standard user/assistant roles. + +The flag `--max-items=128` is used to sample 128 conversations from the original dataset (change as needed). + +Use the output JSON file `sharegpt_conv_128.json` as the `--input-file` for `benchmark_serving_multi_turn.py`. 
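+Based on what `bench_dataset.py` expects from the converted file, each entry should look roughly like the sketch below (the conversation ID and message text are illustrative only):
+
+```json
+[
+    {
+        "id": "UzTK34D",
+        "messages": [
+            { "role": "user", "content": "First user question..." },
+            { "role": "assistant", "content": "First assistant answer..." },
+            { "role": "user", "content": "Follow-up user question..." }
+        ]
+    }
+]
+```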
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py new file mode 100644 index 000000000000..2674899d1cc5 --- /dev/null +++ b/benchmarks/multi_turn/bench_dataset.py @@ -0,0 +1,588 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from statistics import mean +from typing import Any, NamedTuple + +import numpy as np # type: ignore +import pandas as pd # type: ignore +from bench_utils import ( + TEXT_SEPARATOR, + Color, + logger, +) +from transformers import AutoTokenizer # type: ignore + +# Conversation ID is a string (e.g: "UzTK34D") +ConvId = str + +# A list of dicts (dicts with keys "id" and "messages") +ShareGptConversations = list[dict[str, Any]] + +# A list of dicts (dicts with keys "role" and "content") +MessagesList = list[dict[str, str]] + +# Map conversation ID to conversation messages +ConversationsMap = list[ConvId, MessagesList] + + +class Distribution(ABC): + @abstractmethod + def sample(self, size: int = 1) -> np.ndarray: + pass + + +class UniformDistribution(Distribution): + def __init__( + self, + min_val: int | float, + max_val: int | float, + is_integer: bool = True, + ) -> None: + self.min_val = min_val + self.max_val = max_val + self.is_integer = is_integer + + def sample(self, size: int = 1) -> np.ndarray: + if self.is_integer: + return np.random.randint( + int(self.min_val), int(self.max_val + 1), size=size + ) + else: + return np.random.uniform(self.min_val, self.max_val, size=size) + + def __repr__(self) -> str: + return f"UniformDistribution[{self.min_val}, {self.max_val}]" + + +class ConstantDistribution(Distribution): + def __init__(self, value: int | float) -> None: + self.value = value + self.max_val = value + + def sample(self, size: int = 1) -> np.ndarray: + return np.full(shape=size, fill_value=self.value) + + def __repr__(self) -> str: + return f"Constant[{self.value}]" + + +class ZipfDistribution(Distribution): + def __init__(self, alpha: float, max_val: int | None = None) -> None: + self.alpha = alpha + self.max_val = max_val + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.zipf(self.alpha, size=size) + if self.max_val: + samples = np.minimum(samples, self.max_val) + return samples + + def __repr__(self) -> str: + return f"ZipfDistribution[{self.alpha}]" + + +class PoissonDistribution(Distribution): + def __init__(self, alpha: float, max_val: int | None = None) -> None: + self.alpha = alpha + self.max_val = max_val + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.poisson(self.alpha, size=size) + if self.max_val: + samples = np.minimum(samples, self.max_val) + return samples + + def __repr__(self) -> str: + return f"PoissonDistribution[{self.alpha}]" + + +class LognormalDistribution(Distribution): + def __init__( + self, + mean: float | None = None, + sigma: float | None = None, + average: int | None = None, + median_ratio: float | None = None, + max_val: int | None = None, + ) -> None: + self.average = average + self.median_ratio = median_ratio + self.max_val = max_val + + if average is not None: + if average < 1: + raise ValueError("Lognormal average must be positive") + + if mean or sigma: + raise ValueError( + "When using lognormal average, you can't provide mean/sigma" + ) + + if self.median_ratio is None: + # Default value that provides relatively wide range of values + self.median_ratio = 0.85 + + # Calculate mean/sigma of np.random.lognormal based on the average + 
mean, sigma = self._generate_lognormal_by_median( + target_average=self.average, median_ratio=self.median_ratio + ) + else: + if mean is None or sigma is None: + raise ValueError( + "Must provide both mean and sigma if average is not used" + ) + + if mean <= 0 or sigma < 0: + raise ValueError( + "Lognormal mean must be positive and sigma must be non-negative" + ) + + # Mean and standard deviation of the underlying normal distribution + # Based on numpy.random.lognormal + self.mean = mean + self.sigma = sigma + + @staticmethod + def _generate_lognormal_by_median( + target_average: int, median_ratio: float + ) -> tuple[float, float]: + """ + Compute (mu, sigma) for a lognormal distribution given: + - a target average (mean of the distribution) + - a ratio of median / mean (controls skewness), assume mean > median + + Background: + If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma). + * mean(X) = exp(mu + sigma^2 / 2) + * median(X) = exp(mu) + + So: + median / mean = exp(mu) / exp(mu + sigma^2 / 2) + = exp(-sigma^2 / 2) + + Rearranging: + sigma^2 = 2 * ln(mean / median) + mu = ln(median) + + This gives a unique (mu, sigma) for any valid mean and median. + """ + # Check input validity: median must be smaller than mean + if median_ratio <= 0 or median_ratio >= 1: + raise ValueError("median_ratio must be in range (0, 1)") + + target_median = target_average * median_ratio + + # Solve sigma^2 = 2 * ln(mean / median) + sigma = np.sqrt(2 * np.log(target_average / target_median)) + mu = np.log(target_median) + + return mu, sigma + + def sample(self, size: int = 1) -> np.ndarray: + samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size) + + if self.average is not None: + # Scale to average + samples *= self.average / samples.mean() + + if self.max_val: + samples = np.minimum(samples, self.max_val) + + return np.round(samples).astype(int) + + def __repr__(self) -> str: + if self.average: + return ( + f"LognormalDistribution[{self.average}, " + f"{self.median_ratio}, {self.max_val}]" + ) + return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]" + + +class GenConvArgs(NamedTuple): + num_conversations: int + text_files: list[str] + input_num_turns: Distribution + input_common_prefix_num_tokens: Distribution + input_prefix_num_tokens: Distribution + input_num_tokens: Distribution + output_num_tokens: Distribution + print_stats: bool + + +def verify_field_exists( + conf: dict, field_name: str, section: str, subsection: str +) -> None: + if field_name not in conf: + raise ValueError( + f"Missing field '{field_name}' in {section=} and {subsection=}" + ) + + +def get_random_distribution( + conf: dict, section: str, subsection: str, optional: bool = False +) -> Distribution: + # section can be "prompt_input" or "prompt_output" (both required) + conf = conf[section] + + if optional and subsection not in conf: + # Optional subsection, if not found assume the value is always 0 + return ConstantDistribution(0) + + # subsection can be "num_turns", "num_tokens" or "prefix_num_tokens" + if subsection not in conf: + raise ValueError(f"Missing subsection {subsection} in section {section}") + + conf = conf[subsection] + + distribution = conf.get("distribution") + if distribution is None: + raise ValueError( + f"Missing field 'distribution' in {section=} and {subsection=}" + ) + + if distribution == "constant": + verify_field_exists(conf, "value", section, subsection) + return ConstantDistribution(conf["value"]) + + elif distribution == "zipf": + 
verify_field_exists(conf, "alpha", section, subsection) + max_val = conf.get("max", None) + return ZipfDistribution(conf["alpha"], max_val=max_val) + + elif distribution == "poisson": + verify_field_exists(conf, "alpha", section, subsection) + max_val = conf.get("max", None) + return PoissonDistribution(conf["alpha"], max_val=max_val) + + elif distribution == "lognormal": + max_val = conf.get("max", None) + + if "average" in conf: + # Infer lognormal mean/sigma (numpy) from input average + median_ratio = conf.get("median_ratio", None) + return LognormalDistribution( + average=conf["average"], median_ratio=median_ratio, max_val=max_val + ) + + # Use mean/sigma directly (for full control over the distribution) + verify_field_exists(conf, "mean", section, subsection) + verify_field_exists(conf, "sigma", section, subsection) + return LognormalDistribution( + mean=conf["mean"], sigma=conf["sigma"], max_val=max_val + ) + + elif distribution == "uniform": + verify_field_exists(conf, "min", section, subsection) + verify_field_exists(conf, "max", section, subsection) + + min_value = conf["min"] + max_value = conf["max"] + + assert min_value > 0 + assert min_value <= max_value + + is_integer = isinstance(min_value, int) and isinstance(max_value, int) + return UniformDistribution(min_value, max_value, is_integer) + else: + raise ValueError(f"Unknown distribution: {distribution}") + + +def parse_input_json_file(conf: dict) -> GenConvArgs: + # Validate the input file + assert isinstance(conf, dict) + required_fields = [ + "filetype", + "num_conversations", + "text_files", + "prompt_input", + "prompt_output", + ] + for field in required_fields: + assert field in conf, f"Missing field {field} in input {conf}" + + assert conf["filetype"] == "generate_conversations" + + assert conf["num_conversations"] > 0, "num_conversations should be larger than zero" + + text_files = conf["text_files"] + + assert isinstance(text_files, list), "Field 'text_files' should be a list" + assert len(text_files) > 0, ( + "Field 'text_files' should be a list with at least one file" + ) + + # Parse the parameters for the prompt input/output workload + input_num_turns = get_random_distribution(conf, "prompt_input", "num_turns") + input_num_tokens = get_random_distribution(conf, "prompt_input", "num_tokens") + input_common_prefix_num_tokens = get_random_distribution( + conf, "prompt_input", "common_prefix_num_tokens", optional=True + ) + input_prefix_num_tokens = get_random_distribution( + conf, "prompt_input", "prefix_num_tokens" + ) + output_num_tokens = get_random_distribution(conf, "prompt_output", "num_tokens") + + print_stats: bool = conf.get("print_stats", False) + assert isinstance(print_stats, bool), ( + "Field 'print_stats' should be either 'true' or 'false'" + ) + + args = GenConvArgs( + num_conversations=conf["num_conversations"], + text_files=text_files, + input_num_turns=input_num_turns, + input_common_prefix_num_tokens=input_common_prefix_num_tokens, + input_prefix_num_tokens=input_prefix_num_tokens, + input_num_tokens=input_num_tokens, + output_num_tokens=output_num_tokens, + print_stats=print_stats, + ) + return args + + +def print_conv_stats(conversations: ConversationsMap, tokenizer: AutoTokenizer) -> None: + # Collect statistics + conv_stats: list[dict[Any, Any]] = [] + req_stats: list[int] = [] + + print("\nCollecting statistics...") + for messages in conversations.values(): + # messages is a list of dicts + user_tokens: list[int] = [] + assistant_tokens: list[int] = [] + request_tokens: list[int] = [] + + 
req_tokens = 0 + for m in messages: + content = m["content"] + num_tokens = len(tokenizer(content).input_ids) + + if m["role"] == "user": + user_tokens.append(num_tokens) + # New user prompt including all chat history + req_tokens += num_tokens + request_tokens.append(req_tokens) + + elif m["role"] == "assistant": + assistant_tokens.append(num_tokens) + # Update assistant answer + # (will be part of chat history for the next user prompt) + req_tokens += num_tokens + + item_stats = { + "conversation_turns": len(messages), + "user_tokens": mean(user_tokens), + "assistant_tokens": mean(assistant_tokens), + } + + conv_stats.append(item_stats) + req_stats.extend(request_tokens) + + # Print statistics + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99] + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + df = pd.DataFrame(conv_stats) + print(df.describe(percentiles=percentiles).transpose()) + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Request statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + df = pd.DataFrame(req_stats, columns=["request_tokens"]) + print(df.describe(percentiles=percentiles).transpose()) + print(TEXT_SEPARATOR) + + +def generate_conversations( + args: GenConvArgs, tokenizer: AutoTokenizer +) -> ConversationsMap: + # Text for all user prompts + # (text from the input text files will be appended to this line) + base_prompt_text = "Please rewrite the following text and add more content: " + base_prompt_token_count = len( + tokenizer.encode(base_prompt_text, add_special_tokens=False) + ) + + logger.info(f"{Color.PURPLE}Generating conversations...{Color.RESET}") + logger.info(args) + + list_of_tokens = [] + + for filename in args.text_files: + # Load text file that will be used to generate prompts + with open(filename) as file: + data = file.read() + tokens_in_file = tokenizer.encode(data, add_special_tokens=False) + list_of_tokens.extend(tokens_in_file) + + conversations: ConversationsMap = {} + conv_id = 0 + + # Generate number of turns for every conversation + turn_count: np.ndarray = args.input_num_turns.sample(args.num_conversations) + + # Turn count should be at least 2 (one user prompt and one assistant answer) + turn_count = np.maximum(turn_count, 2) + + # Round up to an even number (every user prompt should have an answer) + turn_count = turn_count + (turn_count % 2) + + # Generate number of prefix tokens for every conversation + conv_prefix_tokens: np.ndarray = args.input_prefix_num_tokens.sample( + args.num_conversations + ) + + # Used to reduce shared text between conversations + # (jump/skip over text sections between conversations) + base_offset = 0 + + # Common prefix size for all conversations (only 1 sample required) + common_prefix_text = "" + common_prefix_tokens: int = args.input_common_prefix_num_tokens.sample(1)[0] + if common_prefix_tokens > 0: + # Using "." at the end to separate sentences + common_prefix_text = ( + tokenizer.decode(list_of_tokens[: common_prefix_tokens - 2]) + "." 
+ ) + base_offset += common_prefix_tokens + + for conv_id in range(args.num_conversations): + # Generate a single conversation + messages: MessagesList = [] + + nturns = turn_count[conv_id] + + # User prompt token count per turn (with lower limit) + input_token_count: np.ndarray = args.input_num_tokens.sample(nturns) + input_token_count = np.maximum(input_token_count, base_prompt_token_count) + + # Assistant answer token count per turn (with lower limit) + output_token_count: np.ndarray = args.output_num_tokens.sample(nturns) + output_token_count = np.maximum(output_token_count, 1) + + user_turn = True + for turn_id in range(nturns): + if user_turn: + role = "user" + num_tokens = input_token_count[turn_id] + + # Generate the user prompt, + # use a unique prefix (the conv_id) for each conversation + # (to avoid shared prefix between conversations) + content = f"{conv_id} is a nice number... " + + if len(common_prefix_text) > 0 and turn_id == 0: + content = common_prefix_text + content + + # Update the number of tokens left for the content + num_tokens -= len(tokenizer.encode(content, add_special_tokens=False)) + + if turn_id == 0: + prefix_num_tokens = conv_prefix_tokens[conv_id] + if prefix_num_tokens > 0: + # Add prefix text (context) to the first turn + start_offset = base_offset + end_offset = start_offset + prefix_num_tokens + assert len(list_of_tokens) > end_offset, ( + "Not enough input text to generate " + f"{prefix_num_tokens} tokens for the " + f"prefix text ({start_offset=}, {end_offset=})" + ) + + content += f"{conv_id}, " + tokenizer.decode( + list_of_tokens[start_offset:end_offset] + ) + base_offset += prefix_num_tokens + + # Add the actual user prompt/question after the prefix text + content += base_prompt_text + num_tokens -= base_prompt_token_count + + if num_tokens > 0: + # Add text from the input file (to reach the desired token count) + start_offset = base_offset + turn_id * input_token_count.max() + end_offset = start_offset + num_tokens + assert len(list_of_tokens) > end_offset, ( + f"Not enough input text to generate {num_tokens} tokens " + f"for the prompt ({start_offset=}, {end_offset=})" + ) + + # Convert tokens back to text + content += tokenizer.decode(list_of_tokens[start_offset:end_offset]) + else: + role = "assistant" + # This content will not be used as input to the LLM server + # (actual answers will be used instead). + # Content is only required to determine the min_tokens/max_tokens + # (inputs to the LLM server). 
+ num_tokens = output_token_count[turn_id] + assert len(list_of_tokens) > num_tokens, ( + f"Not enough input text to generate {num_tokens} " + "tokens for assistant content" + ) + content = tokenizer.decode(list_of_tokens[:num_tokens]) + + # Append the user/assistant message to the list of messages + messages.append({"role": role, "content": content}) + user_turn = not user_turn + + # Add the new conversation + conversations[f"CONV_ID_{conv_id}"] = messages + + # Increase base offset for the next conversation + base_offset += nturns + + if args.print_stats: + print_conv_stats(conversations, tokenizer) + + return conversations + + +def conversations_list_to_dict(input_list: ShareGptConversations) -> ConversationsMap: + conversations: ConversationsMap = {} + + for item in input_list: + conv_id: str = item["id"] + assert isinstance(conv_id, str) + + assert conv_id not in conversations, ( + f"Conversation ID {conv_id} found more than once in the input" + ) + + messages: MessagesList = item["messages"] + assert isinstance(messages, list), ( + f"Conversation messages should be a list (ID: {conv_id})" + ) + assert len(messages) > 0, f"Conversation with no messages (ID: {conv_id})" + + conversations[conv_id] = messages + + logger.info(f"Using {len(conversations)} unique conversations (IDs)") + assert len(conversations) == len(input_list) + + # Print statistics about the selected conversations + stats: list[dict[str, Any]] = [] + for conv_data in conversations.values(): + stats.append({"num_turns": len(conv_data)}) + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}") + print(TEXT_SEPARATOR) + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999] + conv_stats = pd.DataFrame(stats).describe(percentiles=percentiles) + print(conv_stats.transpose()) + print(TEXT_SEPARATOR) + + return conversations + + +def conversations_dict_to_list(input_dict: ConversationsMap) -> ShareGptConversations: + output: ShareGptConversations = [] + for conv_id, conv_data in input_dict.items(): + new_item = {"id": conv_id, "messages": conv_data} + output.append(new_item) + + return output diff --git a/benchmarks/multi_turn/bench_utils.py b/benchmarks/multi_turn/bench_utils.py new file mode 100644 index 000000000000..e959a4be711c --- /dev/null +++ b/benchmarks/multi_turn/bench_utils.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging +from enum import Enum + + +class Color(Enum): + RED = "\033[91m" + GREEN = "\033[92m" + BLUE = "\033[94m" + PURPLE = "\033[95m" + CYAN = "\033[96m" + YELLOW = "\033[93m" + RESET = "\033[0m" + + def __str__(self): + return self.value + + +TEXT_SEPARATOR = "-" * 100 + +# Configure the logger +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] - %(message)s", + datefmt="%d-%m-%Y %H:%M:%S", +) +logger = logging.getLogger(__name__) diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py new file mode 100644 index 000000000000..67a085b40ed3 --- /dev/null +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -0,0 +1,1561 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import asyncio +import json +import logging +import multiprocessing as mp +import os +import random +import time +from collections import Counter, deque +from datetime import datetime +from enum import Enum +from http 
import HTTPStatus +from statistics import mean +from typing import NamedTuple + +import aiohttp # type: ignore +import numpy as np # type: ignore +import pandas as pd # type: ignore +from bench_dataset import ( + ConversationsMap, + ConvId, + GenConvArgs, + MessagesList, + ShareGptConversations, + conversations_dict_to_list, + conversations_list_to_dict, + generate_conversations, + parse_input_json_file, +) +from bench_utils import TEXT_SEPARATOR, Color, logger +from transformers import AutoTokenizer # type: ignore + +NUM_TOKENS_FROM_DATASET = 0 +TERM_SIGNAL = None + + +class ConversationSampling(str, Enum): + ROUND_ROBIN = "round_robin" + RANDOM = "random" + + def __str__(self): + return self.value + + +class ClientArgs(NamedTuple): + seed: int + max_num_requests: int | None + skip_first_turn: bool + max_turns: int | None + max_active_conversations: int + verbose: bool + print_content: bool + verify_output: bool + conversation_sampling: ConversationSampling + request_rate: float + + +class RequestArgs(NamedTuple): + chat_url: str + model: str + stream: bool + limit_min_tokens: int # Use negative value for no limit + limit_max_tokens: int # Use negative value for no limit + + +class BenchmarkArgs(NamedTuple): + url: str + num_clients: int + early_stop: bool + + +class ServerResponse(NamedTuple): + valid: bool + ttft_ms: float # time to first chunk + tpot_ms: float # time per output chunk (one or more tokens) + latency_ms: float + start_time_ms: float + first_chunk: str # first chunk of the content + content: str # includes the first_chunk + num_chunks: int + + def __str__(self) -> str: + return f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}" # noqa: E501 + + +class RequestStats(NamedTuple): + ttft_ms: float + tpot_ms: float + latency_ms: float + start_time_ms: float + input_num_turns: int + input_num_tokens: int + output_num_tokens: int + output_num_chunks: int + output_num_first_chunk_tokens: int + approx_cached_percent: float + conversation_id: str + client_id: int + + def __str__(self) -> str: + return ( + f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}, input_num_tokens {self.input_num_tokens}, " # noqa: E501 + f"output_num_tokens {self.output_num_tokens} ({self.output_num_chunks} chunks, {self.output_num_first_chunk_tokens} tokens in first chunk), " # noqa: E501 + f"approx_cached_percent {self.approx_cached_percent:.2f}%" + ) + + +class MetricStats: + def __init__(self) -> None: + self.min: float | None = None + self.max: float | None = None + self.avg: float | None = None + self.sum = 0.0 + self.count = 0 + + def update(self, value: float) -> None: + if self.min is None: + self.min = value + else: + self.min = min(self.min, value) + + if self.max is None: + self.max = value + else: + self.max = max(self.max, value) + + self.sum += value + self.count += 1 + self.avg = self.sum / self.count + + def __repr__(self) -> str: + if self.count == 0: + return "no data" + return f"avg: {self.avg:>10.3f}, min: {self.min:>10.3f}, max: {self.max:>10.3f}" + + +class MovingAverage: + def __init__(self, window_size: int) -> None: + self.window_size = window_size + self.window = np.zeros(window_size) + self.index = 0 + self.sum = 0.0 + self.count = 0 + self.avg: float | None = None + + def update(self, new_value: float) -> None: + if self.count < self.window_size: + # Filling up the window + self.sum += new_value + self.window[self.count] = new_value + self.count += 1 + else: + # Window is full, start replacing old 
values + old_value = self.window[self.index] + self.sum = self.sum - old_value + new_value + self.window[self.index] = new_value + self.index = (self.index + 1) % self.window_size + + self.avg = self.sum / self.count + + def __repr__(self) -> str: + if self.count == 0: + return "no data" + return f"avg: {self.avg:>10.3f} ({self.count} samples)" + + +class DebugStats: + def __init__(self, logger: logging.Logger, window_size: int) -> None: + self.logger = logger + self.metrics: dict[str, MovingAverage | MetricStats] = { + "moving_avg_ttft_ms": MovingAverage(window_size), + "moving_avg_tpot_ms": MovingAverage(window_size), + "ttft_ms": MetricStats(), + "tpot_ms": MetricStats(), + "latency_ms": MetricStats(), + "input_num_turns": MetricStats(), + "input_num_tokens": MetricStats(), + "output_num_tokens": MetricStats(), + } + + def update(self, data: RequestStats) -> None: + self.metrics["ttft_ms"].update(data.ttft_ms) + self.metrics["moving_avg_ttft_ms"].update(data.ttft_ms) + self.metrics["tpot_ms"].update(data.tpot_ms) + self.metrics["moving_avg_tpot_ms"].update(data.tpot_ms) + self.metrics["latency_ms"].update(data.latency_ms) + self.metrics["input_num_turns"].update(data.input_num_turns) + self.metrics["input_num_tokens"].update(data.input_num_tokens) + self.metrics["output_num_tokens"].update(data.output_num_tokens) + + def print(self) -> None: + self.logger.info("-" * 50) + for k, v in self.metrics.items(): + kv_info = f"[{k:25}] {v}" + self.logger.info(kv_info) + self.logger.info("-" * 50) + + +def nanosec_to_millisec(value: float) -> float: + return value / 1000000.0 + + +def nanosec_to_sec(value: float) -> float: + return value / 1000000000.0 + + +async def send_request( + session: aiohttp.ClientSession, + messages: list[dict[str, str]], + chat_url: str, + model: str, + stream: bool = True, + min_tokens: int | None = None, + max_tokens: int | None = None, +) -> ServerResponse: + payload = { + "model": model, + "messages": messages, + "seed": 0, + "temperature": 0.0, + } + + if stream: + payload["stream"] = True + payload["stream_options"] = {"include_usage": False} + + if min_tokens is not None: + payload["min_tokens"] = min_tokens + + if max_tokens is not None: + payload["max_tokens"] = max_tokens + + headers = {"Content-Type": "application/json"} + + # Calculate the timeout for the request + timeout_sec = 120 + if max_tokens is not None: + # Assume TPOT of 200ms and use max_tokens to determine timeout + timeout_sec = max(timeout_sec, int(max_tokens * 0.2)) + timeout = aiohttp.ClientTimeout(total=timeout_sec) + + valid_response = True + ttft: float | None = None + chunk_delay: list[int] = [] + latency: float | None = None + first_chunk = "" + generated_text = "" + + start_time: int = time.perf_counter_ns() + most_recent_timestamp: int = start_time + + async with session.post( + url=chat_url, json=payload, headers=headers, timeout=timeout + ) as response: + http_status = HTTPStatus(response.status) + if http_status == HTTPStatus.OK: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") + if chunk == "[DONE]": + # End of stream + latency = time.perf_counter_ns() - start_time + elif stream is False: + data = json.loads(chunk) + message = data["choices"][0]["message"] + assert message["role"] == "assistant" + generated_text += message["content"] + else: + timestamp: int = time.perf_counter_ns() + data = json.loads(chunk) + + # Delta is the new content/text/data + 
delta = data["choices"][0]["delta"] + if delta.get("content", None): + if ttft is None: + # First token + first_token_time = time.perf_counter_ns() + ttft = first_token_time - start_time + first_chunk = delta["content"] + else: + # Decoding phase + chunk_delay.append(timestamp - most_recent_timestamp) + + generated_text += delta["content"] + + most_recent_timestamp = timestamp + else: + valid_response = False + content = await response.text() + logger.warning( + f"{Color.YELLOW}Received HTTP status {http_status.value} " + f"({http_status.phrase}): {content}{Color.RESET}" + ) + + if latency is None: + latency = -1.0 + if valid_response: + # Streaming is disabled, latency was not set + latency = time.perf_counter_ns() - start_time + + if ttft is None: + # The response was a single chunk + ttft = latency + + # Each chunk may include more than one token + tpot: float = mean(chunk_delay) if len(chunk_delay) > 0 else 0.0 + num_chunks: int = len(chunk_delay) + + sr = ServerResponse( + valid=valid_response, + ttft_ms=nanosec_to_millisec(ttft) if ttft > 0.0 else -1.0, + tpot_ms=nanosec_to_millisec(tpot), + latency_ms=nanosec_to_millisec(latency), + start_time_ms=nanosec_to_millisec(start_time), + first_chunk=first_chunk, + content=generated_text, + num_chunks=num_chunks, + ) + return sr + + +def get_short_string(input: str) -> str: + n = 20 + if len(input) < 400: + return input + + return f"{input[:n]}...{input[-n:]}" + + +def get_token_count(tokenizer: AutoTokenizer, text: str) -> int: + return len(tokenizer(text, add_special_tokens=False).input_ids) + + +def get_messages_token_count( + tokenizer: AutoTokenizer, messages: list[dict[str, str]] +) -> int: + token_count = 0 + for m in messages: + token_count += get_token_count(tokenizer, m["content"]) + + return token_count + + +async def send_turn( + session: aiohttp.ClientSession, + client_id: int, + conv_id: str, + conversation_messages: MessagesList, + messages_to_use: int, + tokenizer: AutoTokenizer, + req_args: RequestArgs, + verbose: bool, + verify_output: bool, +) -> RequestStats | None: + assert messages_to_use > 0 + assert messages_to_use <= len(conversation_messages) + + messages = conversation_messages[:messages_to_use] + + # Index of the next message (the role should be "user") + index = messages_to_use - 1 + + # Verify that the message has only two keys, "role" and "content" + assert len(messages[index].keys()) == 2 + assert "role" in messages[index] and "content" in messages[index] + assert messages[index]["role"] == "user", ( + f"Failed on conversation ID {conv_id}, message role should be user" + ) + + if verbose: + print( + f"{Color.CYAN}Messages (conversation ID {conv_id}," + f" {len(messages)} turns):{Color.RESET}", + messages, + ) + + # None means that there is no upper/lower limit for the output token count + min_tokens = None if req_args.limit_min_tokens < 0 else req_args.limit_min_tokens + max_tokens = None if req_args.limit_max_tokens < 0 else req_args.limit_max_tokens + + if len(conversation_messages) > messages_to_use: + # The conversation contains an assistant answer for the next user prompt + if ( + min_tokens == NUM_TOKENS_FROM_DATASET + or max_tokens == NUM_TOKENS_FROM_DATASET + ): + # Compute number of tokens in the answer (from the input conversation) + assistant_answer = conversation_messages[messages_to_use] + answer_num_tokens = get_token_count(tokenizer, assistant_answer["content"]) + assert assistant_answer["role"] == "assistant" + + if min_tokens == NUM_TOKENS_FROM_DATASET: + min_tokens = max(1, 
answer_num_tokens) + + if max_tokens == NUM_TOKENS_FROM_DATASET: + max_tokens = max(1, answer_num_tokens) + + # Send the current conversation to LLM and get a response + response: ServerResponse = await send_request( + session, + messages, + req_args.chat_url, + req_args.model, + req_args.stream, + min_tokens, + max_tokens, + ) + + if response.valid is False: + # Request failed + return None + + # Compute number of tokens in input / output + input_num_tokens = get_messages_token_count(tokenizer, messages) + + # Num tokens in the user's last question + question_num_tokens = get_token_count(tokenizer, messages[index]["content"]) + + # Num tokens in the history/context of the question + assert input_num_tokens >= question_num_tokens + history_num_tokens = input_num_tokens - question_num_tokens + + # Num tokens in the LLM's answer (first chunk and full answer) + first_chunk_tokens = get_token_count(tokenizer, response.first_chunk) + + output_content = response.content + output_num_tokens = get_token_count(tokenizer, output_content) + + # Prefix caching approximated cached percent + approx_cached_percent = ( + 100.0 * (history_num_tokens / input_num_tokens) if input_num_tokens > 0 else 0.0 + ) + + # Compute the correct TTFT and TPOT (based on tokens and not chunks). + # Required because multiple output tokens may be bundled in a single chunk. + if output_num_tokens > 1 and output_num_tokens > first_chunk_tokens: + # More than one token and more than one chunk in the output + decode_ms = response.latency_ms - response.ttft_ms + decode_num_tokens = output_num_tokens - first_chunk_tokens + tpot_ms = decode_ms / decode_num_tokens + else: + # In this case: output_num_tokens == first_chunk_tokens + # Output was a single chunk (output_num_tokens > 1) + # or even a single token (output_num_tokens == 1) + tpot_ms = 0.0 + + if first_chunk_tokens > 1: + # First chunk had multiple tokens, adjust TTFT for a single token + delta_ms = (first_chunk_tokens - 1) * tpot_ms + ttft_ms = max(0.1, response.ttft_ms - delta_ms) + else: + # First chunk had only one token + ttft_ms = response.ttft_ms + + rs = RequestStats( + ttft_ms=ttft_ms, + tpot_ms=tpot_ms, + latency_ms=response.latency_ms, + start_time_ms=response.start_time_ms, + input_num_turns=len(messages), + input_num_tokens=input_num_tokens, + output_num_tokens=output_num_tokens, + output_num_chunks=response.num_chunks, + output_num_first_chunk_tokens=first_chunk_tokens, + approx_cached_percent=approx_cached_percent, + conversation_id=conv_id, + client_id=client_id, + ) + + if verbose: + print( + f"\n{Color.YELLOW}Response ({output_num_tokens} tokens):{Color.RESET}", + output_content, + ) + print(f"{Color.YELLOW}Response metrics: {rs}{Color.RESET}") + print("-" * 70) + + # Save the LLM's answer (will be used as part of the context for the next user turn) + answer_index = messages_to_use + if len(conversation_messages) > answer_index: + assert conversation_messages[answer_index]["role"] == "assistant", ( + f"Failed on conversation ID {conv_id}, message role should be assistant" + ) + + orig_content = conversation_messages[answer_index]["content"] + if verify_output: + # Compare the new answer to the answer from the input file + debug_info = ( + f"LLM/dataset answers do not match ({conv_id}):" + f"\n'{get_short_string(output_content)}' (len: {len(output_content)})," + f"\n'{get_short_string(orig_content)}' (len: {len(orig_content)})" + ) + if orig_content != output_content: + raise ValueError(debug_info) + + # Update the answer + 
conversation_messages[answer_index]["content"] = output_content + else: + # A user prompt that has no answer, add the answer as a new message + new_answer = {"role": "assistant", "content": output_content} + conversation_messages.append(new_answer) + + return rs + + +async def poisson_sleep(request_rate: float, verbose: bool = False) -> None: + # Generate a random time interval from the Poisson distribution + assert request_rate > 0 + + interval = np.random.exponential(1.0 / request_rate) + if verbose: + logger.info(f"Sleeping for {interval:.3f} seconds...") + await asyncio.sleep(interval) + + +async def client_main( + args: ClientArgs, + req_args: RequestArgs, + client_id: int, + tokenizer: AutoTokenizer, + stop_event: mp.Event, # type: ignore + task_queue: mp.Queue, + result_queue: mp.Queue, + conv_queue: mp.Queue, +) -> None: + logger.info( + f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501 + ) + + random.seed(args.seed) + np.random.seed(args.seed) + + # Active conversations + active_convs: ConversationsMap = {} + conv_id_queue: deque = deque(maxlen=args.max_active_conversations) + + # Keep track of how many messages have been used for each conversation + turns_count: Counter = Counter() + num_successes = 0 + num_failures = 0 + + # Track the timestamp (time.perf_counter()) + # of the last turn per conversation (only for debug) + time_of_last_turn: dict[ConvId, float] = {} + + # Flag that indicates that there are no new tasks (conversations) for the client + task_queue_empty = False + + async with aiohttp.ClientSession() as session: + # Print progress + + while task_queue_empty is False: + result = None + + if ( + args.max_num_requests + and num_successes + num_failures == args.max_num_requests + ): + logger.info( + f"{Color.YELLOW}Client {client_id} reached " + f"request limit{Color.RESET}" + ) + break + + if stop_event.is_set(): # type: ignore + logger.info( + f"{Color.YELLOW}Client {client_id} received " + f"a termination signal{Color.RESET}" + ) + break + + while ( + len(active_convs) < args.max_active_conversations + and task_queue_empty is False + ): + # Get a new conversation from the task queue + conv_id, messages = task_queue.get() + + if conv_id is TERM_SIGNAL: + task_queue_empty = True + break + + if args.skip_first_turn: + # Skip the first turn (both user and assistant), + # relevant if warmup was enabled. + # Default turns_count[conv_id] will be zero if conv_id + # was never inserted/updated in turns_count. 
+ turns_count[conv_id] += 2 + + if turns_count[conv_id] < len(messages): + # Add new conversation + active_convs[conv_id] = messages + conv_id_queue.append(conv_id) + + if args.verbose: + logger.info( + f"{Color.GREEN}Client {client_id} will use conversation ID {conv_id} (active conversations {len(active_convs)}){Color.RESET}" # noqa: E501 + ) + + elif args.verbose: + # No more messages (conversation finished during the warmup) + logger.info( + f"{Color.YELLOW}Client {client_id} will not use conversation ID {conv_id} (all {len(messages)} messages already sent){Color.RESET}" # noqa: E501 + ) + + if len(active_convs) == 0 or task_queue_empty: + logger.info( + f"{Color.YELLOW}Client {client_id} has no more work{Color.RESET}" + ) + break + + # Pick an active conversation for the next request + if args.conversation_sampling == ConversationSampling.ROUND_ROBIN: + conv_id = conv_id_queue.pop() + else: + # ConversationSampling.RANDOM + active_ids = list(active_convs.keys()) + conv_id = random.choice(active_ids) + + messages = active_convs[conv_id] + assert isinstance(messages, list) and len(messages) > 0 + + # Update the amount of messages to use + turns_count[conv_id] += 1 + current_turn = turns_count[conv_id] + + assert current_turn < len(messages), ( + f"Turn number {current_turn} is invalid for conversation ID {conv_id}" + f" that has only {len(messages)} messages" + ) + + if args.verbose: + curr_time_sec: float = time.perf_counter() + time_since_last_turn: str | float = "N/A" + if conv_id in time_of_last_turn: + time_since_last_turn = round( + curr_time_sec - time_of_last_turn[conv_id], 3 + ) + logger.info( + f"Client {client_id} using conversation ID {conv_id} (turn: {current_turn}, time since last turn [sec]: {time_since_last_turn})" # noqa: E501 + ) + time_of_last_turn[conv_id] = curr_time_sec + + success = True + try: + result = await send_turn( + session, + client_id, + conv_id, + messages, + current_turn, + tokenizer, + req_args, + args.print_content, + args.verify_output, + ) + if result is not None: + result_queue.put(result) + else: + # None means that the request failed, + # and should not be added to the statistics. 
+ success = False + num_failures += 1 + + logger.warning( + f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + + # Remove the conversation (should not be used again) + active_convs.pop(conv_id) + + except asyncio.exceptions.TimeoutError: + num_failures += 1 + logger.exception( + f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + break # Exit gracefully instead of raising an error + + except Exception: + num_failures += 1 + logger.exception( + f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + break # Exit gracefully instead of raising an error + + if success: + num_successes += 1 + + # Update the turns counter to include the LLM response + # The LLM response will be used as context for the next user turn + turns_count[conv_id] += 1 + + max_turns = len(messages) + if args.max_turns is not None: + # Limit the number of turns in the conversation + max_turns = min(args.max_turns, max_turns) + + if turns_count[conv_id] >= max_turns: + # Conversation has no more turns (no longer active) + # save the updated conversation (with the LLM server's answer) + conv_queue.put((conv_id, active_convs.pop(conv_id))) + if args.verbose: + logger.info( + f"{Color.GREEN}Client {client_id} finished " + f"conversation ID {conv_id}{Color.RESET}" + ) + else: + # Conversation is not finished, insert it at the back of the queue + conv_id_queue.appendleft(conv_id) + + # Sleep between requests (if lambda is positive) + if args.request_rate > 0: + await poisson_sleep(args.request_rate, args.verbose) + + # Send indication that the client is done + conv_queue.put((TERM_SIGNAL, TERM_SIGNAL)) + + logger.info( + f"{Color.CYAN}Client {client_id} is done " + f"({num_successes=}, {num_failures=}){Color.RESET}" + ) + + +def worker_function( + client_id: int, + tokenizer: AutoTokenizer, + client_args: ClientArgs, + req_args: RequestArgs, + stop_event: mp.Event, # type: ignore + task_queue: mp.Queue, + result_queue: mp.Queue, + conv_queue: mp.Queue, +) -> None: + asyncio.run( + client_main( + client_args, + req_args, + client_id, + tokenizer, + stop_event, + task_queue, + result_queue, + conv_queue, + ) + ) + + +def get_client_config( + args: argparse.Namespace, input_conv: ConversationsMap +) -> tuple[ClientArgs, RequestArgs]: + if args.num_clients < 1: + raise ValueError("Number of clients must be a positive number") + + if len(input_conv) < args.num_clients: + raise ValueError( + "Number of conversations must be equal or larger than the number of clients" + ) + + max_req_per_client: int | None = None + if args.max_num_requests is not None: + # Max number of requests per client + req_per_client = args.max_num_requests // args.num_clients + if req_per_client < 1: + raise ValueError("Number of requests should be at least one per client") + max_req_per_client = req_per_client + + max_active_conversations = args.max_active_conversations + if max_active_conversations is None: + # Each client will have only one active conversation at a time + max_active_conversations = args.num_clients + + if max_active_conversations > len(input_conv): + raise ValueError( + f"Max active conversations {max_active_conversations} " + "must be equal or less than the total number of conversations" + ) + + # Max number of active conversations per client + max_active_conv_per_client = max_active_conversations 
// args.num_clients + if max_active_conv_per_client < 1: + raise ValueError( + f"Max active conversations {max_active_conversations} " + "must be equal or greater than the number of clients" + ) + + # Skip the first user turn (as part of the warmup) + skip_first_turn = args.warmup_step + + # Common arguments for all clients + client_args = ClientArgs( + seed=args.seed, + max_num_requests=max_req_per_client, + skip_first_turn=skip_first_turn, + max_turns=args.max_turns, + max_active_conversations=max_active_conv_per_client, + verbose=args.verbose, + print_content=args.print_content, + verify_output=args.verify_output, + conversation_sampling=args.conversation_sampling, + request_rate=args.request_rate, + ) + + if args.limit_min_tokens > 0 or args.limit_max_tokens > 0: + if args.limit_min_tokens < 1 or args.limit_max_tokens < 1: + raise ValueError( + "Invalid min/max tokens limits (both limits should be provided)" + ) + if args.limit_min_tokens > args.limit_max_tokens: + raise ValueError( + "Invalid min/max tokens limits (min should not be larger than max)" + ) + + # Arguments for API requests + chat_url = f"{args.url}/v1/chat/completions" + model_name = args.served_model_name if args.served_model_name else args.model + + req_args = RequestArgs( + chat_url=chat_url, + model=model_name, + stream=not args.no_stream, + limit_min_tokens=args.limit_min_tokens, + limit_max_tokens=args.limit_max_tokens, + ) + + return client_args, req_args + + +async def main_mp( + client_args: ClientArgs, + req_args: RequestArgs, + bench_args: BenchmarkArgs, + tokenizer: AutoTokenizer, + input_conv: ConversationsMap, +) -> tuple[ConversationsMap, list[RequestStats]]: + # An event that will trigger graceful termination of all the clients + stop_event = mp.Event() + + # Queue for input conversations (from the input file/dataset) + task_queue: mp.Queue = mp.Queue() + + # Queue for client measurements (TTFT, TPOT, etc. 
for each request) + result_queue: mp.Queue = mp.Queue() + + # Queue for output conversations (with the LLM answers, sent by the server) + conv_queue: mp.Queue = mp.Queue() + output_conv: ConversationsMap = {} + client_metrics: list[RequestStats] = [] + + # Start all clients + start_time = time.perf_counter_ns() + logger.info(f"{Color.GREEN}Starting {bench_args.num_clients} clients{Color.RESET}") + + clients = [] + for client_id in range(bench_args.num_clients): + client = mp.Process( + name=f"client_{client_id}", + target=worker_function, + args=( + client_id, + tokenizer, + client_args, + req_args, + stop_event, + task_queue, + result_queue, + conv_queue, + ), + ) + clients.append(client) + client.start() + + # Submit all the input conversations as tasks for the clients + for conv_id, messages in input_conv.items(): + task_queue.put((conv_id, messages)) + + # Add termination signals for clients + for _ in range(bench_args.num_clients): + task_queue.put((TERM_SIGNAL, TERM_SIGNAL)) + + # Collect the updated conversations from all clients + num_clients_finished = 0 + total_convs = len(input_conv) + + debug_stats = DebugStats(logger, min(15 * bench_args.num_clients, 500)) + + while num_clients_finished < bench_args.num_clients: + # Collect updated conversation + conv_id, messages = conv_queue.get() + + # Collect results (measurements) + while not result_queue.empty(): + new_data = result_queue.get() + client_metrics.append(new_data) + debug_stats.update(new_data) + + if conv_id is TERM_SIGNAL: + num_clients_finished += 1 + logger.info( + f"{Color.CYAN}{num_clients_finished} out of " + f"{bench_args.num_clients} clients finished{Color.RESET}" + ) + + if bench_args.early_stop and not stop_event.is_set(): + # Once one client finished, stop all other clients. + # there is no reason to continue the benchmark with fewer clients. + logger.info( + f"{Color.YELLOW}Sending termination signal to clients{Color.RESET}" + ) + stop_event.set() + else: + output_conv[conv_id] = messages + + finished_convs = len(output_conv) + percent = finished_convs / total_convs + + # Tuned to control the print rate (can be changed if required) + print_cycle = max(3, int(bench_args.num_clients / 4)) + + if finished_convs % print_cycle == 0: + runtime_sec = nanosec_to_sec(time.perf_counter_ns() - start_time) + logger.info( + f"{Color.CYAN}Finished {finished_convs} out of {total_convs} conversations ({percent:.0%}), " # noqa: E501 + f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 + ) + + rps: str | float = round(len(client_metrics) / runtime_sec, 3) + if len(client_metrics) < (5 * bench_args.num_clients): + # Do not estimate the RPS if the number of samples is very low + # (threshold can be tuned if needed) + rps = "N/A" + + runtime_left_sec: str | float = round( + (runtime_sec / finished_convs) * (total_convs - finished_convs), 3 + ) + if percent < 0.05: + # If less than 5% of the conversations were not finished, + # the estimation will probably be very inaccurate + # (threshold can be tuned if needed). + runtime_left_sec = "N/A" + + logger.info( + f"{Color.CYAN}Estimated req/sec {rps}, estimated runtime left {runtime_left_sec} sec{Color.RESET}" # noqa: E501 + ) + debug_stats.print() + + logger.info( + f"{Color.CYAN}All {bench_args.num_clients} clients finished{Color.RESET}" + ) + + # At this point all the clients finished, + # collect results (TTFT, TPOT, etc.) from all the clients. 
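+    # Note: a child process that has put items on a multiprocessing.Queue will
+    # not terminate until its buffered items are flushed to the pipe.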
+ # This needs to happen before calling join on the clients + # (result_queue should be emptied). + while not result_queue.empty(): + client_metrics.append(result_queue.get()) + + logger.info(f"Collected {len(client_metrics)} samples from all the clients") + + # Wait for all clients to finish + for client in clients: + logger.info( + f"{Color.CYAN}Waiting for client {client.name} " + f"(is alive: {client.is_alive()}){Color.RESET}" + ) + + client.join(timeout=120) + + if client.is_alive(): + logger.warning( + f"{Color.YELLOW}Client {client.name} will be terminated{Color.RESET}" + ) + client.terminate() + + exitcode = client.exitcode + if exitcode != 0: + logger.error( + f"{Color.RED}Client {client.name} exited " + f"with exit code {exitcode}{Color.RESET}" + ) + + logger.info( + f"All {bench_args.num_clients} clients exited (successfully " + f"finished {len(output_conv)} out of {total_convs} conversations)" + ) + + # Queues should be closed, required to avoid hang at interpreter shutdown + unfinished_tasks = 0 + while not task_queue.empty(): + task_queue.get() + unfinished_tasks += 1 + + if unfinished_tasks > 0: + # Can happen if not all tasks (conversations) have finished. + # May happen if --max-num-requests was used, + # or if an error occurred in one of the clients. + logger.debug(f"Discarding {unfinished_tasks} unfinished tasks") + + task_queue.close() + task_queue.join_thread() + + result_queue.close() + result_queue.join_thread() + + conv_queue.close() + conv_queue.join_thread() + + return output_conv, client_metrics + + +def get_filename_with_timestamp(label: str, extension: str) -> str: + time_now = datetime.now() + timestamp = time_now.strftime("%d-%m-%Y_%H-%M-%S") + filename = f"{label}__{timestamp}.{extension}" + return filename + + +def process_statistics( + client_metrics: list[RequestStats], + warmup_percentages: list[float], + test_params: dict, + verbose: bool, + gen_conv_args: GenConvArgs | None = None, + excel_output: bool = False, +) -> None: + if len(client_metrics) == 0: + logger.info("No samples to process") + return + + logger.info(f"Processing {len(client_metrics)} samples...") + + raw_data = pd.DataFrame(client_metrics) + + if verbose: + # Calculate the time between user turns in each conversation (in a new column) + raw_data = raw_data.sort_values(by=["conversation_id", "start_time_ms"]) + raw_data["time_between_user_turns_sec"] = raw_data.groupby("conversation_id")[ + "start_time_ms" + ].diff() + + # Convert milliseconds to seconds + raw_data["time_between_user_turns_sec"] = ( + raw_data["time_between_user_turns_sec"] / 1000.0 + ) + + # Final raw data should be sorted by time + raw_data = raw_data.sort_values(by=["start_time_ms"]) + raw_data["end_time_ms"] = raw_data["start_time_ms"] + raw_data["latency_ms"] + + percentiles = [0.25, 0.5, 0.75, 0.9] + + # Add more percentiles if there are enough samples + if len(raw_data) >= 100: + percentiles.append(0.99) + + if len(raw_data) >= 1000: + percentiles.append(0.999) + + if len(raw_data) >= 10000: + percentiles.append(0.9999) + + # Set precision for numbers in the output text (the dataframes) + pd.set_option("display.precision", 2) + + # Exclude parameters from RequestStats + exclude = [ + "start_time_ms", + "end_time_ms", + "output_num_first_chunk_tokens", + "approx_cached_percent", + "conversation_id", + "client_id", + ] + + print(TEXT_SEPARATOR) + print(f"{Color.YELLOW}Parameters:{Color.RESET}") + for k, v in test_params.items(): + print(f"{k}={v}") + + # conversations generation parameters + if gen_conv_args is not 
None: + gen_params = { + "text_files": ", ".join(gen_conv_args.text_files), + "input_num_turns": str(gen_conv_args.input_num_turns), + "input_common_prefix_num_tokens": str( + gen_conv_args.input_common_prefix_num_tokens + ), + "input_prefix_num_tokens": str(gen_conv_args.input_prefix_num_tokens), + "input_num_tokens": str(gen_conv_args.input_num_tokens), + "output_num_tokens": str(gen_conv_args.output_num_tokens), + } + + print(f"{Color.YELLOW}Conversations Generation Parameters:{Color.RESET}") + for k, v in gen_params.items(): + print(f"{k}={v}") + + print(TEXT_SEPARATOR) + + params_list = [] + df_list = [] + for percent in warmup_percentages: + # Select samples from the end (tail) of the dataframe + warmup_count = int(percent * len(raw_data)) + tail_count = len(raw_data) - warmup_count + if tail_count == 0: + # No reason to process if the count of samples is zero + break + + df = raw_data.tail(tail_count) + + # Runtime is the diff between the end of the last request + # and the start of the first request + runtime_sec = df["end_time_ms"].iloc[-1] - df["start_time_ms"].iloc[0] + + # Convert milliseconds to seconds + runtime_sec = runtime_sec / 1000.0 + requests_per_sec = float(len(df)) / runtime_sec + + params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec} + + # Generate a summary of relevant metrics (and drop irrelevant data) + df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose() + + # List for Excel file + params_list.append(params) + df_list.append(df) + + # Print the statistics summary + if percent > 0 or len(warmup_percentages) > 1: + print( + f"{Color.YELLOW}Statistics summary " + f"(assuming {percent:.0%} warmup samples):{Color.RESET}" + ) + else: + print(f"{Color.YELLOW}Statistics summary:{Color.RESET}") + + for k, v in params.items(): + if isinstance(v, float): + print(f"{k} = {v:.3f}") + else: + print(f"{k} = {v}") + print(TEXT_SEPARATOR) + print(df) + print(TEXT_SEPARATOR) + + if excel_output: + prefix = f"statistics_{test_params['num_clients']}_clients" + filename = get_filename_with_timestamp(prefix, "xlsx") + + with pd.ExcelWriter(filename, engine="xlsxwriter") as writer: + startrow = 0 + test_params_df = pd.DataFrame([test_params]) + test_params_df.to_excel( + writer, sheet_name="Summary", index=False, startrow=startrow + ) + startrow += len(test_params_df) + 3 + + if gen_conv_args is not None: + gen_params_df = pd.DataFrame([gen_params]) + gen_params_df.to_excel( + writer, sheet_name="Summary", index=False, startrow=(startrow - 1) + ) + startrow += len(gen_params_df) + 3 + + for params, df_stats in zip(params_list, df_list): + df_params = pd.DataFrame([params]) + df_params.to_excel( + writer, sheet_name="Summary", index=False, startrow=startrow + ) + startrow += len(df_params) + 2 + df_stats.to_excel( + writer, sheet_name="Summary", index=True, startrow=startrow + ) + startrow += len(df_stats) + 3 + + raw_data.to_excel(writer, sheet_name="Raw data", index=False, startrow=0) + + logger.info( + f"{Color.GREEN}Client metrics exported to file: {filename}{Color.RESET}" + ) + + +async def get_server_info(url: str) -> None: + logger.info(f"{Color.BLUE}Collecting information from server: {url}{Color.RESET}") + async with aiohttp.ClientSession() as session: + # Get server version (not mandatory, "version" endpoint may not exist) + url_version = f"{url}/version" + async with session.get(url_version) as response: + if HTTPStatus(response.status) == HTTPStatus.OK: + text = await response.text() + logger.info(f"{Color.BLUE}Server version: 
{text}{Color.RESET}") + + # Get available models + url_models = f"{url}/v1/models" + async with session.get(url_models) as response: + if HTTPStatus(response.status) == HTTPStatus.OK: + text = await response.text() + logger.info(f"{Color.BLUE}Models:{Color.RESET}") + models_data = json.loads(text) + models_list = models_data["data"] + for model in models_list: + model_id = model["id"] + max_model_len = model.get("max_model_len", "N/A") + logger.info( + f"{Color.BLUE}\t{model_id=}, {max_model_len=}{Color.RESET}" + ) + else: + logger.info(f"{Color.RED}Failed to get models{Color.RESET}") + + +async def main() -> None: + parser = argparse.ArgumentParser( + prog="Benchmark serving with multi-turn conversations", + description="Benchmark online inference using REST API", + ) + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + + parser.add_argument( + "-i", + "--input-file", + type=str, + required=True, + help="Input JSON file with ShareGPT conversations or " + "configuration file for generation of synthetic conversations", + ) + parser.add_argument( + "-o", + "--output-file", + type=str, + default=None, + help="Output JSON file containing conversations with updated assistant answers", + ) + + parser.add_argument( + "--seed", + type=int, + default=0, + help="Seed for random number generators (default: 0)", + ) + + parser.add_argument( + "-m", "--model", type=str, required=True, help="Path of the LLM model" + ) + parser.add_argument( + "--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the `--model` argument. ", + ) + + parser.add_argument( + "-u", + "--url", + type=str, + default="http://localhost:8000", + help="Base URL for the LLM API server", + ) + + parser.add_argument( + "-p", + "--num-clients", + type=int, + default=1, + help="Number of clients that will send requests in parallel", + ) + parser.add_argument( + "-k", + "--max-active-conversations", + type=int, + default=None, + help="Max number of active conversations at a time (for all clients)", + ) + parser.add_argument( + "-n", + "--max-num-requests", + type=int, + default=None, + help="Max number of requests to send (total for all clients)", + ) + + parser.add_argument( + "--warmup-step", + default=False, + action="store_true", + help="Run a warmup step (using only the first turn of every conversation), " + "measurements will not be included in the final benchmark results", + ) + + parser.add_argument( + "--max-turns", + type=int, + default=None, + help="Maximum number of turns/messages per conversation, " + "includes both user and assistant messages " + "(a positive number, e.g: 2, 4, 6, etc.), disabled by default", + ) + parser.add_argument( + "--no-early-stop", + default=False, + action="store_true", + help="By default, the benchmark will stop if at least one client exits." + " Use this flag to disable this behavior", + ) + + parser.add_argument( + "--limit-max-tokens", + type=int, + default=NUM_TOKENS_FROM_DATASET, + help="Set max_tokens for the output token count of each request " + "(must also set --limit-min-tokens). " + "Overrides output token count from the input dataset. " + "Use a negative value to disable this limit.", + ) + parser.add_argument( + "--limit-min-tokens", + type=int, + default=NUM_TOKENS_FROM_DATASET, + help="Set min_tokens for the output token count of each request " + "(must also set --limit-max-tokens). " + "Overrides output token count from the input dataset. 
" + "Use a negative value to disable this limit.", + ) + + parser.add_argument( + "--request-rate", + type=float, + default=0, + help="Expected request rate (Poisson process) per client in requests/sec." + "Set to 0 for no delay between requests.", + ) + parser.add_argument( + "--conversation-sampling", + type=ConversationSampling, + choices=list(ConversationSampling), + default=ConversationSampling.ROUND_ROBIN, + help=( + "Strategy for selecting which conversation to use for the next request. " + "Options: 'round_robin' (cycle through conversations), " + "'random' (pick randomly)." + ), + ) + parser.add_argument( + "--verify-output", + default=False, + action="store_true", + help="Verify the LLM output (compare to the answers in the input JSON file)", + ) + + parser.add_argument( + "--no-stream", + default=False, + action="store_true", + help="Disable stream/streaming mode (set 'stream' to False in the API request)", + ) + + parser.add_argument( + "-e", + "--excel-output", + default=False, + action="store_true", + help="Export summary to Excel file (optional)", + ) + parser.add_argument( + "-v", + "--verbose", + default=False, + action="store_true", + help="Enable verbose output", + ) + parser.add_argument( + "--print-content", + default=False, + action="store_true", + help="Print the user prompts and the server's answers", + ) + + parser.add_argument( + "--warmup-percentages", + type=str, + default="0%", + help="Ignore the first X samples as warmup (X is a percentage)." + " A comma separated list of percentages can be used " + "(for example: --warmup-percentages=0%%,50%%)", + ) + + args = parser.parse_args() + + logger.info(args) + + logger.info(f"{Color.GREEN}Input parameters:{Color.RESET}") + logger.info(f"url={args.url}") + logger.info(f"model={args.model}") + logger.info(f"num_clients={args.num_clients}") + + if args.verify_output: + logger.info(f"{Color.PURPLE}Verify is enabled{Color.RESET}") + + # Calculate the amount of samples to filter (as warmup samples/measurements). 
+    try:
+        warmup_percentages: list[float] = [0.0]
+        if not args.warmup_step:
+            # Warmup percentage can be used only if the warmup step was used
+            warmup_strings: list[str] = args.warmup_percentages.split(",")
+            warmup_strings = [x.replace("%", "") for x in warmup_strings]
+            warmup_percentages = [float(x) / 100 for x in warmup_strings]
+
+            # Check for valid range (0 to 1)
+            for p in warmup_percentages:
+                assert p >= 0.0 and p < 1.0
+
+            # Sort from high to low warmup percentage
+            warmup_percentages.sort()
+
+            logger.info(
+                f"Warmup percentages (percentage of samples): {warmup_percentages}"
+            )
+
+    except Exception:
+        raise ValueError(
+            f"Invalid --warmup-percentages={args.warmup_percentages}"
+        ) from None
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    if not os.path.exists(args.model):
+        raise OSError(f"Path does not exist: {args.model}")
+    logger.info("Loading tokenizer")
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+
+    await get_server_info(args.url)
+
+    # Load the input file (either conversations or a configuration file)
+    logger.info(f"Reading input file: {args.input_file}")
+    with open(args.input_file) as f:
+        input_data = json.load(f)
+
+    gen_conv_args = None
+    if isinstance(input_data, list):
+        # The conversations are stored as a list of dicts
+        logger.info(f"Found {len(input_data)} items in the input file")
+
+        # Convert the list to a ConversationsMap
+        conversations = conversations_list_to_dict(input_data)
+
+    elif isinstance(input_data, dict):
+        # The input file is a configuration file
+        # (type is determined by the field 'filetype')
+        if "filetype" not in input_data:
+            raise Exception(
+                f"Input file {args.input_file} is invalid (missing 'filetype')"
+            )
+
+        logger.info(f"Using input file with filetype: {input_data['filetype']}")
+
+        gen_conv_args = parse_input_json_file(input_data)
+
+        # Disable warning from "huggingface/tokenizers"
+        # (when using python multiprocessing and tokenizers)
+        os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+        # Generate synthetic conversations
+        conversations = generate_conversations(gen_conv_args, tokenizer)
+
+    else:
+        raise Exception(f"Input file {args.input_file} is invalid")
+
+    if args.max_turns is not None:
+        if args.max_turns < 1:
+            raise ValueError("Max turns must be a positive number")
+        logger.info(
+            f"{Color.PURPLE}Max turns per conversation "
+            f"is limited to {args.max_turns}{Color.RESET}"
+        )
+
+    # Create benchmark configurations
+    client_args, req_args = get_client_config(args, conversations)
+
+    bench_args = BenchmarkArgs(
+        url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
+    )
+
+    # Warm-up step
+    if args.warmup_step:
+        # Only send a single user prompt from every conversation.
+        # max_active_conversations must be 1,
+        # otherwise the clients may exit after sending a single request
+        # (because the task queue is empty).
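+        # The measured run below then skips this first turn for every
+        # conversation, since get_client_config sets skip_first_turn
+        # from --warmup-step.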
+ warmup_client_args = client_args._replace( + skip_first_turn=False, max_turns=1, max_active_conversations=1 + ) + + # Early stop should be disabled, + # all clients should finish their work before exiting + warmup_bench_args = bench_args._replace(early_stop=False) + + logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}") + conversations, _ = await main_mp( + warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations + ) + logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}") + + # Run the benchmark + start_time = time.perf_counter_ns() + client_convs, client_metrics = await main_mp( + client_args, req_args, bench_args, tokenizer, conversations + ) + total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time) + + # Calculate requests per second + total_runtime_sec = total_runtime_ms / 1000.0 + rps = len(client_metrics) / total_runtime_sec + logger.info( + f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec" + f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}" + ) + + # Benchmark parameters + params = { + "model": args.model, + "num_clients": args.num_clients, + "num_conversations": len(conversations), + "active_conversations": args.max_active_conversations, + "seed": args.seed, + } + + if args.limit_min_tokens > 0: + params["min_tokens"] = args.limit_min_tokens + + if args.limit_max_tokens > 0: + params["max_tokens"] = args.limit_max_tokens + + # Process and print statistics (and save excel file with the statistics) + process_statistics( + client_metrics, + test_params=params, + warmup_percentages=warmup_percentages, + verbose=args.verbose, + gen_conv_args=gen_conv_args, + excel_output=args.excel_output, + ) + + if args.output_file is not None: + # Write a JSON file with the updated conversations + # The "assistant" content will contain the answers from the tested LLM + output_data: ShareGptConversations = conversations_dict_to_list(client_convs) + logger.info( + f"{Color.GREEN}Writing conversations file: {args.output_file}{Color.RESET}" + ) + with open(args.output_file, "w") as f: + json.dump(output_data, f, indent=4) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py new file mode 100644 index 000000000000..fccab4d0ce21 --- /dev/null +++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py @@ -0,0 +1,354 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Download dataset from: +https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json + +Convert to OpenAI API: +export INPUT_FILE=sharegpt_20230401_clean_lang_split.json +python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128 +""" + +import argparse +import json +import random +from statistics import mean +from typing import Any + +import pandas as pd # type: ignore +import tqdm # type: ignore +from transformers import AutoTokenizer # type: ignore + + +def has_non_english_chars(text: str) -> bool: + return not text.isascii() + + +def content_is_valid( + content: str, min_content_len: int | None, max_content_len: int | None +) -> bool: + if min_content_len and len(content) < min_content_len: + return False + + if max_content_len and len(content) > max_content_len: + return False + + return has_non_english_chars(content) + + +def print_stats( + conversations: "list[dict[Any, Any]]", 
tokenizer: AutoTokenizer | None = None +) -> None: + # Collect statistics + stats = [] + + print("\nCollecting statistics...") + for item in tqdm.tqdm(conversations): + # item has "id" and "messages" + messages = item["messages"] + + user_turns = 0 + assistant_turns = 0 + user_words = 0 + assistant_words = 0 + conv_chars = 0 + + user_tokens: list[int] = [] + assistant_tokens: list[int] = [] + + for m in messages: + content = m["content"] + conv_chars += len(content) + content_num_words = content.count(" ") + 1 + + num_tokens = 0 + if tokenizer: + num_tokens = len(tokenizer(m["content"]).input_ids) + + if m["role"] == "user": + user_turns += 1 + user_words += content_num_words + if tokenizer: + user_tokens.append(num_tokens) + + elif m["role"] == "assistant": + assistant_turns += 1 + assistant_words += content_num_words + if tokenizer: + assistant_tokens.append(num_tokens) + + # assert user_turns == assistant_turns, \ + # f"Invalid conversation ID {item['id']}" + + conv_words = user_words + assistant_words + item_stats = { + "user_turns": user_turns, + "assistant_turns": assistant_turns, + "user_words": user_words, + "assistant_words": assistant_words, + "conv_turns": len(messages), + "conv_words": conv_words, + "conv_characters": conv_chars, + } + + if len(user_tokens) > 0: + item_stats["user_tokens"] = int(mean(user_tokens)) + + if len(assistant_tokens) > 0: + item_stats["assistant_tokens"] = int(mean(assistant_tokens)) + + stats.append(item_stats) + + print("\nStatistics:") + percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999] + df = pd.DataFrame(stats) + print(df.describe(percentiles=percentiles).transpose()) + + +def convert_sharegpt_to_openai( + seed: int, + input_file: str, + output_file: str, + max_items: int | None, + min_content_len: int | None = None, + max_content_len: int | None = None, + min_turns: int | None = None, + max_turns: int | None = None, + model: str | None = None, +) -> None: + if min_turns and max_turns: + assert min_turns <= max_turns + + if min_content_len and max_content_len: + # Verify that min is not larger than max if both were given + assert min_content_len <= max_content_len + + print( + f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=}," + f" {max_content_len=}, {min_turns=}, {max_turns=}\n" + ) + + random.seed(seed) + + tokenizer = None + if model is not None: + print(f"Loading tokenizer from: {model}") + tokenizer = AutoTokenizer.from_pretrained(model) + + # Read the ShareGPT JSON file + print(f"Reading file: {input_file}") + with open(input_file, encoding="utf-8") as f: + # Should be a list of dicts + # Each dict should have "id" (string) and "conversations" (list of dicts) + sharegpt_data = json.load(f) + + assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts" + + print(f"Total items in input file: {len(sharegpt_data):,}") + + print(f"Shuffling dataset with seed {seed}") + random.shuffle(sharegpt_data) + + # Map conversation ID to the all the messages + conversation_parts: dict[str, list[Any]] = {} + + for item in tqdm.tqdm(sharegpt_data): + assert "id" in item, "Missing key 'id'" + assert "conversations" in item, "Missing key 'conversations'" + + # Conversation ID (e.g: "hiWPlMD") and part/session (0, 1, 2, etc.) 
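+        # IDs look like "hiWPlMD_0", "hiWPlMD_1", ...; splitting on "_" lets all
+        # parts of the same conversation be merged under a single ID below.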
+ conv_id, _ = item["id"].split("_") + new_turns = item["conversations"] + + if conv_id not in conversation_parts: + # Start new conversation + conversation_parts[conv_id] = [] + elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0: + prev_turns = conversation_parts[conv_id][-1] + if prev_turns[-1]["from"] == new_turns[0]["from"]: + new_turns = new_turns[1:] + + if len(new_turns) > 0: + # We assume that parts are in order in the ShareGPT dataset + conversation_parts[conv_id].append(new_turns) + + dataset: list[dict[str, Any]] = [] + for conv_id, conv_parts in conversation_parts.items(): + new_item = {"id": conv_id} + + conversations: list[dict[str, str]] = [] + + # Merge all parts + for conv_part in conv_parts: + conversations.extend(conv_part) + + if len(conversations) > 0: + new_item["conversations"] = conversations + dataset.append(new_item) + + print(f"Total unique conversations (IDs) in input file: {len(dataset):,}") + + # Final output data + final_openai_dataset: list[dict] = [] + + # Filter conversations from the ShareGPT dataset and convert to OpenAI format + for item in tqdm.tqdm(dataset): + messages: list[dict] = [] + + assert "id" in item, "Missing key 'id'" + assert "conversations" in item, "Missing key 'conversations'" + + conv_id = item["id"] + conversations = item["conversations"] + + if min_turns is not None and len(conversations) < min_turns: + # Skip short conversations + continue + + # Convert each message in the conversation, up to max_turns if specified + for i, turn in enumerate(conversations): + assert "from" in turn and "value" in turn, ( + f"Invalid conversation ID {conv_id} - missing 'from' or 'value'" + ) + + role = None + turn_from = turn["from"] + + if turn_from in {"human", "user"}: + role = "user" + elif turn_from in {"gpt", "bing", "chatgpt", "bard"}: + role = "assistant" + elif turn_from == "system": + role = "system" + + assert role is not None, ( + f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid" + ) + + if i == 0 and role != "user": + # If the first message is from assistant (gpt), skip it. + # this happens when the conversation is a follow-up + # to a previous conversation (from the same user). 
+ continue + + if max_turns is not None and i >= max_turns: + break + + # Convert message to OpenAI format (with "role" and "content") + content = turn["value"] + messages.append({"role": role, "content": content}) + + # Add the converted conversation to the OpenAI format + if len(messages) > 0: + valid_messages = True + + # First turn should always be from the user + user_turn = True + + for m in messages: + # Make sure that turns alternate between user and assistant + if (user_turn and m["role"] != "user") or ( + not user_turn and m["role"] != "assistant" + ): + valid_messages = False + break + + user_turn = not user_turn + + content = m["content"] + valid_messages = content_is_valid( + content, min_content_len, max_content_len + ) + if not valid_messages: + break + + if valid_messages is True: + final_openai_dataset.append({"id": conv_id, "messages": messages}) + + assert len(final_openai_dataset) > 0, "Final number of conversations is zero" + + print_stats(final_openai_dataset) + + print_stats_again = False + if max_items is not None and len(final_openai_dataset) > max_items: + print(f"\n\nSampling {max_items} items from the dataset...") + print_stats_again = True + final_openai_dataset = random.sample(final_openai_dataset, max_items) + + if print_stats_again: + # Print stats after the dataset changed + print_stats(final_openai_dataset, tokenizer) + + # Write the converted data to a new JSON file + final_size = len(final_openai_dataset) + print(f"\nTotal conversations converted (after filtering): {final_size:,}") + print(f"\nWriting file: {output_file}") + with open(output_file, "w", encoding="utf-8") as f: + json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Convert ShareGPT dataset to OpenAI API format" + ) + parser.add_argument("input_file", help="Path to the input ShareGPT JSON file") + parser.add_argument( + "output_file", help="Path to the output OpenAI format JSON file" + ) + parser.add_argument( + "--seed", type=int, default=0, help="Seed for random number generators" + ) + parser.add_argument( + "--max-items", + type=int, + default=None, + help="Maximum number of items in the output file", + ) + parser.add_argument( + "--min-turns", + type=int, + default=None, + help="Minimum number of turns per conversation", + ) + parser.add_argument( + "--max-turns", + type=int, + default=None, + help="Maximum number of turns per conversation", + ) + parser.add_argument( + "--min-content-len", + type=int, + default=None, + help="Min number of characters in the messages' content", + ) + parser.add_argument( + "--max-content-len", + type=int, + default=None, + help="Max number of characters in the messages' content", + ) + parser.add_argument( + "--model", + type=str, + default=None, + help="LLM model, only the tokenizer will be used", + ) + + args = parser.parse_args() + + convert_sharegpt_to_openai( + args.seed, + args.input_file, + args.output_file, + args.max_items, + args.min_content_len, + args.max_content_len, + args.min_turns, + args.max_turns, + args.model, + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/multi_turn/generate_multi_turn.json b/benchmarks/multi_turn/generate_multi_turn.json new file mode 100644 index 000000000000..03cfc7d63e8a --- /dev/null +++ b/benchmarks/multi_turn/generate_multi_turn.json @@ -0,0 +1,34 @@ +{ + "filetype": "generate_conversations", + "num_conversations": 24, + "text_files": ["pg1184.txt"], + "print_stats": false, + "prompt_input": { + 
"num_turns": { + "distribution": "uniform", + "min": 12, + "max": 18 + }, + "common_prefix_num_tokens": { + "distribution": "constant", + "value": 500 + }, + "prefix_num_tokens": { + "distribution": "lognormal", + "average": 1000, + "max": 5000 + }, + "num_tokens": { + "distribution": "uniform", + "min": 120, + "max": 160 + } + }, + "prompt_output": { + "num_tokens": { + "distribution": "uniform", + "min": 80, + "max": 120 + } + } +} \ No newline at end of file diff --git a/benchmarks/multi_turn/requirements.txt b/benchmarks/multi_turn/requirements.txt new file mode 100644 index 000000000000..f0e1935914a1 --- /dev/null +++ b/benchmarks/multi_turn/requirements.txt @@ -0,0 +1,5 @@ +numpy>=1.24 +pandas>=2.0.0 +aiohttp>=3.10 +transformers>=4.46 +xlsxwriter>=3.2.1 \ No newline at end of file diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml deleted file mode 100644 index 65b1e09a247e..000000000000 --- a/benchmarks/pyproject.toml +++ /dev/null @@ -1,49 +0,0 @@ -# This local pyproject file is part of the migration from yapf to ruff format. -# It uses the same core rules as the main pyproject.toml file, but with the -# following differences: -# - ruff line length is overridden to 88 -# - deprecated typing ignores (UP006, UP035) have been removed - -[tool.ruff] -line-length = 88 - -[tool.ruff.lint.per-file-ignores] -"vllm/third_party/**" = ["ALL"] -"vllm/version.py" = ["F401"] -"vllm/_version.py" = ["ALL"] - -[tool.ruff.lint] -select = [ - # pycodestyle - "E", - # Pyflakes - "F", - # pyupgrade - "UP", - # flake8-bugbear - "B", - # flake8-simplify - "SIM", - # isort - "I", - # flake8-logging-format - "G", -] -ignore = [ - # star imports - "F405", "F403", - # lambda expression assignment - "E731", - # Loop control variable not used within loop body - "B007", - # f-string format - "UP032", - # Can remove once 3.10+ is the minimum Python version - "UP007", -] - -[tool.ruff.lint.isort] -known-first-party = ["vllm"] - -[tool.ruff.format] -docstring-code-format = true \ No newline at end of file diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 21fcee66d603..9bac5ea41c8d 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -1,6 +1,7 @@ include(FetchContent) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -58,6 +59,22 @@ function (find_isa CPUINFO TARGET OUT) endif() endfunction() + +function(check_sysctl TARGET OUT) + execute_process(COMMAND sysctl -n "${TARGET}" + RESULT_VARIABLE SYSCTL_RET + OUTPUT_VARIABLE SYSCTL_INFO + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(SYSCTL_RET EQUAL 0 AND + (SYSCTL_INFO STREQUAL "1" OR SYSCTL_INFO GREATER 0)) + set(${OUT} ON PARENT_SCOPE) + else() + set(${OUT} OFF PARENT_SCOPE) + endif() +endfunction() + + function (is_avx512_disabled OUT) set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512}) if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true") @@ -70,7 +87,11 @@ endfunction() is_avx512_disabled(AVX512_DISABLED) if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + message(STATUS "Apple Silicon Detected") set(APPLE_SILICON_FOUND TRUE) + set(ENABLE_NUMA OFF) + check_sysctl(hw.optional.neon ASIMD_FOUND) + check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND) else() find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) @@ -80,9 +101,9 @@ else() find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support 
find_isa(${CPUINFO} "S390" S390_FOUND) + find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support endif() - if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS "-mavx512f" @@ -149,9 +170,6 @@ elseif (ASIMD_FOUND) set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") endif() list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) -elseif(APPLE_SILICON_FOUND) - message(STATUS "Apple Silicon Detected") - set(ENABLE_NUMA OFF) elseif (S390_FOUND) message(STATUS "S390 detected") # Check for S390 VXE support @@ -160,27 +178,44 @@ elseif (S390_FOUND) "-mzvector" "-march=native" "-mtune=native") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64") + if(RVV_FOUND) + message(FAIL_ERROR "Can't support rvv now.") + else() + list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc") + endif() else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.") endif() # # Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms) # Flag to enable ACL kernels for AARCH64 platforms -if ( VLLM_BUILD_ACL STREQUAL "ON") +if (VLLM_BUILD_ACL STREQUAL "ON") set(USE_ACL ON) else() set(USE_ACL OFF) endif() -if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) - FetchContent_Declare( - oneDNN - GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.8.1 - GIT_PROGRESS TRUE - GIT_SHALLOW TRUE - ) +if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) + set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.") + + if(FETCHCONTENT_SOURCE_DIR_ONEDNN) + message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}") + FetchContent_Declare( + oneDNN + SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN} + ) + else() + message(STATUS "Downloading oneDNN from GitHub") + FetchContent_Declare( + oneDNN + GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git + GIT_TAG v3.9 + GIT_PROGRESS TRUE + GIT_SHALLOW TRUE + ) + endif() if(USE_ACL) find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/) @@ -189,7 +224,8 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) endif() set(ONEDNN_AARCH64_USE_ACL "ON") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") - endif() + add_compile_definitions(VLLM_USE_ACL) + endif() set(ONEDNN_LIBRARY_TYPE "STATIC") set(ONEDNN_BUILD_DOC "OFF") @@ -202,38 +238,23 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND) set(ONEDNN_ENABLE_ITT_TASKS "OFF") set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") + set(ONEDNN_VERBOSE "OFF") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) FetchContent_MakeAvailable(oneDNN) - - list(APPEND LIBS dnnl) -elseif(POWER10_FOUND) - FetchContent_Declare( - oneDNN - GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.7.2 - GIT_PROGRESS TRUE - GIT_SHALLOW TRUE + add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp") + target_include_directories( + dnnl_ext + PUBLIC ${oneDNN_SOURCE_DIR}/include + PUBLIC ${oneDNN_BINARY_DIR}/include + PRIVATE ${oneDNN_SOURCE_DIR}/src ) - - set(ONEDNN_LIBRARY_TYPE "STATIC") - set(ONEDNN_BUILD_DOC "OFF") - set(ONEDNN_BUILD_EXAMPLES "OFF") - set(ONEDNN_BUILD_TESTS "OFF") - set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") - set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") - 
set(ONEDNN_BUILD_GRAPH "OFF") - set(ONEDNN_ENABLE_JIT_PROFILING "OFF") - set(ONEDNN_ENABLE_ITT_TASKS "OFF") - set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") - set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") - set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) - - set(DNNL_CPU_RUNTIME "OMP") - - FetchContent_MakeAvailable(oneDNN) - - list(APPEND LIBS dnnl) + target_link_libraries(dnnl_ext dnnl) + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) + list(APPEND LIBS dnnl_ext) + set(USE_ONEDNN ON) +else() + set(USE_ONEDNN OFF) endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") @@ -256,11 +277,11 @@ set(VLLM_EXT_SRC "csrc/cpu/layernorm.cpp" "csrc/cpu/mla_decode.cpp" "csrc/cpu/pos_encoding.cpp" - "csrc/cpu/torch_bindings.cpp") + "csrc/cpu/torch_bindings.cpp" + "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" "csrc/cpu/shm.cpp" ${VLLM_EXT_SRC}) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) @@ -274,14 +295,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) ${VLLM_EXT_SRC}) add_compile_definitions(-DCPU_CAPABILITY_AVX512) endif() -elseif(POWER10_FOUND) - set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" - ${VLLM_EXT_SRC}) endif() -if (ASIMD_FOUND) + +if(USE_ONEDNN) set(VLLM_EXT_SRC - "csrc/cpu/quant.cpp" + "csrc/cpu/dnnl_kernels.cpp" ${VLLM_EXT_SRC}) endif() @@ -302,4 +320,4 @@ define_gpu_extension_target( WITH_SOABI ) -message(STATUS "Enabling C extension.") +message(STATUS "Enabling C extension.") \ No newline at end of file diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake index 6291475164ba..c9e7aec880b9 100644 --- a/cmake/external_projects/flashmla.cmake +++ b/cmake/external_projects/flashmla.cmake @@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR) else() FetchContent_Declare( flashmla - GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git - GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845 + GIT_REPOSITORY https://github.com/vllm-project/FlashMLA + GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f GIT_PROGRESS TRUE CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -33,22 +33,64 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}") # The FlashMLA kernels only work on hopper and require CUDA 12.3 or later. 
# Only build FlashMLA kernels if we are building for something compatible with # sm90a -cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}") -if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS) + +set(SUPPORT_ARCHS) +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3) + list(APPEND SUPPORT_ARCHS 9.0a) +endif() +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8) + list(APPEND SUPPORT_ARCHS 10.0a) +endif() + + +cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}") +if(FLASH_MLA_ARCHS) + set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS}) + list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math") + set(FlashMLA_SOURCES - ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp - ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu - ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu - ${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu) + ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp + ${flashmla_SOURCE_DIR}/csrc/pybind.cpp + ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu + ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu + ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu + ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu + ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu + ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu + ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu + ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu + ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu + ) + + set(FlashMLA_Extension_SOURCES + ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp + ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp + ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu + ) set(FlashMLA_INCLUDES + ${flashmla_SOURCE_DIR}/csrc + ${flashmla_SOURCE_DIR}/csrc/sm90 + ${flashmla_SOURCE_DIR}/csrc/cutlass/include + ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include + ) + + set(FlashMLA_Extension_INCLUDES + ${flashmla_SOURCE_DIR}/csrc + ${flashmla_SOURCE_DIR}/csrc/sm90 + ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/ ${flashmla_SOURCE_DIR}/csrc/cutlass/include - ${flashmla_SOURCE_DIR}/csrc/include) + ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include + ) set_gencode_flags_for_srcs( SRCS "${FlashMLA_SOURCES}" CUDA_ARCHS "${FLASH_MLA_ARCHS}") + set_gencode_flags_for_srcs( + SRCS "${FlashMLA_Extension_SOURCES}" + CUDA_ARCHS "${FLASH_MLA_ARCHS}") + define_gpu_extension_target( _flashmla_C DESTINATION vllm @@ -59,8 +101,32 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS) INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES} USE_SABI 3 WITH_SOABI) + + # Keep Stable ABI for the module, but *not* for CUDA/C++ files. + # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles. + target_compile_options(_flashmla_C PRIVATE + $<$:-UPy_LIMITED_API> + $<$:-UPy_LIMITED_API>) + + define_gpu_extension_target( + _flashmla_extension_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${FlashMLA_Extension_SOURCES} + COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES} + USE_SABI 3 + WITH_SOABI) + + # Keep Stable ABI for the module, but *not* for CUDA/C++ files. + # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles. 
+ target_compile_options(_flashmla_extension_C PRIVATE + $<$:-UPy_LIMITED_API> + $<$:-UPy_LIMITED_API>) else() - # Create an empty target for setup.py when not targeting sm90a systems + # Create empty targets for setup.py when not targeting sm90a systems add_custom_target(_flashmla_C) + add_custom_target(_flashmla_extension_C) endif() diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake new file mode 100644 index 000000000000..5a59a409999a --- /dev/null +++ b/cmake/external_projects/qutlass.cmake @@ -0,0 +1,97 @@ +include(FetchContent) + +set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory") + +if(DEFINED ENV{QUTLASS_SRC_DIR}) + set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR}) +endif() + +if(QUTLASS_SRC_DIR) + FetchContent_Declare( + qutlass + SOURCE_DIR ${QUTLASS_SRC_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + ) +else() + FetchContent_Declare( + qutlass + GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git + GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65 + GIT_PROGRESS TRUE + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + ) +endif() + +FetchContent_Populate(qutlass) + +if(NOT qutlass_SOURCE_DIR) + message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") +endif() +message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}") + +cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}") +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS) + + if(QUTLASS_ARCHS MATCHES "10\\.0a") + set(QUTLASS_TARGET_CC 100) + elseif(QUTLASS_ARCHS MATCHES "12\\.0a") + set(QUTLASS_TARGET_CC 120) + else() + message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.") + endif() + + set(QUTLASS_SOURCES + ${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp + ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu + ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu + ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu + ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu + ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu + ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu + ) + + set(QUTLASS_INCLUDES + ${qutlass_SOURCE_DIR} + ${qutlass_SOURCE_DIR}/qutlass + ${qutlass_SOURCE_DIR}/qutlass/csrc/include + ${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions + ) + + if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h") + list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}") + elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h") + list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include") + message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).") + else() + message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. 
" + "Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include") + endif() + + set_gencode_flags_for_srcs( + SRCS "${QUTLASS_SOURCES}" + CUDA_ARCHS "${QUTLASS_ARCHS}" + ) + + target_sources(_C PRIVATE ${QUTLASS_SOURCES}) + target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES}) + target_compile_definitions(_C PRIVATE + QUTLASS_DISABLE_PYBIND=1 + TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC} + ) + + set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS + $<$:--expt-relaxed-constexpr --use_fast_math -O3> + ) + +else() + if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8") + message(STATUS + "[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).") + else() + message(STATUS + "[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in " + "CUDA_ARCHS='${CUDA_ARCHS}'.") + endif() +endif() diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index ef45a5fbebf6..931090db50e9 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 1c2624e53c078854e0637ee566c72fe2107e75f4 + GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/cmake/hipify.py b/cmake/hipify.py index 55d378f5b111..8504f9defee9 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -16,7 +16,7 @@ from torch.utils.hipify.hipify_python import hipify -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() # Project directory where all the source + include files live. @@ -34,15 +34,14 @@ ) # Source files to convert. - parser.add_argument("sources", - help="Source files to hipify.", - nargs="*", - default=[]) + parser.add_argument( + "sources", help="Source files to hipify.", nargs="*", default=[] + ) args = parser.parse_args() # Limit include scope to project_dir only - includes = [os.path.join(args.project_dir, '*')] + includes = [os.path.join(args.project_dir, "*")] # Get absolute path for all source files. extra_files = [os.path.abspath(s) for s in args.sources] @@ -51,25 +50,31 @@ # The directory might already exist to hold object files so we ignore that. 
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) - hipify_result = hipify(project_directory=args.project_dir, - output_directory=args.output_dir, - header_include_dirs=[], - includes=includes, - extra_files=extra_files, - show_detailed=True, - is_pytorch_extension=True, - hipify_extra_files_only=True) + hipify_result = hipify( + project_directory=args.project_dir, + output_directory=args.output_dir, + header_include_dirs=[], + includes=includes, + extra_files=extra_files, + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True, + ) hipified_sources = [] for source in args.sources: s_abs = os.path.abspath(source) - hipified_s_abs = (hipify_result[s_abs].hipified_path if - (s_abs in hipify_result - and hipify_result[s_abs].hipified_path is not None) - else s_abs) + hipified_s_abs = ( + hipify_result[s_abs].hipified_path + if ( + s_abs in hipify_result + and hipify_result[s_abs].hipified_path is not None + ) + else s_abs + ) hipified_sources.append(hipified_s_abs) - assert (len(hipified_sources) == len(args.sources)) + assert len(hipified_sources) == len(args.sources) # Print hipified source files. print("\n".join(hipified_sources)) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 621179a70169..f6a0d2b75be1 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR list(REMOVE_DUPLICATES _PTX_ARCHS) list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS) - # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should - # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS + # If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should + # remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS set(_CUDA_ARCHS) foreach(_arch ${_SRC_CUDA_ARCHS}) - if(_arch MATCHES "\\a$") + if(_arch MATCHES "[af]$") list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}") - string(REPLACE "a" "" _base "${_arch}") + string(REGEX REPLACE "[af]$" "" _base "${_arch}") if ("${_base}" IN_LIST TGT_CUDA_ARCHS) list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}") list(APPEND _CUDA_ARCHS "${_arch}") @@ -467,6 +467,12 @@ function (define_gpu_extension_target GPU_MOD_NAME) if (GPU_LANGUAGE STREQUAL "HIP") # Make this target dependent on the hipify preprocessor step. 
add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME}) + # Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder + target_include_directories(${GPU_MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc + ${GPU_INCLUDE_DIRECTORIES}) + else() + target_include_directories(${GPU_MOD_NAME} PRIVATE csrc + ${GPU_INCLUDE_DIRECTORIES}) endif() if (GPU_ARCHITECTURES) @@ -474,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME) ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}") endif() - set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17) target_compile_options(${GPU_MOD_NAME} PRIVATE $<$:${GPU_COMPILE_FLAGS}>) @@ -482,8 +487,6 @@ function (define_gpu_extension_target GPU_MOD_NAME) target_compile_definitions(${GPU_MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}") - target_include_directories(${GPU_MOD_NAME} PRIVATE csrc - ${GPU_INCLUDE_DIRECTORIES}) target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 000000000000..304c0be8105f --- /dev/null +++ b/codecov.yml @@ -0,0 +1,12 @@ +codecov: + require_ci_to_pass: false + +fixes: + # Map source code paths to repository root paths + # Wildcards match any Python version (python3.*) + - "/vllm-workspace/src/vllm/::vllm/" + - "/vllm-workspace/vllm/::vllm/" + - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/" + - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/" + - "/usr/lib/python3.*/dist-packages/vllm/::vllm/" + - "/usr/lib/python3.*/site-packages/vllm/::vllm/" diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 55e659679701..a4a880f13cf7 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -128,6 +128,45 @@ __global__ void act_and_mul_kernel_with_param( } } +template +__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up, + float alpha, float limit) { + // clamp gate: min=None, max=limit + const float gate_f = (float)gate; + const float clamped_gate = gate_f > limit ? limit : gate_f; + + // clamp up: min=-limit, max=limit + const float up_f = (float)up; + const float clamped_up = + up_f > limit ? limit : (up_f < -limit ? -limit : up_f); + + // glu = gate * sigmoid(gate * alpha) + const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha)); + const float glu = clamped_gate * sigmoid_val; + + // (up + 1) * glu + return (T)((clamped_up + 1.0f) * glu); +} + +template +__global__ void swigluoai_and_mul_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const int d, const float alpha, const float limit) { + const int64_t token_idx = blockIdx.x; + // TODO: Vectorize loads and stores. 
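  // For illustration: the input row for one token interleaves the two projections,
  // e.g. for d = 3 the layout is [g0, u0, g1, u1, g2, u2]; each thread reads its
  // gate at index 2*idx and its up at 2*idx + 1, then writes
  // out[idx] = (clamp(up, -limit, +limit) + 1) * glu, where
  // glu = min(gate, limit) * sigmoid(alpha * min(gate, limit)).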
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + // gate = x[..., ::2] (even indices) + const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]); + // up = x[..., 1::2] (odd indices) + const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]); + + out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit); + } +} + } // namespace vllm #define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ @@ -145,11 +184,31 @@ __global__ void act_and_mul_kernel_with_param( PARAM); \ }); +#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "clamp_swiglu_kernel_with_params", [&] { \ + vllm::swigluoai_and_mul_kernel> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d, ALPHA, \ + LIMIT); \ + }); + void fatrelu_and_mul(torch::Tensor& out, // [..., d], torch::Tensor& input, // [..., 2 * d] double threshold) { LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); } +void swigluoai_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., 2 * d] + double alpha, double limit) { + LAUNCH_SIGLUOAI_AND_MUL(vllm::swigluoai_and_mul, alpha, limit); +} namespace vllm { // Element-wise activation kernel template. diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index 8f24be89578b..052ff168cec4 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -24,14 +24,14 @@ #include "attention_dtypes.h" #include "attention_utils.cuh" -#include "cuda_compat.h" +#include "../cuda_compat.h" #ifdef USE_ROCM #include - #include "../quantization/fp8/amd/quant_utils.cuh" + #include "../quantization/w8a8/fp8/amd/quant_utils.cuh" typedef __hip_bfloat16 __nv_bfloat16; #else - #include "../quantization/fp8/nvidia/quant_utils.cuh" + #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh" #endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) diff --git a/csrc/attention/mla/cutlass_mla_entry.cu b/csrc/attention/mla/cutlass_mla_entry.cu deleted file mode 100644 index 0319d1daf302..000000000000 --- a/csrc/attention/mla/cutlass_mla_entry.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA -void cutlass_mla_decode_sm100a(torch::Tensor const& out, - torch::Tensor const& q_nope, - torch::Tensor const& q_pe, - torch::Tensor const& kv_c_and_k_pe_cache, - torch::Tensor const& seq_lens, - torch::Tensor const& page_table, double scale); -#endif - -void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, - torch::Tensor const& q_pe, - torch::Tensor const& kv_c_and_k_pe_cache, - torch::Tensor const& seq_lens, - torch::Tensor const& page_table, double scale) { -#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA - return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache, - seq_lens, page_table, scale); -#endif - TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA"); -} diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu deleted file mode 100644 index 9d05d910dd81..000000000000 --- a/csrc/attention/mla/cutlass_mla_kernels.cu +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include - -#include "cute/tensor.hpp" - -#include "cutlass/cutlass.h" -#include "cutlass/kernel_hardware_info.h" - -#include "cutlass_extensions/common.hpp" - -#include "device/sm100_mla.hpp" -#include "kernel/sm100_mla_tile_scheduler.hpp" - -using namespace cute; -using namespace cutlass::fmha::kernel; - -template -struct MlaSm100 { - using Element = T; - using ElementAcc = float; - using ElementOut = T; - - using TileShape = Shape<_128, _128, Shape<_512, _64>>; - using TileShapeH = cute::tuple_element_t<0, TileShape>; - using TileShapeD = cute::tuple_element_t<2, TileShape>; - - // H K (D_latent D_rope) B - using ProblemShape = cute::tuple; - - using StrideQ = cute::tuple; // H D B - using StrideK = cute::tuple; // K D B - using StrideO = StrideK; // H D B - using StrideLSE = cute::tuple<_1, int>; // H B - - using TileScheduler = - std::conditional_t; - - using FmhaKernel = - cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized< - TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler, - /*kIsCpAsync=*/true>; - using Fmha = cutlass::fmha::device::MLA; -}; - -template -typename T::Fmha::Arguments args_from_options( - at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe, - at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens, - at::Tensor const& page_table, double scale) { - cutlass::KernelHardwareInfo hw_info; - hw_info.device_id = q_nope.device().index(); - hw_info.sm_count = - cutlass::KernelHardwareInfo::query_device_multiprocessor_count( - hw_info.device_id); - - int batches = q_nope.sizes()[0]; - int page_count_per_seq = page_table.sizes()[1]; - int page_count_total = kv_c_and_k_pe_cache.sizes()[0]; - int page_size = kv_c_and_k_pe_cache.sizes()[1]; - int max_seq_len = page_size * page_count_per_seq; - using TileShapeH = typename T::TileShapeH; - using 
TileShapeD = typename T::TileShapeD; - auto problem_shape = - cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches); - - auto [H, K, D, B] = problem_shape; - auto [D_latent, D_rope] = D; - - using StrideQ = typename T::StrideQ; - using StrideK = typename T::StrideK; - using StrideO = typename T::StrideO; - using StrideLSE = typename T::StrideLSE; - - StrideQ stride_Q_latent = cute::make_tuple( - static_cast(D_latent), _1{}, static_cast(H * D_latent)); - StrideQ stride_Q_rope = cute::make_tuple(static_cast(D_rope), _1{}, - static_cast(H * D_rope)); - StrideK stride_C = - cute::make_tuple(static_cast(D_latent + D_rope), _1{}, - static_cast(page_size * (D_latent + D_rope))); - StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq); - StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast(H)); - StrideO stride_O = cute::make_tuple(static_cast(D_latent), _1{}, - static_cast(H * D_latent)); - - using Element = typename T::Element; - using ElementOut = typename T::ElementOut; - using ElementAcc = typename T::ElementAcc; - auto Q_latent_ptr = static_cast(q_nope.data_ptr()); - auto Q_rope_ptr = static_cast(q_pe.data_ptr()); - auto C_ptr = static_cast(kv_c_and_k_pe_cache.data_ptr()); - auto scale_f = static_cast(scale); - typename T::Fmha::Arguments arguments{ - problem_shape, - {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr, - stride_C, C_ptr + D_latent, stride_C, - static_cast(seq_lens.data_ptr()), - static_cast(page_table.data_ptr()), stride_PT, page_count_total, - page_size}, - {static_cast(out.data_ptr()), stride_O, - static_cast(nullptr), stride_LSE}, - hw_info, - 1, // split_kv - nullptr, // is_var_split_kv - }; - // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute - // split_kv automatically based on batch size and sequence length to balance - // workload across available SMs. Consider using var_split_kv for manual - // control if needed. 
- T::Fmha::set_split_kv(arguments); - return arguments; -} - -template -void runMla(at::Tensor const& out, at::Tensor const& q_nope, - at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache, - at::Tensor const& seq_lens, at::Tensor const& page_table, - float scale, cudaStream_t stream) { - using MlaSm100Type = MlaSm100; - typename MlaSm100Type::Fmha fmha; - auto arguments = args_from_options( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale); - size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments); - auto const workspace_options = - torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device()); - auto workspace = torch::empty(workspace_size, workspace_options); - - CUTLASS_CHECK(fmha.can_implement(arguments)); - - CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream)); - - CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream)); -} - -void cutlass_mla_decode_sm100a(torch::Tensor const& out, - torch::Tensor const& q_nope, - torch::Tensor const& q_pe, - torch::Tensor const& kv_c_and_k_pe_cache, - torch::Tensor const& seq_lens, - torch::Tensor const& page_table, double scale) { - TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA"); - TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor"); - TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor"); - TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3, - "kv_c_and_k_pe_cache must be a 3D tensor"); - TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor"); - TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor"); - TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor"); - - auto B_q_nope = q_nope.size(0); - auto H_q_nope = q_nope.size(1); - auto D_q_nope = q_nope.size(2); - auto B_q_pe = q_pe.size(0); - auto H_q_pe = q_pe.size(1); - auto D_q_pe = q_pe.size(2); - auto B_pt = page_table.size(0); - auto PAGE_NUM = page_table.size(1); - auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1); - auto D_ckv = kv_c_and_k_pe_cache.size(2); - auto B_o = out.size(0); - auto H_o = out.size(1); - auto D_o = out.size(2); - - TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512"); - TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64"); - TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576"); - TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128, - "H_q_nope, H_q_pe, and H_o must be equal to 128"); - TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0, - "PAGE_SIZE must be a power of 2"); - TORCH_CHECK( - B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o, - "Batch dims must be same for page_table, q_nope and q_pe, and out"); - TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0, - "PAGE_NUM must be divisible by 128 / PAGE_SIZE"); - TORCH_CHECK(D_o == 512, "D_o must be equal to 512"); - - TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half || - q_nope.dtype() == at::ScalarType::BFloat16 || - q_nope.dtype() == at::ScalarType::Float8_e4m3fn, - "q_nope must be a half, bfloat16, or float8_e4m3fn tensor"); - TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() && - q_nope.dtype() == q_pe.dtype(), - "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type"); - TORCH_CHECK(seq_lens.dtype() == torch::kInt32, - "seq_lens must be a 32-bit integer tensor"); - TORCH_CHECK(page_table.dtype() == torch::kInt32, - "page_table must be a 32-bit integer tensor"); - - auto in_dtype = q_nope.dtype(); - const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope)); - const cudaStream_t stream = - 
at::cuda::getCurrentCUDAStream(q_nope.get_device()); - if (in_dtype == at::ScalarType::Half) { - runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, - page_table, scale, stream); - } else if (in_dtype == at::ScalarType::BFloat16) { - runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache, - seq_lens, page_table, scale, stream); - } else if (in_dtype == at::ScalarType::Float8_e4m3fn) { - runMla(out, q_nope, q_pe, kv_c_and_k_pe_cache, - seq_lens, page_table, scale, stream); - } else { - TORCH_CHECK(false, "Unsupported input data type of MLA"); - } -} diff --git a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp index 95e32559cd54..2d4b4a67d242 100644 --- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp +++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp @@ -125,24 +125,37 @@ class MLA { } static void set_split_kv (KernelArguments& args) { - // printf("set_split_kv start"); if (args.split_kv >= 1) return; auto [H, K, D, B] = args.problem_shape; - // std::cout << H << " " << K << " " << D << " " << B << "\n"; int sm_count = args.hw_info.sm_count; - // printf(" sm_count = %d\n", sm_count); - int max_splits = ceil_div(K, 128); - max_splits = min(16, max_splits); - // printf(" max_splits = %d\n", max_splits); + float seq_length_k = static_cast(K) / 1024.0f; + int max_splits = 1; + + if (B <= 4 && seq_length_k >= 16) { + max_splits = 16; + } + else if (B <= 8 && seq_length_k >= 4) { + max_splits = 8; + } + else if ((B <= 16 && seq_length_k >= 8) || + (B == 48 && seq_length_k >= 32)) { + max_splits = 4; + } + else if ((B <= 32 && seq_length_k >= 16) || + (B == 96 && seq_length_k >= 16)) { + max_splits = 2; + } + else { + max_splits = 1; + } + + // Wave-aware scheduling: ensure integer number of waves in K dimension int sms_per_batch = max(1, sm_count / B); - // printf(" sms_per_batch = %d\n", sms_per_batch); int split_heur = min(max_splits, sms_per_batch); int waves = ceil_div(B * split_heur, sm_count); int k_waves = ceil_div(max_splits, split_heur); int split_wave_aware = ceil_div(max_splits, k_waves); args.split_kv = split_wave_aware; - // printf(" args.split_kv = %d\n", args.split_kv); - } /// Determines whether the GEMM can execute the given problem. 
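Note on the set_split_kv() change above: the old rule (max_splits = min(16, ceil_div(K, 128))) is replaced by a lookup keyed on batch size B and KV length K, after which the split is capped by the SMs available per batch and rounded to an integer number of K-dimension waves. Below is a minimal host-side sketch of that selection, for illustration only; the helper name and the 132-SM example value are assumptions, not vLLM API.

#include <algorithm>
#include <cstdio>

static int ceil_div(int a, int b) { return (a + b - 1) / b; }

// Mirrors the selection in MLA::set_split_kv(): pick max_splits from (B, K),
// cap it by the SMs available per batch, then round to whole K-dimension waves.
static int pick_split_kv(int B, int K, int sm_count) {
  const float seq_length_k = static_cast<float>(K) / 1024.0f;
  int max_splits = 1;
  if (B <= 4 && seq_length_k >= 16) {
    max_splits = 16;
  } else if (B <= 8 && seq_length_k >= 4) {
    max_splits = 8;
  } else if ((B <= 16 && seq_length_k >= 8) || (B == 48 && seq_length_k >= 32)) {
    max_splits = 4;
  } else if ((B <= 32 && seq_length_k >= 16) || (B == 96 && seq_length_k >= 16)) {
    max_splits = 2;
  }
  const int sms_per_batch = std::max(1, sm_count / B);
  const int split_heur = std::min(max_splits, sms_per_batch);
  const int k_waves = ceil_div(max_splits, split_heur);
  return ceil_div(max_splits, k_waves);  // split_wave_aware
}

int main() {
  // e.g. B=4 and K=32768 (seq_length_k=32) on a 132-SM device:
  // max_splits=16, 33 SMs per batch -> split_kv = 16
  std::printf("split_kv = %d\n", pick_split_kv(4, 32768, 132));
  return 0;
}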
diff --git a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp index 2cbc2379579e..1f62c37ba4b7 100644 --- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp +++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp @@ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { for (; tile_scheduler.is_valid(); ++tile_scheduler) { auto blk_coord = tile_scheduler.get_block_coord(); auto problem_shape = params.problem_shape; - auto local_split_kv = params.split_kv; + auto local_split_kv = params.split_kv; if (params.mainloop.ptr_seq != nullptr) { get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; - if (params.ptr_split_kv != nullptr) { + if (params.ptr_split_kv != nullptr) { local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; } } - if (local_split_kv <= get<3>(blk_coord)) - continue; + if (local_split_kv <= get<3>(blk_coord)) + continue; load_page_table( blk_coord, problem_shape, params.mainloop, shared_storage.tensors, pipeline_page_table, pipeline_pt_producer_state, - local_split_kv + local_split_kv ); } } @@ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { CUTLASS_PRAGMA_NO_UNROLL for (; tile_scheduler.is_valid(); ++tile_scheduler) { auto blk_coord = tile_scheduler.get_block_coord(); - auto problem_shape = params.problem_shape; - auto local_split_kv = params.split_kv; + auto problem_shape = params.problem_shape; + auto local_split_kv = params.split_kv; if (params.mainloop.ptr_seq != nullptr) { get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; - if (params.ptr_split_kv != nullptr) { + if (params.ptr_split_kv != nullptr) { local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; } } - if (local_split_kv <= get<3>(blk_coord)) + if (local_split_kv <= get<3>(blk_coord)) continue; load_cpasync( blk_coord, @@ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { params.mainloop_params, shared_storage.tensors, pipeline_load_qk, pipeline_load_qk_producer_state, - local_split_kv, + local_split_kv, /* must be shared pipe */ pipeline_page_table, pipeline_pt_consumer_state ); @@ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { CUTLASS_PRAGMA_NO_UNROLL for (; tile_scheduler.is_valid(); ++tile_scheduler) { auto blk_coord = tile_scheduler.get_block_coord(); - auto problem_shape = params.problem_shape; - auto local_split_kv = params.split_kv; + auto problem_shape = params.problem_shape; + auto local_split_kv = params.split_kv; if (params.mainloop.ptr_seq != nullptr) { get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; - if (params.ptr_split_kv != nullptr) { - local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; - } + if (params.ptr_split_kv != nullptr) { + local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; + } } - if (local_split_kv <= get<3>(blk_coord)) + if (local_split_kv <= get<3>(blk_coord)) continue; load_tma( blk_coord, @@ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { shared_storage.tensors, pipeline_load_qk, pipeline_load_qk_producer_state, pipeline_load_qk, pipeline_load_qk_producer_state, - local_split_kv + local_split_kv ); cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait(); } @@ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { CUTLASS_PRAGMA_NO_UNROLL for (; tile_scheduler.is_valid(); 
++tile_scheduler) { auto blk_coord = tile_scheduler.get_block_coord(); - auto problem_shape = params.problem_shape; - auto local_split_kv = params.split_kv; + auto problem_shape = params.problem_shape; + auto local_split_kv = params.split_kv; if (params.mainloop.ptr_seq != nullptr) { get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; - if (params.ptr_split_kv != nullptr) { + if (params.ptr_split_kv != nullptr) { local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; - } + } } - if (local_split_kv <= get<3>(blk_coord)) + if (local_split_kv <= get<3>(blk_coord)) continue; load_tma( blk_coord, @@ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { shared_storage.tensors, pipeline_load_qk, pipeline_load_qk_producer_state, pipeline_load_qk, pipeline_load_qk_producer_state, - local_split_kv + local_split_kv ); cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait(); } @@ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { for (; tile_scheduler.is_valid(); ++tile_scheduler) { auto blk_coord = tile_scheduler.get_block_coord(); auto problem_shape = params.problem_shape; - auto local_split_kv = params.split_kv; + auto local_split_kv = params.split_kv; if (params.mainloop.ptr_seq != nullptr) { get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; if (params.ptr_split_kv != nullptr) { local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; } } - if (local_split_kv <= get<3>(blk_coord)) + if (local_split_kv <= get<3>(blk_coord)) continue; mma(blk_coord, problem_shape, @@ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { pipeline_mma_s, pipeline_mma_s_producer_state, pipeline_p_mma, pipeline_p_mma_consumer_state, pipeline_mma_o, pipeline_mma_o_producer_state, - local_split_kv + local_split_kv ); } } @@ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { for (; tile_scheduler.is_valid(); ++tile_scheduler) { auto blk_coord = tile_scheduler.get_block_coord(); auto problem_shape = params.problem_shape; - auto split_kv = params.split_kv; - auto local_split_kv = split_kv; + auto split_kv = params.split_kv; + auto local_split_kv = split_kv; if (params.mainloop.ptr_seq != nullptr) { get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; - if (params.ptr_split_kv != nullptr) { + if (params.ptr_split_kv != nullptr) { local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; } } - if (local_split_kv <= get<3>(blk_coord)) + if (local_split_kv <= get<3>(blk_coord)) continue; compute( blk_coord, @@ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { pipeline_mma_s, pipeline_mma_s_consumer_state, pipeline_p_mma, pipeline_p_mma_producer_state, pipeline_mma_o, pipeline_mma_o_consumer_state, - local_split_kv + local_split_kv ); } @@ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { cutlass::arch::NamedBarrier( (kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue - ).arrive(); + ).arrive_and_wait(); return; } diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu index e0e95d06290d..d1874515cc8f 100644 --- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu +++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -36,12 +36,14 @@ limitations under the License. 
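// Note on this revision: sm100_cutlass_mla_decode() now also takes an `lse`
// tensor (and the CUDA < 12.4 stub gains the same `lse`/`sm_scale` arguments).
// In args_from_options() below, lse.data_ptr() is forwarded as the LSE
// (softmax log-sum-exp) output when lse.defined() is true; otherwise a null
// pointer is passed, matching the previous behaviour of never writing LSE.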
#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040 void sm100_cutlass_mla_decode( torch::Tensor const& out, + torch::Tensor const& lse, torch::Tensor const& q_nope, torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache, torch::Tensor const& seq_lens, torch::Tensor const& page_table, torch::Tensor const& workspace, + double sm_scale, int64_t num_kv_splits) { TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode"); } @@ -64,11 +66,11 @@ struct IsPersistent { static const bool value = v; }; -template > +template > struct MlaSm100 { using Element = T; using ElementAcc = float; - using ElementOut = T; + using ElementOut = TOut; using TileShape = Shape<_128, _128, Shape<_512, _64>>; using TileShapeH = cute::tuple_element_t<0, TileShape>; @@ -99,6 +101,7 @@ struct MlaSm100 { template typename T::Fmha::Arguments args_from_options( at::Tensor const& out, + at::Tensor const& lse, at::Tensor const& q_nope, at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache, @@ -162,12 +165,15 @@ typename T::Fmha::Arguments args_from_options( stride_PT, page_count_total, page_size}, - {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE}, + {static_cast(out.data_ptr()), + stride_O, + static_cast(lse.defined() ? lse.data_ptr() : nullptr), + stride_LSE}, hw_info, // TODO(trevor-m): Change split_kv back to -1 when // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will // perform worse with larger context length and smaller batch sizes. - num_kv_splits, // split_kv + static_cast(num_kv_splits), // split_kv nullptr, // is_var_split_kv }; // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute @@ -178,9 +184,10 @@ typename T::Fmha::Arguments args_from_options( return arguments; } -template +template void runMla( at::Tensor const& out, + at::Tensor const& lse, at::Tensor const& q_nope, at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache, @@ -190,9 +197,9 @@ void runMla( double sm_scale, int64_t num_kv_splits, cudaStream_t stream) { - using MlaSm100Type = MlaSm100; + using MlaSm100Type = MlaSm100; typename MlaSm100Type::Fmha fmha; - auto arguments = args_from_options(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits); + auto arguments = args_from_options(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits); CUTLASS_CHECK(fmha.can_implement(arguments)); @@ -214,6 +221,7 @@ void runMla( void sm100_cutlass_mla_decode( torch::Tensor const& out, + torch::Tensor const& lse, torch::Tensor const& q_nope, torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache, @@ -233,14 +241,14 @@ void sm100_cutlass_mla_decode( DISPATCH_BOOL(page_size == 128, IsPaged128, [&] { DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] { if (in_dtype == at::ScalarType::Half) { - runMla>( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + runMla>( + out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else if (in_dtype == at::ScalarType::BFloat16) { - runMla>( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); + runMla>( + out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else if (in_dtype == at::ScalarType::Float8_e4m3fn) { - runMla>( - out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, 
sm_scale, num_kv_splits, stream); + runMla>( + out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else { TORCH_CHECK(false, "Unsupported input data type of MLA"); } @@ -253,7 +261,7 @@ void sm100_cutlass_mla_decode( int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) { // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc) // which are float, so Element type here doesn't matter. - using MlaSm100Type = MlaSm100; + using MlaSm100Type = MlaSm100; // Get split kv. Requires problem shape and sm_count only. typename MlaSm100Type::Fmha::Arguments arguments; @@ -264,7 +272,7 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba // Assumes device 0 when getting sm_count. arguments.hw_info.sm_count = sm_count <= 0 ? cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count; - arguments.split_kv = num_kv_splits; + arguments.split_kv = static_cast(num_kv_splits); MlaSm100Type::Fmha::set_split_kv(arguments); return MlaSm100Type::Fmha::get_workspace_size(arguments); diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index 7a5ef10f8ef3..307300e55666 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -16,9 +16,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include "attention_kernels.cuh" -#include "cuda_compat.h" +#include "../cuda_compat.h" #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -75,7 +74,7 @@ void paged_attention_v1_launcher( const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int NUM_WARPS = NUM_THREADS / WARP_SIZE; int padded_max_seq_len = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; int logits_size = padded_max_seq_len * sizeof(float); diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index b45b28dad05e..eb9b4feb4a89 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -16,9 +16,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include "attention_kernels.cuh" -#include "cuda_compat.h" +#include "../cuda_compat.h" #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -79,7 +78,7 @@ void paged_attention_v2_launcher( const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + const int NUM_WARPS = NUM_THREADS / WARP_SIZE; int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); int logits_size = PARTITION_SIZE * sizeof(float); int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); diff --git a/csrc/cache.h b/csrc/cache.h index 0970b704be3a..b162a4a2bc31 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -40,9 +40,35 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe, void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, const double scale, const std::string& kv_cache_dtype); -void gather_cache( +void gather_and_maybe_dequant_cache( torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] torch::Tensor const& cu_seq_lens, // [BATCH+1] - int64_t batch_size, std::optional seq_starts = std::nullopt); \ No newline at end of file + int64_t batch_size, const std::string& kv_cache_dtype, + torch::Tensor const& scale, + std::optional seq_starts = std::nullopt); + +// TODO(hc): cp_gather_cache need support scaled kvcahe in the future. +void cp_gather_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + int64_t batch_size, std::optional seq_starts = std::nullopt); + +// Indexer K quantization and cache function +void indexer_k_quant_and_cache( + torch::Tensor& k, // [num_tokens, head_dim] + torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride] + torch::Tensor& slot_mapping, // [num_tokens] + int64_t quant_block_size, // quantization block size + const std::string& scale_fmt); + +// Extract function to gather quantized K cache +void cp_gather_indexer_k_quant_cache( + const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride] + torch::Tensor& dst_k, // [num_tokens, head_dim] + torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4] + const torch::Tensor& block_table, // [batch_size, num_blocks] + const torch::Tensor& cu_seq_lens); // [batch_size + 1] \ No newline at end of file diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 88559c8fe718..0aa0dc14c748 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -1,21 +1,22 @@ #include #include #include +#include #include "cuda_utils.h" #include "cuda_compat.h" #include "dispatch_utils.h" +#include "quantization/vectorization_utils.cuh" #ifdef USE_ROCM - #include "quantization/fp8/amd/quant_utils.cuh" + #include "quantization/w8a8/fp8/amd/quant_utils.cuh" #else - #include "quantization/fp8/nvidia/quant_utils.cuh" + #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh" #endif #include #include -#include -#include +#include #ifdef USE_ROCM #include @@ -207,6 +208,20 @@ void copy_blocks_mla(std::vector const& kv_caches, namespace vllm { +// Used to copy/convert one element +template +struct CopyWithScaleOp { + float scale; + + __device__ __forceinline__ void operator()(OutT& dst, const InT src) const { + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + dst = static_cast(src); + } else { + dst = fp8::scaled_convert(src, scale); + } + } +}; 
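// CopyWithScaleOp is the per-element body handed to vectorize_with_alignment
// in the cache-write kernels below: with kv_dt == kAuto it is a plain
// static_cast from the source dtype to the cache dtype and `scale` is unused
// (the launch sites pass 0.f); otherwise each element goes through
// fp8::scaled_convert with the per-tensor k/v scale that was dereferenced once
// before the vectorized copy.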
+ template __global__ void reshape_and_cache_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] @@ -222,42 +237,48 @@ __global__ void reshape_and_cache_kernel( const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; if (slot_idx < 0) { - // Padding token that should be ignored. return; } const int64_t block_idx = slot_idx / block_size; const int64_t block_offset = slot_idx % block_size; + const int h_block_count = head_size / x; // head_size//x - const int n = num_heads * head_size; - for (int i = threadIdx.x; i < n; i += blockDim.x) { - const int64_t src_key_idx = token_idx * key_stride + i; - const int64_t src_value_idx = token_idx * value_stride + i; - - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int x_idx = head_offset / x; - const int x_offset = head_offset % x; - - const int64_t tgt_key_idx = - block_idx * num_heads * (head_size / x) * block_size * x + - head_idx * (head_size / x) * block_size * x + x_idx * block_size * x + - block_offset * x + x_offset; - const int64_t tgt_value_idx = - block_idx * num_heads * head_size * block_size + - head_idx * head_size * block_size + head_offset * block_size + - block_offset; - scalar_t tgt_key = key[src_key_idx]; - scalar_t tgt_value = value[src_value_idx]; - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - key_cache[tgt_key_idx] = tgt_key; - value_cache[tgt_value_idx] = tgt_value; - } else { - key_cache[tgt_key_idx] = - fp8::scaled_convert(tgt_key, *k_scale); - value_cache[tgt_value_idx] = - fp8::scaled_convert(tgt_value, *v_scale); - } + const int h_block_idx = threadIdx.x; + if (h_block_idx >= num_heads * h_block_count) { + return; + } + + const int head_idx = h_block_idx / h_block_count; + const int h_block = h_block_idx % h_block_count; + + const scalar_t* __restrict__ key_src = + key + token_idx * key_stride + head_idx * head_size + h_block * x; + const int64_t src_value_start = + token_idx * value_stride + head_idx * head_size + h_block * x; + + cache_t* __restrict__ key_dst = + key_cache + block_idx * num_heads * h_block_count * block_size * x + + head_idx * h_block_count * block_size * x + h_block * block_size * x + + block_offset * x; + const int64_t tgt_value_start = + block_idx * num_heads * h_block_count * x * block_size + + head_idx * h_block_count * x * block_size + h_block * x * block_size + + block_offset; + + constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4; + float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale; + CopyWithScaleOp k_op{k_scale_val}; + float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 
0.f : *v_scale; + CopyWithScaleOp v_op{v_scale_val}; + + vectorize_with_alignment(key_src, key_dst, x, 0, 1, k_op); + + const scalar_t* __restrict__ value_src = value + src_value_start; + cache_t* __restrict__ value_dst = value_cache + tgt_value_start; +#pragma unroll + for (int i = 0; i < x; i++) { + v_op(value_dst[i * block_size], value_src[i]); } } @@ -265,10 +286,8 @@ template __global__ void reshape_and_cache_flash_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, - // head_size] - cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, - // head_size] + cache_t* __restrict__ key_cache, // NHD or HND, shape see comments below + cache_t* __restrict__ value_cache, // same above const int64_t* __restrict__ slot_mapping, // [num_tokens] const int64_t block_stride, const int64_t page_stride, const int64_t head_stride, const int64_t key_stride, @@ -282,25 +301,58 @@ __global__ void reshape_and_cache_flash_kernel( } const int64_t block_idx = slot_idx / block_size; const int64_t block_offset = slot_idx % block_size; - const int n = num_heads * head_size; - for (int i = threadIdx.x; i < n; i += blockDim.x) { - const int64_t src_key_idx = token_idx * key_stride + i; - const int64_t src_value_idx = token_idx * value_stride + i; - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int64_t tgt_key_value_idx = block_idx * block_stride + - block_offset * page_stride + - head_idx * head_stride + head_offset; - scalar_t tgt_key = key[src_key_idx]; - scalar_t tgt_value = value[src_value_idx]; - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - key_cache[tgt_key_value_idx] = tgt_key; - value_cache[tgt_key_value_idx] = tgt_value; - } else { - key_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_key, *k_scale); - value_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_value, *v_scale); + const int n_elems = num_heads * head_size; + + // pointers to the beginning of the source row for this token. + const scalar_t* __restrict__ key_src = key + token_idx * key_stride; + const scalar_t* __restrict__ value_src = value + token_idx * value_stride; + + // find the start position inside the kv-cache for this token. + cache_t* __restrict__ key_dst = + key_cache + block_idx * block_stride + block_offset * page_stride; + cache_t* __restrict__ value_dst = + value_cache + block_idx * block_stride + block_offset * page_stride; + + // this is true for the NHD layout where `head_stride == head_size` + const bool is_contiguous_heads = (head_stride == head_size); + + float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale; + float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale; + constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 
8 : 4; + CopyWithScaleOp k_op{k_scale_val}; + CopyWithScaleOp v_op{v_scale_val}; + if (is_contiguous_heads) { + // NHD layout + // kv cache: [num_blocks, block_size, num_heads, head_size] + vectorize_with_alignment(key_src, key_dst, n_elems, threadIdx.x, + blockDim.x, k_op); + + vectorize_with_alignment(value_src, value_dst, n_elems, + threadIdx.x, blockDim.x, v_op); + + } else { + // HND layout: heads are strided, but each head_size segment is contiguous + // kv cache: [num_blocks, num_heads, block_size, head_size] + const int lane = threadIdx.x & 31; // 0..31 within warp + const int warp_id = threadIdx.x >> 5; // warp index within block + const int warps_per_block = blockDim.x >> 5; + + for (int head = warp_id; head < num_heads; head += warps_per_block) { + const scalar_t* __restrict__ k_src_h = key_src + head * head_size; + const scalar_t* __restrict__ v_src_h = value_src + head * head_size; + + cache_t* __restrict__ k_dst_h = + key_dst + static_cast(head) * head_stride; + cache_t* __restrict__ v_dst_h = + value_dst + static_cast(head) * head_stride; + + // within each head, let the 32 threads of the warp perform the vector + // copy + vectorize_with_alignment(k_src_h, k_dst_h, head_size, lane, 32, + k_op); + + vectorize_with_alignment(v_src_h, v_dst_h, head_size, lane, 32, + v_op); } } } @@ -349,6 +401,241 @@ __global__ void concat_and_cache_mla_kernel( copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); } +template +__global__ void concat_and_cache_ds_mla_kernel( + const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank] + const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim] + cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank + // + pe_dim)] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int block_stride, // + const int entry_stride, // + const int kv_c_stride, // + const int k_pe_stride, // + const int kv_lora_rank, // + const int pe_dim, // + const int block_size, // + const float* scale // +) { + const int64_t token_idx = blockIdx.x; + const int64_t slot_idx = slot_mapping[token_idx]; + // NOTE: slot_idx can be -1 if the token is padded + if (slot_idx < 0) { + return; + } + const int64_t block_idx = slot_idx / block_size; + const int64_t block_offset = slot_idx % block_size; + const int64_t dst_idx_start = + block_idx * block_stride + block_offset * entry_stride; + + // For the NoPE part, each tile of 128 elements is handled by half of one warp + // (16 threads). There are 4 total tiles, so 2 warps (64 threads). + // Lanes 0 and 16 of each warp write the scale values for that warp's tiles. + // The RoPE part (last 64 elements) is handled by another 1 warp (32 threads). + // So in total, we use 3 warps (96 threads) per block. 
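  // Worked thread mapping for the 4 tiles of 128 NoPE elements (kv_lora_rank =
  // 512) plus 64 RoPE elements:
  //   threads  0-15 -> tile 0 (NoPE elements   0..127), thread t covers [8t, 8t+8)
  //   threads 16-31 -> tile 1 (elements 128..255)
  //   threads 32-47 -> tile 2 (elements 256..383)
  //   threads 48-63 -> tile 3 (elements 384..511)
  //   threads 64-95 -> RoPE,  thread t covers k_pe elements [2*(t-64), 2*(t-64)+2)
  // Lanes 0 and 16 of warps 0 and 1 emit the four per-tile float scales.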
+ + // Cast kv_cache to 16_bit for RoPE values + scalar_t* kv_cache_16bit = + reinterpret_cast(&kv_cache[dst_idx_start]); + + // The last warp handles the RoPE part + if (threadIdx.x >= 64) { + // Each thread handles two elements of RoPE + const int8_t pe_idx_start = (threadIdx.x - 64) * 2; + const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start; + // Vectorized load of two 16-bit values, performed as one 32-bit load + const int32_t vals = *reinterpret_cast(&k_pe[src_idx]); + // RoPE values start after the packed 8-bit NoPE values and the + // 32-bit scales + const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start; + // Vectorized store of two 16-bit values, performed as one 32-bit store + *reinterpret_cast(&kv_cache_16bit[dst_idx]) = vals; + return; + } + + // The first two warps handle the NoPE part + const int8_t warp_idx = threadIdx.x >> 5; + const int8_t lane_idx = threadIdx.x & 31; + const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4); + + // Each thread handles 8 elements of NoPE + // Load the NoPE elements for this thread into registers + const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8); + // Vectorized load of eight 16-bit values, performed as an int4 load + const int4 vals_i4 = *reinterpret_cast(&kv_c[src_idx_start]); + const scalar_t* vals = reinterpret_cast(&vals_i4); + + // Max absolute value of this thread's elements + float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])), + fmaxf(fabsf(vals[2]), fabsf(vals[3]))), + fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])), + fmaxf(fabsf(vals[6]), fabsf(vals[7])))); + + // Warp-level reduction to find the max absolute value in each half-warp +#pragma unroll + for (int offset = 8; offset > 0; offset /= 2) { + max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16)); + } + + // Compute the scale for the tile + float tile_scale = max_abs / 448.f; + tile_scale = fmaxf(tile_scale, FLT_MIN); + + // The first lane of each half-warp writes the scale to kv_cache + if ((lane_idx == 0) || (lane_idx == 16)) { + float* kv_cache_32bit = reinterpret_cast(&kv_cache[dst_idx_start]); + const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx; + kv_cache_32bit[dst_idx] = tile_scale; + } + + // Now all threads in the block scale and write their elements + // NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes) + const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8); + + uint8_t result[8]; +#pragma unroll + for (int i = 0; i < 8; i++) { + result[i] = + fp8::scaled_convert( + vals[i], tile_scale); + } + + // Store as aligned 64-bit writes + *reinterpret_cast(&kv_cache[dst_idx_base]) = + *reinterpret_cast(result); +} + +template +__global__ void indexer_k_quant_and_cache_kernel( + const scalar_t* __restrict__ k, // [num_tokens, head_dim] + cache_t* __restrict__ kv_cache, // [num_blocks, block_size, cache_stride] + const int64_t* __restrict__ slot_mapping, // [num_tokens] + const int head_dim, // dimension of each head + const int quant_block_size, // quantization block size + const int cache_block_size, // cache block size + const int cache_stride, // stride for each token in kv_cache + const bool use_ue8m0 // use ue8m0 scale format +) { + constexpr int VEC_SIZE = 4; + const int64_t token_idx = blockIdx.x; + const int64_t head_dim_idx = (blockIdx.y * blockDim.y * blockDim.x + + threadIdx.y * blockDim.x + threadIdx.x) * + VEC_SIZE; + const int64_t slot_idx = slot_mapping[token_idx]; + const int64_t block_idx = slot_idx / cache_block_size; + const int64_t block_offset = 
slot_idx % cache_block_size; + + // NOTE: slot_idx can be -1 if the token is padded + if (slot_idx < 0 || (head_dim_idx >= head_dim)) { + return; + } + + float2 k_val = (reinterpret_cast( + k))[(token_idx * head_dim + head_dim_idx) / VEC_SIZE]; + scalar_t* k_val_ptr = reinterpret_cast(&k_val); + float amax = 0.0f; + for (int i = 0; i < VEC_SIZE; i++) { + amax = fmaxf(amax, fabsf(float(k_val_ptr[i]))); + } +#ifndef USE_ROCM + __syncwarp(); +#endif + + // Reduced amax + for (int mask = 16; mask > 0; mask /= 2) { +#ifdef USE_ROCM + amax = fmaxf(amax, __shfl_xor_sync(uint64_t(-1), amax, mask)); +#else + amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask)); +#endif + } +#ifndef USE_ROCM + __syncwarp(); +#endif + float scale = fmaxf(amax, 1e-4) / 448.0f; + if (use_ue8m0) { + scale = exp2f(ceilf(log2f(scale))); + } + + const int64_t dst_offset = block_idx * cache_block_size * cache_stride + + block_offset * head_dim + head_dim_idx; + for (int i = 0; i < VEC_SIZE; i++) { + kv_cache[dst_offset + i] = + fp8::scaled_convert(k_val_ptr[i], scale); + } + if (threadIdx.x == 0) { + const int64_t dst_scale_idx = + block_idx * cache_block_size * cache_stride + + cache_block_size * head_dim + + (block_offset * head_dim + head_dim_idx) * 4 / quant_block_size; + reinterpret_cast(kv_cache)[dst_scale_idx / 4] = scale; + } +} + +template +__global__ void cp_gather_indexer_k_quant_cache_kernel( + const char* __restrict__ kv_cache, // [num_blocks, block_size, + // cache_stride] + char* __restrict__ dst_k, // [num_tokens, head_dim] + char* __restrict__ dst_scale, // [num_tokens, head_dim / quant_block_size * + // 4] + const int* __restrict__ block_table, // [batch_size, num_blocks] + const int* __restrict__ cu_seq_lens, // [batch_size + 1] + const int batch_size, // batch size + const int64_t token_stride, // stride for each token in dst_k + const int64_t head_dim, // dimension of each head + const int64_t block_stride, // stride for each block in kv_cache + const int64_t cache_token_stride, // stride for each token in kv_cache + const int64_t cache_block_size, // num_tokens for each block in kv_cache + const int num_blocks, // number of blocks + const int num_tokens, // number of tokens + const int quant_block_size // quantization block size +) { + constexpr int VEC_SIZE = sizeof(float4) / sizeof(char); + const int token_idx = blockIdx.x * blockDim.y + threadIdx.y; + const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE; + // Find batch index within a block + __shared__ int batch_idx[BLOCK_Y_SIZE]; + for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x)); + iter++) { + int tid = iter * blockDim.x + threadIdx.x; + if (tid < batch_size) { + const int seq_start = cu_seq_lens[tid]; + const int seq_end = cu_seq_lens[tid + 1]; + if (token_idx >= seq_start && token_idx < seq_end) { + batch_idx[threadIdx.y] = tid; + } + } + } + +#ifndef USE_ROCM + __syncwarp(); +#endif + + if (head_idx >= head_dim || token_idx >= num_tokens) { + return; + } + const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]]; + const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks + + inbatch_seq_idx / cache_block_size]; + const int64_t src_block_offset = block_idx * block_stride; + const int64_t cache_inblock_offset = + (inbatch_seq_idx % cache_block_size) * head_dim + head_idx; + const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset; + const int64_t dst_inblock_offset = token_idx * token_stride + head_idx; + + 
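+  // Illustrative note on the copies below: VEC_SIZE is sizeof(float4) == 16,
+  // so each thread moves 16 one-byte quantized K values with a single float4
+  // load/store, and the threadIdx.x == 0 thread additionally copies the
+  // 4-byte fp32 scale for its group from the scale region that starts at
+  // byte offset cache_block_size * head_dim within the source block.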
reinterpret_cast(dst_k)[dst_inblock_offset / VEC_SIZE] = + reinterpret_cast(kv_cache)[src_inblock_offset / VEC_SIZE]; + ; + if (threadIdx.x == 0) { + const int64_t src_scale_offset = + src_block_offset + cache_block_size * head_dim + + cache_inblock_offset * 4 / quant_block_size; + reinterpret_cast(dst_scale)[dst_inblock_offset / quant_block_size] = + reinterpret_cast(kv_cache)[src_scale_offset / 4]; + } +} + } // namespace vllm // KV_T is the data type of key and value tensors. @@ -384,14 +671,15 @@ void reshape_and_cache( int key_stride = key.stride(0); int value_stride = value.stride(0); + int head_div_x = head_size / x; dim3 grid(num_tokens); - dim3 block(std::min(num_heads * head_size, 512)); + dim3 block(std::min(num_heads * head_div_x, 512)); const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, - CALL_RESHAPE_AND_CACHE) + CALL_RESHAPE_AND_CACHE); } // KV_T is the data type of key and value tensors. @@ -462,6 +750,18 @@ void reshape_and_cache_flash( kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ reinterpret_cast(scale.data_ptr())); +// KV_T is the data type of key and value tensors. +// CACHE_T is the stored data type of kv-cache. +#define CALL_CONCAT_AND_CACHE_DS_MLA(KV_T, CACHE_T, KV_DTYPE) \ + vllm::concat_and_cache_ds_mla_kernel \ + <<>>( \ + reinterpret_cast(kv_c.data_ptr()), \ + reinterpret_cast(k_pe.data_ptr()), \ + reinterpret_cast(kv_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, entry_stride, \ + kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ + reinterpret_cast(scale.data_ptr())); + void concat_and_cache_mla( torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] torch::Tensor& k_pe, // [num_tokens, pe_dim] @@ -484,20 +784,43 @@ void concat_and_cache_mla( int pe_dim = k_pe.size(1); int block_size = kv_cache.size(1); - TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); + if (kv_cache_dtype == "fp8_ds_mla") { + TORCH_CHECK(kv_lora_rank == 512, "kv_lora_rank must be 512 for fp8_ds_mla"); + TORCH_CHECK(pe_dim == 64, "pe_dim must be 64 for fp8_ds_mla"); + TORCH_CHECK(kv_cache.size(2) == 656 / kv_cache.itemsize(), + "kv_cache.size(2) must be 656 bytes for fp8_ds_mla"); + TORCH_CHECK(kv_c.itemsize() == 2, + "kv_c.itemsize() must be 2 for fp8_ds_mla"); + TORCH_CHECK(k_pe.itemsize() == 2, + "k_pe.itemsize() must be 2 for fp8_ds_mla"); + } else { + TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); + } int kv_c_stride = kv_c.stride(0); int k_pe_stride = k_pe.stride(0); int block_stride = kv_cache.stride(0); int entry_stride = kv_cache.stride(1); - dim3 grid(num_tokens); - dim3 block(std::min(kv_lora_rank, 512)); const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, - CALL_CONCAT_AND_CACHE_MLA); + if (kv_cache_dtype == "fp8_ds_mla") { + dim3 grid(num_tokens); + // For the NoPE part, each tile of 128 elements is handled by half of one + // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads). + // Lanes 0 and 16 of each warp write the scale values for that warp's tiles. + // The RoPE part (last 64 elements) is handled by another 1 warp (32 + // threads). So in total, we use 3 warps (96 threads) per block. 
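+    // Byte budget (illustrative): 512 * 1 fp8 NoPE bytes + 4 * 4 scale bytes
+    // + 64 * 2 RoPE bytes = 656 bytes per entry, which is what the
+    // kv_cache.size(2) == 656 / kv_cache.itemsize() check above enforces.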
+ dim3 block(96); + DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, + CALL_CONCAT_AND_CACHE_DS_MLA); + } else { + dim3 grid(num_tokens); + dim3 block(std::min(kv_lora_rank, 512)); + DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, + CALL_CONCAT_AND_CACHE_MLA); + } } namespace vllm { @@ -578,9 +901,9 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, namespace vllm { // grid is launched with dimensions (batch, num_splits) -template -__global__ void gather_cache( - const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, +template +__global__ void gather_and_maybe_dequant_cache( + const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, // ENTRIES...] scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...] const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] @@ -588,6 +911,7 @@ __global__ void gather_cache( const int32_t block_size, const int32_t entry_size, const int64_t block_table_stride, const int64_t cache_block_stride, const int64_t cache_entry_stride, const int64_t dst_entry_stride, + const float* __restrict__ scale, const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per // batch @@ -629,10 +953,16 @@ __global__ void gather_cache( if (partial_block_size) full_blocks_end -= 1; } - auto copy_entry = [&](const scalar_t* __restrict__ _src, + auto copy_entry = [&](const cache_t* __restrict__ _src, scalar_t* __restrict__ _dst) { - for (int i = threadIdx.x; i < entry_size; i += blockDim.x) - _dst[i] = _src[i]; + for (int i = threadIdx.x; i < entry_size; i += blockDim.x) { + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + _dst[i] = static_cast(_src[i]); + } else { + _dst[i] = + fp8::scaled_convert(_src[i], *scale); + } + } }; for (int pid = split_start; pid < full_blocks_end; ++pid) { @@ -659,8 +989,144 @@ __global__ void gather_cache( } // namespace vllm // Macro to dispatch the kernel based on the data type. -#define CALL_GATHER_CACHE(CPY_DTYPE) \ - vllm::gather_cache<<>>( \ +// SCALAR_T is the data type of the destination tensor. +// CACHE_T is the stored data type of kv-cache. +// KV_DTYPE is the real data type of kv-cache. +#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \ + vllm::gather_and_maybe_dequant_cache \ + <<>>( \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst.data_ptr()), \ + block_table.data_ptr(), cu_seq_lens.data_ptr(), \ + block_size, entry_size, block_table_stride, cache_block_stride, \ + cache_entry_stride, dst_entry_stride, \ + reinterpret_cast(scale.data_ptr()), seq_starts_ptr); + +// Gather sequences from the cache into the destination tensor. +// - cu_seq_lens contains the cumulative sequence lengths for each batch +// - block_table contains the cache block indices for each sequence +// - Optionally, seq_starts (if provided) offsets the starting block index by +// (seq_starts[bid] / page_size) +void gather_and_maybe_dequant_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] 
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + int64_t batch_size, const std::string& kv_cache_dtype, + torch::Tensor const& scale, + std::optional seq_starts = std::nullopt) { + at::cuda::OptionalCUDAGuard device_guard(src_cache.device()); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + int32_t block_size = src_cache.size(1); + int32_t entry_size = src_cache.flatten(2, -1).size(2); + + TORCH_CHECK(block_table.dtype() == torch::kInt32, + "block_table must be int32"); + TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32, + "cu_seq_lens must be int32"); + if (seq_starts.has_value()) { + TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32, + "seq_starts must be int32"); + } + + TORCH_CHECK(src_cache.device() == dst.device(), + "src_cache and dst must be on the same device"); + TORCH_CHECK(src_cache.device() == block_table.device(), + "src_cache and block_table must be on the same device"); + TORCH_CHECK(src_cache.device() == cu_seq_lens.device(), + "src_cache and cu_seq_lens must be on the same device"); + if (seq_starts.has_value()) { + TORCH_CHECK(src_cache.device() == seq_starts.value().device(), + "src_cache and seq_starts must be on the same device"); + } + + int64_t block_table_stride = block_table.stride(0); + int64_t cache_block_stride = src_cache.stride(0); + int64_t cache_entry_stride = src_cache.stride(1); + int64_t dst_entry_stride = dst.stride(0); + + // Decide on the number of splits based on the batch size. + int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16; + dim3 grid(batch_size, num_splits); + dim3 block(1024); + + const int32_t* seq_starts_ptr = + seq_starts.has_value() ? seq_starts.value().data_ptr() : nullptr; + + DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE); +} + +namespace vllm { +template +// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by +// block_size. +__global__ void cp_gather_cache( + const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, + // ENTRY_SIZE] + scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRY_SIZE] + const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] + const int32_t* __restrict__ cu_seq_lens, // [BATCH+1] + const int32_t block_size, const int32_t entry_size, + const int64_t block_table_stride, const int64_t cache_block_stride, + const int64_t cache_entry_stride, const int64_t dst_entry_stride, + const int32_t* __restrict__ seq_starts // Optional: starting offsets per + // batch +) { + const int64_t bid = blockIdx.x; // Batch ID + const int32_t num_splits = gridDim.y; + const int32_t split = blockIdx.y; + const int32_t seq_start = cu_seq_lens[bid]; + const int32_t seq_end = cu_seq_lens[bid + 1]; + const int32_t seq_len = seq_end - seq_start; + const int32_t tot_slots = seq_len; + const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits); + + const int32_t split_start = split * split_slots; + const int32_t split_end = min((split + 1) * split_slots, tot_slots); + + const bool is_active_split = (split_start < tot_slots); + + if (!is_active_split) return; + + // Adjust the pointer for the block_table for this batch. 
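+  // (Worked example, illustrative: with block_size = 64 and
+  // seq_starts[bid] = 100, a split starting at slot 0 begins at
+  // block_table[bid][100 / 64] with in-block offset 100 % 64 = 36,
+  // exactly as the arithmetic below computes.)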
+ // If seq_starts is provided, compute an offset based on it + const int32_t batch_offset = bid * block_table_stride; + int32_t offset = split_start; + if (seq_starts != nullptr) { + offset += seq_starts[bid]; + } + int32_t offset_div = offset / block_size; + offset = offset % block_size; + const int32_t* batch_block_table = block_table + batch_offset; + + // Adjust dst pointer based on the cumulative sequence lengths. + dst += seq_start * dst_entry_stride; + + auto copy_entry = [&](const scalar_t* __restrict__ _src, + scalar_t* __restrict__ _dst) { + for (int i = threadIdx.x; i < entry_size; i += blockDim.x) + _dst[i] = _src[i]; + }; + + for (int pid = split_start; pid < split_end; ++pid) { + auto block_id = batch_block_table[offset_div]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = dst + pid * dst_entry_stride; + copy_entry(block_start_ptr + offset * cache_entry_stride, block_dst_ptr); + offset += 1; + // bump to next block + if (offset == block_size) { + offset_div += 1; + offset = 0; + } + } +} +} // namespace vllm + +// Macro to dispatch the kernel based on the data type. +#define CALL_CP_GATHER_CACHE(CPY_DTYPE) \ + vllm::cp_gather_cache<<>>( \ reinterpret_cast(src_cache.data_ptr()), \ reinterpret_cast(dst.data_ptr()), \ block_table.data_ptr(), cu_seq_lens.data_ptr(), \ @@ -670,9 +1136,9 @@ __global__ void gather_cache( // Gather sequences from the cache into the destination tensor. // - cu_seq_lens contains the cumulative sequence lengths for each batch // - block_table contains the cache block indices for each sequence -// - Optionally, seq_starts (if provided) offsets the starting block index by -// (seq_starts[bid] / page_size) -void gather_cache( +// - Optionally, seq_starts (if provided) offsets the starting slot index by +// seq_starts[bid] +void cp_gather_cache( torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] @@ -723,12 +1189,107 @@ void gather_cache( seq_starts.has_value() ? seq_starts.value().data_ptr() : nullptr; if (dtype_bits == 32) { - CALL_GATHER_CACHE(uint32_t); + CALL_CP_GATHER_CACHE(uint32_t); } else if (dtype_bits == 16) { - CALL_GATHER_CACHE(uint16_t); + CALL_CP_GATHER_CACHE(uint16_t); } else if (dtype_bits == 8) { - CALL_GATHER_CACHE(uint8_t); + CALL_CP_GATHER_CACHE(uint8_t); } else { TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits); } } + +// Macro to dispatch the kernel based on the data type. 
+#define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ + vllm::indexer_k_quant_and_cache_kernel \ + <<>>( \ + reinterpret_cast(k.data_ptr()), \ + reinterpret_cast(kv_cache.data_ptr()), \ + slot_mapping.data_ptr(), head_dim, quant_block_size, \ + cache_block_size, cache_stride, use_ue8m0); + +void indexer_k_quant_and_cache( + torch::Tensor& k, // [num_tokens, head_dim] + torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride] + torch::Tensor& slot_mapping, // [num_tokens] + int64_t quant_block_size, // quantization block size + const std::string& scale_fmt) { + int num_tokens = k.size(0); + int head_dim = k.size(1); + int cache_block_size = kv_cache.size(1); + int cache_stride = kv_cache.size(2); + bool use_ue8m0 = scale_fmt == "ue8m0"; + + TORCH_CHECK(k.device() == kv_cache.device(), + "k and kv_cache must be on the same device"); + TORCH_CHECK(k.device() == slot_mapping.device(), + "k and slot_mapping must be on the same device"); + TORCH_CHECK(head_dim % quant_block_size == 0, + "head_dim must be divisible by quant_block_size"); + + constexpr int vec_size = 4; + dim3 grid(num_tokens, (head_dim + quant_block_size * vec_size - 1) / + (quant_block_size * vec_size)); + dim3 block(32, vec_size); + const at::cuda::OptionalCUDAGuard device_guard(device_of(k)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3", + CALL_INDEXER_K_QUANT_AND_CACHE); +} + +// Macro to dispatch the kernel based on the data amount. +#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE) \ + vllm::cp_gather_indexer_k_quant_cache_kernel \ + <<>>( \ + reinterpret_cast(kv_cache.data_ptr()), \ + reinterpret_cast(dst_k.data_ptr()), \ + reinterpret_cast(dst_scale.data_ptr()), \ + block_table.data_ptr(), cu_seq_lens.data_ptr(), \ + batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0), \ + kv_cache.stride(1), kv_cache.size(1), block_table.size(1), \ + num_tokens, quant_block_size); + +void cp_gather_indexer_k_quant_cache( + const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride] + torch::Tensor& dst_k, // [num_tokens, head_dim] + torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4] + const torch::Tensor& block_table, // [batch_size, num_blocks] + const torch::Tensor& cu_seq_lens // [batch_size + 1] +) { + int batch_size = block_table.size(0); + int num_tokens = dst_k.size(0); + int head_dim = dst_k.size(1); + int quant_block_size = head_dim * 4 / dst_scale.size(1); + + TORCH_CHECK(kv_cache.device() == dst_k.device(), + "kv_cache and dst_k must be on the same device"); + TORCH_CHECK(kv_cache.device() == dst_scale.device(), + "kv_cache and dst_scale must be on the same device"); + TORCH_CHECK(kv_cache.device() == block_table.device(), + "kv_cache and block_table must be on the same device"); + TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(), + "kv_cache and cu_seq_lens must be on the same device"); + TORCH_CHECK(head_dim % quant_block_size == 0, + "head_dim must be divisible by quant_block_size"); + + constexpr int vec_size = 16; + const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (num_tokens < 32) { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1); + } else if (num_tokens < 64) { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2); + } else if (num_tokens < 128) { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4); + } else if (num_tokens < 256) { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8); + } else if (num_tokens < 512) 
{ + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16); + } else { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32); + } +} diff --git a/csrc/core/batch_invariant.hpp b/csrc/core/batch_invariant.hpp new file mode 100644 index 000000000000..fffe96b86857 --- /dev/null +++ b/csrc/core/batch_invariant.hpp @@ -0,0 +1,19 @@ +#pragma once +#include +#include +#include + +namespace vllm { + +// vllm_is_batch_invariant(); returns true +// if env VLLM_BATCH_INVARIANT=1 +inline bool vllm_is_batch_invariant() { + static bool cached = []() { + std::string env_key = "VLLM_BATCH_INVARIANT"; + const char* val = std::getenv(env_key.c_str()); + return (val && std::atoi(val) != 0) ? 1 : 0; + }(); + return cached; +} + +} // namespace vllm diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp index d0f85e23609b..68a8750f583b 100644 --- a/csrc/core/scalar_type.hpp +++ b/csrc/core/scalar_type.hpp @@ -321,6 +321,8 @@ static inline constexpr auto kFE3M2f = ScalarType::float_(3, 2, true, ScalarType::NAN_NONE); static inline constexpr auto kFE4M3fn = ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN); +static inline constexpr auto kFE8M0fnu = + ScalarType(8, 0, false, 0, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN); static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2); static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7); static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10); diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 17bbe04eef94..9cdcd2edacfd 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -14,7 +14,12 @@ // arm implementation #include "cpu_types_arm.hpp" #else - #warning "unsupported vLLM cpu implementation" + #warning "unsupported vLLM cpu implementation, vLLM will compile with scalar" + #include "cpu_types_scalar.hpp" +#endif + +#ifdef _OPENMP + #include #endif #endif \ No newline at end of file diff --git a/csrc/cpu/cpu_types_scalar.hpp b/csrc/cpu/cpu_types_scalar.hpp new file mode 100644 index 000000000000..1a9278bc662e --- /dev/null +++ b/csrc/cpu/cpu_types_scalar.hpp @@ -0,0 +1,513 @@ +#include +#include +#include +#include +#include "float_convert.hpp" + +namespace vec_op { + +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) +#else + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +#define __max(a, b) ((a) > (b) ? (a) : (b)) +#define __min(a, b) ((a) < (b) ? (a) : (b)) +#define __abs(a) ((a) < (0) ? 
(0 - a) : (a)) + +typedef struct f16x8_t { + uint16_t val[8]; +} f16x8_t; + +typedef struct f16x16_t { + uint16_t val[16]; +} f16x16_t; + +typedef struct f16x32_t { + uint16_t val[32]; +} f16x32_t; + +typedef struct f32x4_t { + float val[4]; +} f32x4_t; + +typedef struct f32x8_t { + float val[8]; +} f32x8_t; + +typedef struct f32x16_t { + float val[16]; +} f32x16_t; + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +}; +}; // namespace + +template > > +constexpr void unroll_loop(F&& f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template +struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +struct FP32Vec8; +struct FP32Vec16; + +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + f16x8_t reg; + + explicit FP16Vec8(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit FP16Vec8(const FP32Vec8&); + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + f16x16_t reg; + + explicit FP16Vec16(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit FP16Vec16(const FP32Vec16&); + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } + + void save(void* ptr, const int elem_num) const { + int num = __min(elem_num, VEC_ELEM_NUM); + std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t)); + } +}; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + f16x8_t reg; + + explicit BF16Vec8(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec8(const FP32Vec8&); + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + f16x16_t reg; + + explicit BF16Vec16(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec16(const FP32Vec16&); + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } + + void save(void* ptr, const int elem_num) const { + int num = __min(elem_num, VEC_ELEM_NUM); + std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t)); + } +}; + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + f16x32_t reg; + + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec32(f16x32_t data) : reg(data) {}; + + explicit BF16Vec32(BF16Vec8& vec8_data) { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM]; + } + } + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + + f32x4_t reg; + + explicit FP32Vec4(float v) { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = v; + } + } + + explicit FP32Vec4() { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = 0.0f; + } + } + + explicit FP32Vec4(const float* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit FP32Vec4(f32x4_t data) : reg(data) {}; + + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}; +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + f32x8_t reg; + + explicit FP32Vec8(float v) { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = v; + } + } + + explicit FP32Vec8() { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = 0.0f; + } + } + + explicit FP32Vec8(const float* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit FP32Vec8(f32x8_t data) : 
reg(data) {}; + + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; + + explicit FP32Vec8(const FP16Vec8& v) { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = fp16_to_float(v.reg.val[i]); + } + } + + FP32Vec8(const BF16Vec8& v) { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = bf16_to_float(v.reg.val[i]); + } + } + + float reduce_sum() const { + float result = 0; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result += reg.val[i]; + } + return result; + } + + FP32Vec8 exp() const { + f32x8_t ret; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + ret.val[i] = expf(reg.val[i]); + } + return FP32Vec8(ret); + } + + FP32Vec8 tanh() const { + f32x8_t ret; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + ret.val[i] = tanhf(reg.val[i]); + } + return FP32Vec8(ret); + } + + FP32Vec8 er() const { + f32x8_t ret; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + ret.val[i] = erf(reg.val[i]); + } + return FP32Vec8(ret); + } + + FP32Vec8 operator*(const FP32Vec8& b) const { + f32x8_t ret; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + ret.val[i] = reg.val[i] * b.reg.val[i]; + } + return FP32Vec8(ret); + } + + FP32Vec8 operator+(const FP32Vec8& b) const { + f32x8_t ret; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + ret.val[i] = reg.val[i] + b.reg.val[i]; + } + return FP32Vec8(ret); + } + + FP32Vec8 operator-(const FP32Vec8& b) const { + f32x8_t ret; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + ret.val[i] = reg.val[i] - b.reg.val[i]; + } + return FP32Vec8(ret); + } + + FP32Vec8 operator/(const FP32Vec8& b) const { + f32x8_t ret; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + ret.val[i] = reg.val[i] / b.reg.val[i]; + } + return FP32Vec8(ret); + } + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + f32x16_t reg; + + explicit FP32Vec16(float v) { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = v; + } + } + + explicit FP32Vec16() { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = 0.0f; + } + } + + explicit FP32Vec16(const float* ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit FP32Vec16(f32x16_t data) : reg(data) {}; + + FP32Vec16(const FP32Vec4& data) { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM]; + } + } + + FP32Vec16(const FP32Vec8& data) { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM]; + } + } + + FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}; + + explicit FP32Vec16(const FP16Vec16& v) { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = fp16_to_float(v.reg.val[i]); + } + } + + explicit FP32Vec16(const BF16Vec16& v) { + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + reg.val[i] = bf16_to_float(v.reg.val[i]); + } + } + + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; + + FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; + + FP32Vec16 operator*(const FP32Vec16& b) const { + FP32Vec16 result(0.0f); + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result.reg.val[i] = reg.val[i] * b.reg.val[i]; + } + return result; + } + + FP32Vec16 operator+(const FP32Vec16& b) const { + FP32Vec16 result(0.0f); + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result.reg.val[i] = reg.val[i] + b.reg.val[i]; + } + return result; + } + + FP32Vec16 operator-(const FP32Vec16& b) const { + FP32Vec16 result(0.0f); + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result.reg.val[i] = reg.val[i] - b.reg.val[i]; + } + return result; + } + + FP32Vec16 operator/(const FP32Vec16& b) 
const { + FP32Vec16 result(0.0f); + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result.reg.val[i] = reg.val[i] / b.reg.val[i]; + } + return result; + } + + FP32Vec16 max(const FP32Vec16& b) const { + FP32Vec16 result(0.0f); + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result.reg.val[i] = __max(reg.val[i], b.reg.val[i]); + } + return result; + } + + FP32Vec16 min(const FP32Vec16& b) const { + FP32Vec16 result(0.0f); + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result.reg.val[i] = __min(reg.val[i], b.reg.val[i]); + } + return result; + } + + FP32Vec16 abs() const { + FP32Vec16 result(0.0f); + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result.reg.val[i] = __abs(reg.val[i]); + } + return result; + } + + float reduce_sum() const { + float result = 0.0f; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result += reg.val[i]; + } + return result; + } + + float reduce_max() const { + float result = reg.val[0]; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result = __max(reg.val[i], result); + } + return result; + } + + float reduce_min() const { + float result = reg.val[0]; + for (int i = 0; i < VEC_ELEM_NUM; ++i) { + result = __min(reg.val[i], result); + } + return result; + } + + template + float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + float sum = 0.0; + int start = idx * group_size; + int end = (idx + 1) * group_size; + + for (; (start < VEC_ELEM_NUM) && (start < end); ++start) { + sum += reg.val[start]; + } + + return sum; + } + + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +template +struct VecType { + using vec_type = void; +}; + +template +using vec_t = typename VecType::vec_type; + +template <> +struct VecType { + using vec_type = FP32Vec8; +}; + +template <> +struct VecType { + using vec_type = FP16Vec8; +}; + +template <> +struct VecType { + using vec_type = BF16Vec8; +}; + +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} + +/* +template <> inline void storeFP32(float v, c10::Half *ptr) { + c10::Half __attribute__((__may_alias__)) *v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} +*/ + +template <> +inline void storeFP32(float v, c10::Half* ptr) { + uint16_t fp16 = float_to_fp16(v); + *reinterpret_cast(ptr) = fp16; +} + +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { + int i = 0; + for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) { + reg.val[i] = float_to_fp16(v.reg.val[i]); + } +} + +inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { + int i = 0; + for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) { + reg.val[i] = float_to_fp16(v.reg.val[i]); + } +} + +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { + acc = acc + a * b; +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { + int i = 0; + for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) { + reg.val[i] = float_to_bf16(v.reg.val[i]); + } +} + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { + int i = 0; + for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) { + reg.val[i] = float_to_bf16(v.reg.val[i]); + } +} + +inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); } + +}; // namespace vec_op diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp index ab8cbbbf4ec4..51bca37e699b 100644 --- a/csrc/cpu/cpu_types_vxe.hpp +++ b/csrc/cpu/cpu_types_vxe.hpp @@ -12,7 +12,7 @@ namespace vec_op { #define vec_sub(a, b) ((a) - (b)) #define vec_mul(a, b) ((a) * (b)) #define 
vec_div(a, b) ((a) / (b)) -#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic +#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic #define vec_sl(a, b) ((a) << (b)) // Vector Shift Left // FIXME: FP16 is not fully supported in Torch-CPU diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 3952c43cbc72..982f7c07a13b 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -89,7 +89,7 @@ struct FP16Vec16 : public Vec { explicit FP16Vec16(const FP32Vec16&); - void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } + void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -126,7 +126,7 @@ struct BF16Vec16 : public Vec { explicit BF16Vec16(const FP32Vec16&); - void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } + void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -180,8 +180,8 @@ struct BF16Vec32 : public Vec { (__m128i)vec8_data.reg, 1)) {} void save(void* ptr) const { - *reinterpret_cast<__m256i*>(ptr) = reg_low; - *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; + _mm256_storeu_si256((__m256i*)ptr, reg_low); + _mm256_storeu_si256((__m256i*)ptr + 1, reg_high); } }; #endif diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp new file mode 100644 index 000000000000..0f0cc34602b3 --- /dev/null +++ b/csrc/cpu/dnnl_helper.cpp @@ -0,0 +1,581 @@ +#include +#include + +#include "common/memory_desc.hpp" +#include "common/memory.hpp" + +#include "dnnl_helper.h" + +static dnnl::engine& default_engine() { + static dnnl::engine engine(dnnl::engine::kind::cpu, 0); + return engine; +} + +static dnnl::stream& default_stream() { + static dnnl::stream stream(default_engine()); + return stream; +} + +void release_dnnl_matmul_handler(int64_t handler) { + DNNLMatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + delete ptr; +} + +DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) { + this->realloc(allocation_unit * 128); +} + +void DNNLScratchPadManager::realloc(size_t new_size) { + new_size = round(new_size); + if (new_size > size_) { + ptr_ = std::aligned_alloc(64, new_size); + size_ = new_size; + } +} + +DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() { + static DNNLScratchPadManager manager; + return &manager; +} + +template +class DNNLPrimitiveCache { + public: + using cache_value_t = std::pair; + using result_value_t = VT; + using container_t = std::list; + using value_iterator_t = typename container_t::iterator; + using map_t = std::unordered_map; + using creator_t = VT (*)(); + + public: + DNNLPrimitiveCache(size_t capacity) + : capacity_(capacity), + values_(), + key_to_value_(std::min(256lu, capacity)) { + assert(capacity > 0); + } + + template + result_value_t get_or_create(const KT& key, F&& creator) { + std::optional value = get_value(key); + if (value.has_value()) { + return value.value()->second; + } else { + return add_value({key, creator()})->second; + } + } + + size_t size() const { return values_.size(); } + + private: + void dump_data() { + std::stringstream ss; + ss << "table_id: " << std::hex << reinterpret_cast(this) << std::dec + << "\n"; + ss << "container: ["; + for (auto&& iter : values_) { + ss << "(" << iter.first << ", " << std::hex + << reinterpret_cast(iter.second.get()) << "), " << std::dec; + } 
+ ss << "]\n"; + + ss << "map: ["; + for (auto&& iter : key_to_value_) { + ss << "(" << iter.first << ", " << iter.second->first << ", " << std::hex + << reinterpret_cast(iter.second->second.get()) << std::dec + << "), "; + } + ss << "]\n"; + std::printf("%s\n", ss.str().c_str()); + } + + value_iterator_t add_value(cache_value_t&& new_value) { + if (size() == capacity_) { + cache_value_t& last_item = values_.back(); + key_to_value_.erase(last_item.first); + values_.pop_back(); + } + + auto& added_value_ = values_.emplace_front(std::move(new_value)); + key_to_value_.emplace(added_value_.first, values_.begin()); + return values_.begin(); + } + + std::optional get_value(const KT& key) { + if (key_to_value_.size() > 0 && key == values_.begin()->first) { + return values_.begin(); + } + + auto value_map_iterator = key_to_value_.find(key); + if (value_map_iterator != key_to_value_.end()) { + values_.splice(values_.begin(), values_, value_map_iterator->second); + return value_map_iterator->second; + } else { + return {}; + } + } + + private: + const size_t capacity_; + container_t values_; + map_t key_to_value_; +}; + +DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler( + const Args& args, dnnl::memory::data_type b_type) + : b_n_size_(args.b_n_size), + b_n_stride_(args.b_n_stride), + b_k_size_(args.b_k_size), + b_k_stride_(args.b_k_stride), + b_type_(b_type), + c_type_(args.c_type), + runtime_memory_ptrs_(8), + primitive_cache_size_(args.primitive_cache_size) { + assert(primitive_cache_size_ > 0); +} + +void DNNLMatMulPrimitiveHandler::prepack_weight( + void* original_b_ptr, dnnl::memory::desc original_b_md, + dnnl::memory::desc b_target_mem_desc) { + dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr); + dnnl::memory packed_weight(b_target_mem_desc, default_engine()); + { + dnnl::reorder(original_weight, packed_weight) + .execute(default_stream(), original_weight, packed_weight); + default_stream().wait(); + } + memory_cache_[DNNL_ARG_WEIGHTS] = packed_weight; + b_target_mem_desc_ = b_target_mem_desc; +} + +void DNNLMatMulPrimitiveHandler::set_runtime_memory_ptr( + size_t index, dnnl_memory* memory_ptr) { + dnnl::impl::memory_storage_t* mem_storage_ptr = memory_ptr->memory_storage(); + dnnl_memory_desc* mem_desc = const_cast(memory_ptr->md()); + runtime_memory_ptrs_[index] = {mem_storage_ptr, mem_desc}; +} + +std::pair +DNNLMatMulPrimitiveHandler::get_runtime_memory_ptr(size_t index) { + return runtime_memory_ptrs_[index]; +} + +namespace std { +template <> +struct hash { + size_t operator()( + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const { + return hash()(val.b_n_size) ^ hash()(val.b_k_size) ^ + hash()(static_cast(val.a_qs)) ^ + hash()(static_cast(val.b_qs)) ^ hash()(val.use_azp) ^ + hash()(static_cast(val.c_type)); + } +}; + +template <> +struct hash { + size_t operator()( + const W8A8MatMulPrimitiveHandler::MSizeCacheKey& val) const { + return hash()(val.a_m_size) ^ hash()(val.use_bias) ^ + hash()(static_cast(val.bias_type)); + } +}; + +template <> +struct hash { + size_t operator()( + const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const { + return hash()(val.b_n_size) ^ hash()(val.b_k_size); + } +}; + +template <> +struct hash { + size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& val) const { + return hash()(val.a_m_size) ^ + hash()(val.a_m_stride) ^ hash()(val.use_bias) ^ + hash()(static_cast(val.bias_type)); + } +}; +} // namespace std + +bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l, + const 
W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& r) { + return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size && + l.a_qs == r.a_qs && l.b_qs == r.b_qs && l.use_azp == r.use_azp && + l.c_type == r.c_type; +} + +bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l, + const W8A8MatMulPrimitiveHandler::MSizeCacheKey& r) { + return l.use_bias == r.use_bias && l.a_m_size == r.a_m_size && + l.bias_type == r.bias_type; +} + +bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l, + const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) { + return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size; +} + +bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l, + const MatMulPrimitiveHandler::MSizeCacheKey& r) { + return l.a_m_size == r.a_m_size && l.a_m_stride == r.a_m_stride && + l.use_bias == r.use_bias && l.bias_type == r.bias_type; +} + +static std::shared_ptr +get_w8a8_class_primitive_cache( + const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key, + int64_t cache_size) { + static W8A8MatMulPrimitiveHandler::ClassMatmulCache cache(128); + assert(cache_size > 0); + return cache.get_or_create(key, [&]() { + return std::make_shared(cache_size); + }); +} + +W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args) + : DNNLMatMulPrimitiveHandler( + static_cast(args), + dnnl::memory::data_type::s8), + use_azp_(args.use_a_zero_point), + a_qs_(args.a_quantization_strategy), + b_qs_(args.b_quantization_strategy), + m_size_cache_(nullptr) { + assert(a_qs_ != QuantizationStrategy::PER_OUTPUT_CHANNEL); + assert(b_qs_ != QuantizationStrategy::PER_TOKEN); + if (a_qs_ == QuantizationStrategy::PER_TOKEN) { + assert(!use_azp_); + }; + dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, + {b_k_stride_, b_n_stride_}); + prepack_weight(args.b_ptr, original_b_md, + create_primitive_desc( + MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, + .use_bias = false, + .bias_type = dnnl::memory::data_type::undef}, + true) + .weights_desc()); + init_runtime_memory_cache(args); +} + +void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) { + auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0); + auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1); + a_storage->set_data_handle((void*)args.a_ptr); + a_mem_desc->dims[0] = args.a_m_size; + c_storage->set_data_handle((void*)args.c_ptr); + c_mem_desc->dims[0] = args.a_m_size; + + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + auto&& [a_scale_storage, a_scale_mem_desc] = get_runtime_memory_ptr(2); + a_scale_storage->set_data_handle((void*)args.a_scales_ptr); + } + if (use_azp_) { + auto&& [a_zero_point_storage, a_zero_point_mem_desc] = + get_runtime_memory_ptr(3); + a_zero_point_storage->set_data_handle((void*)args.a_zero_points_ptr); + } + + if (args.use_bias) { + auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(4); + bias_storage->set_data_handle((void*)args.bias_ptr); + } + + dnnl::matmul matmul = get_matmul_cache(args); + + auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5); + scratchpad_storage->set_data_handle( + DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data()); + + matmul.execute(default_stream(), memory_cache_); + default_stream().wait(); +} + +dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache( + const MSizeCacheKey& key) { + if (m_size_cache_.get() == nullptr) { + ClassMatmulCacheKey key = {.b_n_size = b_n_size_, + .b_k_size = b_k_size_, + .a_qs = a_qs_, + .b_qs = b_qs_, + .use_azp = use_azp_, + .c_type = c_type_}; + 
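+    // Caching note (illustrative summary of existing behavior): primitives
+    // are cached at two levels. A process-wide ClassMatmulCache maps the
+    // weight shape / quantization configuration captured in this key to a
+    // per-handler MSizeCache, and that inner cache maps the runtime M size
+    // to a ready dnnl::matmul. Both levels use DNNLPrimitiveCache, an LRU
+    // built from a std::list plus an unordered_map that splices cache hits
+    // to the front of the list.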
m_size_cache_ = get_w8a8_class_primitive_cache(key, primitive_cache_size_); + } + + return m_size_cache_->get_or_create(key, [&]() { + dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); + auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager(); + manager->realloc(desc.scratchpad_desc().get_size()); + return dnnl::matmul(desc); + }); +} + +void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { + memory_cache_[DNNL_ARG_SRC] = dnnl::memory({{1, b_k_size_}, + dnnl::memory::data_type::s8, + dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get()); + memory_cache_[DNNL_ARG_DST] = + dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); + + // For PER_TOKEN, scales will be applied in outside epilogue + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = dnnl::memory( + {{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), nullptr); + set_runtime_memory_ptr( + 2, memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC].get()); + if (use_azp_) { + memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = dnnl::memory( + {{1}, dnnl::memory::data_type::s32, {1}}, default_engine(), nullptr); + set_runtime_memory_ptr( + 3, memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC].get()); + } + } + + if (b_qs_ == QuantizationStrategy::PER_TENSOR) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), + (void*)args.b_scales_ptr); + } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) { + memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), (void*)args.b_scales_ptr); + } + + memory_cache_[DNNL_ARG_BIAS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get()); + + memory_cache_[DNNL_ARG_SCRATCHPAD] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get()); +} + +dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( + const MSizeCacheKey& key, bool first_time) { + dnnl::memory::desc a_md({key.a_m_size, b_k_size_}, + dnnl::memory::data_type::s8, + dnnl::memory::format_tag::ab); + dnnl::memory::desc b_md; + if (first_time) { + b_md = + dnnl::memory::desc({b_k_size_, b_n_size_}, dnnl::memory::data_type::s8, + dnnl::memory::format_tag::any); + } else { + b_md = b_target_mem_desc_; + } + dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attr; + + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + // For PER_TOKEN, scales will be applied in outside epilogue + if (a_qs_ == QuantizationStrategy::PER_TENSOR) { + attr.set_scales_mask(DNNL_ARG_SRC, 0); + if (use_azp_) { + attr.set_zero_points_mask(DNNL_ARG_SRC, 0); + } + } + + if (b_qs_ == QuantizationStrategy::PER_TENSOR) { + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); + } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) { + attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); + } + + if (key.use_bias) { + // For PER_TOKEN, bias will be applied in epilogue + assert(a_qs_ == 
QuantizationStrategy::PER_TENSOR); + dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, + c_md, attr); + } else { + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, + attr); + } +} + +MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args) + : DNNLMatMulPrimitiveHandler( + static_cast(args), args.ab_type), + m_size_cache_(nullptr) { + assert(ab_type_ == dnnl::memory::data_type::f32 || + ab_type_ == dnnl::memory::data_type::bf16 || + ab_type_ == dnnl::memory::data_type::f16); + + dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, + {b_k_stride_, b_n_stride_}); + + prepack_weight(args.b_ptr, original_b_md, + create_primitive_desc( + MSizeCacheKey{ +#ifdef VLLM_USE_ACL + // Arm Compute Library (ACL) backend for oneDNN does + // not support runtime + // dimensions, so we set M to a default value + .a_m_size = 128, + .a_m_stride = b_k_size_, +#else + .a_m_size = DNNL_RUNTIME_DIM_VAL, + .a_m_stride = DNNL_RUNTIME_DIM_VAL, +#endif + .use_bias = false, + .bias_type = dnnl::memory::data_type::undef}, + true) + .weights_desc()); + init_runtime_memory_cache(args); +} + +static std::shared_ptr +get_matul_class_primitive_cache( + const MatMulPrimitiveHandler::ClassMatmulCacheKey& key, + int64_t cache_size) { + static MatMulPrimitiveHandler::ClassMatmulCache cache(128); + assert(cache_size > 0); + return cache.get_or_create(key, [&]() { + return std::make_shared(cache_size); + }); +} + +void MatMulPrimitiveHandler::execute(ExecArgs& args) { + auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0); + auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1); + a_storage->set_data_handle((void*)args.a_ptr); + a_mem_desc->dims[0] = args.a_m_size; + a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride; + c_storage->set_data_handle((void*)args.c_ptr); + c_mem_desc->dims[0] = args.a_m_size; + +#ifndef VLLM_USE_ACL + // We do not support in ACL backend of oneDNN, we handle bias by: + // 1. copying it into the result tensor + // 2. attaching a fused-sum post-op to the matmul primitive + if (args.use_bias) { + auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2); + bias_storage->set_data_handle((void*)args.bias_ptr); + } +#endif + dnnl::matmul matmul = get_matmul_cache(args); + +// With ACL backend of oneDNN, the required memory format might change when the +// source tensor dims change. This does not really happen in practice, so isn't +// a performance hit, but we need to support it because the API allows for it. 
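+// (Illustrative summary of the fallback below: query the chosen primitive's
+// expected weights_desc(), and if it differs from the descriptor the weights
+// were originally packed with, re-pack them in place via prepack_weight()
+// before executing the matmul.)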
+#ifdef VLLM_USE_ACL + auto new_expected_wei_desc = + dnnl::matmul::primitive_desc( + const_cast(matmul.get_primitive_desc())) + .weights_desc(); + if (new_expected_wei_desc != b_target_mem_desc_) { + prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(), + b_target_mem_desc_, new_expected_wei_desc); + } +#endif + + auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3); + scratchpad_storage->set_data_handle( + DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data()); + + matmul.execute(default_stream(), memory_cache_); + default_stream().wait(); +} + +dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache( + const MSizeCacheKey& key) { + if (m_size_cache_.get() == nullptr) { + ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_}; + m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_); + } + return m_size_cache_->get_or_create(key, [&]() { + dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); + auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager(); + manager->realloc(desc.scratchpad_desc().get_size()); + return dnnl::matmul(desc); + }); +} + +dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc( + const MSizeCacheKey& key, bool first_time) { + dnnl::memory::desc a_md; + dnnl::memory::desc b_md; + if (first_time) { + a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_, + dnnl::memory::format_tag::ab); + b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_, + dnnl::memory::format_tag::any); + } else { + a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_, + {key.a_m_stride, 1}); +#ifdef VLLM_USE_ACL + // ACL's backend of oneDNN always expects the weight format to be "any" + b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_, + dnnl::memory::format_tag::any); +#else + b_md = b_target_mem_desc_; +#endif + } + dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attr; + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + if (key.use_bias) { + dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); +// Since ACL's matmuls don't support passing a bias_md, we apply the bias +// through a fused-sum post-op +#ifdef VLLM_USE_ACL + dnnl::post_ops post_ops; + post_ops.append_sum(); + attr.set_post_ops(post_ops); + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, + attr); +#else + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, + c_md, attr); +#endif + } else { + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, + attr); + } +} + +void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { + memory_cache_[DNNL_ARG_SRC] = dnnl::memory( + {{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr); + set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get()); + memory_cache_[DNNL_ARG_DST] = + dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); + +// ACL matmuls don't support bias_md, so we don't need these +#ifndef VLLM_USE_ACL + memory_cache_[DNNL_ARG_BIAS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get()); +#endif + memory_cache_[DNNL_ARG_SCRATCHPAD] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + 
default_engine(), nullptr); + set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get()); +} + +bool is_onednn_acl_supported() { +#ifdef VLLM_USE_ACL + return true; +#else + return false; +#endif +} diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h new file mode 100644 index 000000000000..f0cb197d81a3 --- /dev/null +++ b/csrc/cpu/dnnl_helper.h @@ -0,0 +1,243 @@ +#ifndef DNNL_HELPER_H +#define DNNL_HELPER_H + +#include +#include + +#include "oneapi/dnnl/dnnl.hpp" + +namespace c10 { +struct BFloat16; +struct Half; +} // namespace c10 + +namespace dnnl { +namespace impl { +struct memory_storage_t; +struct matmul_pd_t; +struct matmul_desc_t; +} // namespace impl +} // namespace dnnl +struct dnnl_memory_desc; + +template +class DNNLPrimitiveCache; + +template +struct DNNLType { + static constexpr dnnl::memory::data_type type = + dnnl::memory::data_type::undef; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; +}; + +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + +template +constexpr inline dnnl::memory::data_type get_dnnl_type() { + return DNNLType>::type; +} + +class DNNLScratchPadManager { + public: + static constexpr size_t allocation_unit = 4 * 1024 * 1024; // 4KB + + static DNNLScratchPadManager* get_dnnl_scratchpad_manager(); + + DNNLScratchPadManager(); + + template + T* get_data() { + return reinterpret_cast(ptr_); + } + + static size_t round(size_t size) { + return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit; + } + + void realloc(size_t new_size); + + private: + size_t size_; + void* ptr_; +}; + +class DNNLMatMulPrimitiveHandler { + public: + virtual ~DNNLMatMulPrimitiveHandler() = default; + + protected: + struct Args { + dnnl_dim_t b_n_size; + dnnl_dim_t b_n_stride; + dnnl_dim_t b_k_size; + dnnl_dim_t b_k_stride; + void* b_ptr; + dnnl::memory::data_type c_type; + size_t primitive_cache_size; + }; + + protected: + DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type); + + void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md, + dnnl::memory::desc b_target_mem_desc); + + void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr); + + std::pair + get_runtime_memory_ptr(size_t index); + + protected: + const dnnl_dim_t b_n_size_; + const dnnl_dim_t b_n_stride_; + const dnnl_dim_t b_k_size_; + const dnnl_dim_t b_k_stride_; + dnnl::memory::data_type b_type_; + dnnl::memory::data_type c_type_; + std::unordered_map memory_cache_; + std::vector> + runtime_memory_ptrs_; + dnnl::memory::desc b_target_mem_desc_; + int64_t primitive_cache_size_; +}; + +class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler { + public: + enum class QuantizationStrategy { PER_TOKEN, PER_TENSOR, PER_OUTPUT_CHANNEL }; + + struct Args : public DNNLMatMulPrimitiveHandler::Args { + bool use_a_zero_point; + QuantizationStrategy a_quantization_strategy; + QuantizationStrategy b_quantization_strategy; + float* b_scales_ptr; + }; + + struct ClassMatmulCacheKey { + dnnl_dim_t b_n_size; + dnnl_dim_t b_k_size; + QuantizationStrategy a_qs; 
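// Illustrative sketch (not part of the diff): DNNLScratchPadManager keeps one
// shared buffer and rounds every request up to whole allocation_unit steps of
// 4 * 1024 * 1024 bytes (4 MiB). Its realloc() implementation is not shown
// here, so the grow-only behavior below is an assumption; the round() math
// matches the header above.
#include <cassert>
#include <cstddef>
#include <vector>

struct ScratchPadSketch {
  static constexpr size_t allocation_unit = 4 * 1024 * 1024;  // 4 MiB
  static size_t round(size_t size) {  // round up to a whole number of units
    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
  }
  void realloc(size_t new_size) {
    new_size = round(new_size);
    if (new_size > buffer_.size()) buffer_.resize(new_size);  // never shrinks
  }
  std::vector<unsigned char> buffer_;
};

int main() {
  ScratchPadSketch sp;
  sp.realloc(1);                 // even 1 byte reserves one 4 MiB unit
  assert(sp.buffer_.size() == 4u * 1024 * 1024);
  sp.realloc(5u * 1024 * 1024);  // needs two units
  assert(sp.buffer_.size() == 8u * 1024 * 1024);
  sp.realloc(1024);              // smaller request: no shrink
  assert(sp.buffer_.size() == 8u * 1024 * 1024);
  return 0;
}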
+ QuantizationStrategy b_qs; + bool use_azp; + dnnl::memory::data_type c_type; + + friend bool operator==(const ClassMatmulCacheKey& l, + const ClassMatmulCacheKey& r); + }; + + struct MSizeCacheKey { + dnnl_dim_t a_m_size; + bool use_bias; + dnnl::memory::data_type bias_type; + + friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r); + }; + + using MSizeCache = DNNLPrimitiveCache; + using ClassMatmulCache = + DNNLPrimitiveCache>; + + struct ExecArgs : public MSizeCacheKey { + const int8_t* a_ptr; + const float* a_scales_ptr; + const int32_t* a_zero_points_ptr; + const void* bias_ptr; + void* c_ptr; + }; + + public: + W8A8MatMulPrimitiveHandler(const Args& args); + + QuantizationStrategy get_input_scale_strategy() const { return a_qs_; } + + bool get_input_use_zero_point() const { return use_azp_; } + + void execute(ExecArgs& args); + + private: + dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key, + bool first_time); + + void init_runtime_memory_cache(const Args& args); + + dnnl::matmul get_matmul_cache(const MSizeCacheKey& key); + + private: + const bool use_azp_; + const QuantizationStrategy a_qs_; + const QuantizationStrategy b_qs_; + std::shared_ptr m_size_cache_; +}; + +class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler { + public: + struct Args : public DNNLMatMulPrimitiveHandler::Args { + dnnl::memory::data_type ab_type; + }; + + struct ClassMatmulCacheKey { + dnnl_dim_t b_n_size; + dnnl_dim_t b_k_size; + + friend bool operator==(const ClassMatmulCacheKey& l, + const ClassMatmulCacheKey& r); + }; + + struct MSizeCacheKey { + dnnl_dim_t a_m_size; + dnnl_dim_t a_m_stride; + bool use_bias; + dnnl::memory::data_type bias_type; + + friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r); + }; + + using MSizeCache = DNNLPrimitiveCache; + using ClassMatmulCache = + DNNLPrimitiveCache>; + + struct ExecArgs : public MSizeCacheKey { + const void* a_ptr; + const void* bias_ptr; + void* c_ptr; + }; + + public: + MatMulPrimitiveHandler(const Args& args); + + void execute(ExecArgs& args); + + private: + dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key, + bool first_time); + + void init_runtime_memory_cache(const Args& args); + + dnnl::matmul get_matmul_cache(const MSizeCacheKey& key); + + private: + std::shared_ptr m_size_cache_; +}; + +#endif diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp deleted file mode 100644 index 1cb8dc5b25a6..000000000000 --- a/csrc/cpu/dnnl_helper.hpp +++ /dev/null @@ -1,206 +0,0 @@ -#ifndef DNNL_HELPER_HPP -#define DNNL_HELPER_HPP - -#include -#include - -#include "oneapi/dnnl/dnnl.hpp" - -namespace { -template -struct DNNLType { - static constexpr dnnl::memory::data_type type = - dnnl::memory::data_type::undef; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; -}; - -template <> -struct DNNLType { - static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; -}; - -template -constexpr inline dnnl::memory::data_type get_dnnl_type() { - return DNNLType>::type; -} -}; // namespace - -template -class DNNLPrimitiveHelper { - 
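// Illustrative sketch (not part of the diff): how a caller is expected to drive
// MatMulPrimitiveHandler, based only on the Args/ExecArgs declarations above.
// The fp32 data type, shapes, row-major strides and cache size are assumptions
// for illustration; this only builds inside the vLLM CPU extension where
// dnnl_helper.h and its dependencies are available.
#include <cstdint>
#include <vector>
#include "dnnl_helper.h"

void run_fp32_matmul_sketch(int64_t M, int64_t N, int64_t K) {
  std::vector<float> a(M * K), b(K * N), c(M * N);

  MatMulPrimitiveHandler::Args args;
  args.b_n_size = N;
  args.b_n_stride = 1;                   // row-major [K, N] weight
  args.b_k_size = K;
  args.b_k_stride = N;
  args.b_ptr = b.data();
  args.c_type = get_dnnl_type<float>();
  args.ab_type = get_dnnl_type<float>();
  args.primitive_cache_size = 128;       // illustrative
  MatMulPrimitiveHandler handler(args);  // prepacks the weight once

  MatMulPrimitiveHandler::ExecArgs exec;
  exec.a_m_size = M;
  exec.a_m_stride = K;                   // row-major [M, K] activation
  exec.use_bias = false;
  exec.bias_type = dnnl::memory::data_type::undef;
  exec.a_ptr = a.data();
  exec.bias_ptr = nullptr;
  exec.c_ptr = c.data();
  handler.execute(exec);                 // c = a @ b
}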
public: - // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias) - // A: [M, K], row-major - // B: [K, N], column-major - // C: [M, N], row-major - // bias: [N], row-major, optional - // a_scales: [MS] - // b_scales: [NS] - // Note: Due to the limitation of oneDNN - // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is - // not supported. - - template - static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c, - const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N, - dnnl_dim_t K, const float* a_scales, - const float* b_scales, dnnl_dim_t MS, - dnnl_dim_t NS) { - auto&& OutputType = get_dnnl_type(); - auto&& BiasType = get_dnnl_type(); - - dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1}); - dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K}); - dnnl::memory::desc c_md({M, N}, OutputType, {N, 1}); - - dnnl::primitive_attr attr; - if constexpr (!InputNoScale) { - if (MS == 1) { - // per-tensor - attr.set_scales_mask(DNNL_ARG_SRC, 0); - } else { - // per-token - TORCH_CHECK(false, "per-token quantization is unsupported."); - } - } - - if (NS == 1) { - // per-tensor - attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); - } else { - // per-channel - attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2); - } - - dnnl::matmul::primitive_desc matmul_pd; -// Create memory descriptors with format_tag::any for the primitive. This -// enables the matmul primitive to choose memory layouts for an -// optimized primitive implementation, and these layouts may differ from the -// ones provided by the user. -#ifdef __aarch64__ - auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8, - dnnl::memory::format_tag::any); - auto mat_weights_md = dnnl::memory::desc( - {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any); - auto mat_dst_md = - dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any); - if (bias) { - dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md, - mat_weights_md, bias_md, - mat_dst_md, attr); - } else { - matmul_pd = dnnl::matmul::primitive_desc( - default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr); - } -#else - if (bias) { - dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1}); - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, - bias_md, c_md, attr); - } else { - matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, - c_md, attr); - } -#endif - dnnl::matmul matmul(matmul_pd); - - auto& engine = default_engine(); - - dnnl::memory a_m(a_md, engine, (void*)a); - dnnl::memory b_m(b_md, engine, (void*)b); - dnnl::memory c_m(c_md, engine, (void*)c); - dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine, - (void*)a_scales); - dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine, - (void*)b_scales); - - auto& stream = default_stream(); - - auto mat_src_mem = a_m; - auto mat_weights_mem = b_m; - auto mat_dst_mem = c_m; -#ifdef __aarch64__ - if (matmul_pd.weights_desc() != b_m.get_desc()) { - mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine); - dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem); - } -#endif - if constexpr (InputNoScale) { - if (bias) { - dnnl::memory::desc bias_md({N}, BiasType, {1}); - dnnl::memory bias_m(bias_md, engine, (void*)bias); - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_BIAS, bias_m}, - 
{DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } else { - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } - } else { - if (bias) { - dnnl::memory::desc bias_md({N}, BiasType, {1}); - dnnl::memory bias_m(bias_md, engine, (void*)bias); - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_BIAS, bias_m}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } else { - matmul.execute( - stream, { - {DNNL_ARG_SRC, mat_src_mem}, - {DNNL_ARG_WEIGHTS, mat_weights_mem}, - {DNNL_ARG_DST, mat_dst_mem}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m}, - {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m}, - }); - } - } - stream.wait(); - } - - private: - static dnnl::engine& default_engine() { - static dnnl::engine engine(dnnl::engine::kind::cpu, 0); - return engine; - } - - static dnnl::stream& default_stream() { - static dnnl::stream stream(default_engine()); - return stream; - } -}; -#endif diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp new file mode 100644 index 000000000000..6d062c71e767 --- /dev/null +++ b/csrc/cpu/dnnl_kernels.cpp @@ -0,0 +1,570 @@ +#include "cpu_types.hpp" +#include "dnnl_helper.h" + +namespace { +template +struct KernelVecType { + using load_vec_type = void; + using cvt_vec_type = void; +}; + +template <> +struct KernelVecType { + using load_vec_type = vec_op::FP32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) +template <> +struct KernelVecType { + using load_vec_type = vec_op::BF16Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; +#endif + +template <> +struct KernelVecType { +#if defined(__powerpc64__) || defined(__s390x__) + // Power architecture-specific vector type + using load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures + using load_vec_type = vec_op::FP16Vec16; +#endif + using cvt_vec_type = vec_op::FP32Vec16; +}; + +template +void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + const float* scale, const int32_t* azp, + const int64_t num_tokens, + const int64_t input_stride, + const int64_t hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int64_t vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t inv_scale(1.0 / *scale); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + + cvt_vec_t zp_vec; + if constexpr (AZP) { + zp_vec = cvt_vec_t(static_cast(*azp)); + } + +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + int8_t* output_ptr = output + i * hidden_size; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j); + } 
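// Illustrative sketch (not part of the diff): scalar reference for the static
// per-tensor quantization above, q = clamp(x / scale + azp, -128, 127). The
// rounding mode is assumed to be round-to-nearest here; the real kernel does
// the conversion with FP32Vec16/INT8Vec16 and handles the row tail separately.
#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t quantize_one(float x, float scale, int32_t azp) {
  float q = x / scale + static_cast<float>(azp);
  q = std::clamp(q, -128.0f, 127.0f);
  return static_cast<int8_t>(std::nearbyint(q));
}

// e.g. quantize_one(1.0f, 0.5f, 0) == 2 and quantize_one(1000.0f, 0.5f, 0) == 127.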
+ + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j, hidden_size - j); + } +} + +template +void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, + float* scale, int32_t* azp, + const int64_t num_tokens, + const int64_t input_stride, + const int64_t hidden_size) { + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + cvt_vec_t max_value(std::numeric_limits::lowest()); + cvt_vec_t min_value(std::numeric_limits::max()); + { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + + if (j + vec_elem_num == hidden_size) { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } + } else { + if constexpr (AZP) { + max_value = max_value.max(elems_fp32, hidden_size - j); + min_value = min_value.min(elems_fp32, hidden_size - j); + } else { + max_value = max_value.max(elems_fp32.abs(), hidden_size - j); + } + } + } + + float scale_val; + float azp_val = 0.0f; + if constexpr (AZP) { + float max_scalar = max_value.reduce_max(); + float min_scalar = min_value.reduce_min(); + scale_val = (max_scalar - min_scalar) / 255.0f; + azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); + azp[i] = azp_val; + scale[i] = scale_val; + } else { + scale_val = max_value.reduce_max() / 127.0f; + scale[i] = scale_val; + } + + const cvt_vec_t inv_scale(1.0 / scale_val); + const cvt_vec_t azp_vec(azp_val); + + { + int64_t j = 0; + const scalar_t* input_ptr = input + i * input_stride; + int8_t* output_ptr = output + i * hidden_size; + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j); + } + + load_vec_t elems(input_ptr + j); + cvt_vec_t elems_fp32(elems); + elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output_ptr + j, hidden_size - j); + } + } +} + +template +void dynamic_quant_epilogue(const float* input, scalar_t* output, + const float* a_scale, const int32_t* azp, + const float* azp_adj, const scalar_t* bias, + const int64_t num_tokens, + const int64_t hidden_size) 
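// Illustrative sketch (not part of the diff): scalar reference for the per-token
// parameters computed above. Without AZP the row is quantized symmetrically,
// scale = max|x| / 127; with AZP it is asymmetric, scale = (max - min) / 255 and
// azp = round(-128 - min / scale), so min maps to -128 and max to about +127.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

struct TokenQuantParams {
  float scale;
  int32_t azp;
};

TokenQuantParams dynamic_params_with_azp(const std::vector<float>& row) {
  float mn = row[0], mx = row[0];
  for (float x : row) {
    mn = std::min(mn, x);
    mx = std::max(mx, x);
  }
  float scale = (mx - mn) / 255.0f;
  int32_t azp = static_cast<int32_t>(std::nearbyint(-128.0f - mn / scale));
  return {scale, azp};
}

float dynamic_scale_symmetric(const std::vector<float>& row) {
  float amax = 0.0f;
  for (float x : row) amax = std::max(amax, std::fabs(x));
  return amax / 127.0f;
}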
{ + CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) + using load_vec_t = typename KernelVecType::load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + const int64_t thread_num = omp_get_max_threads(); + if (num_tokens > thread_num) { +#pragma omp parallel for + for (int64_t i = 0; i < num_tokens; ++i) { + const float* input_ptr = input + i * hidden_size; + scalar_t* output_ptr = output + i * hidden_size; + int64_t j = 0; + cvt_vec_t token_scale_vec(a_scale[i]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[i] * static_cast(azp[i]); + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + for (; j < hidden_size - vec_elem_num; ++j) { + cvt_vec_t elems_fp32(input_ptr + j); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + j); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + j); + } + cvt_vec_t elems_fp32(input_ptr + j); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + j); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + j); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + j, hidden_size - j); + } + } else { + const int64_t vec_iteration = + (hidden_size + vec_elem_num - 1) / vec_elem_num; + const int64_t vec_iteration_per_thread = + (vec_iteration + thread_num - 1) / thread_num; + const int64_t elem_num_per_thread = vec_iteration_per_thread * vec_elem_num; +#pragma omp parallel for schedule(static, 1) + for (int64_t i = 0; i < thread_num; ++i) { + const int64_t start = elem_num_per_thread * i; + const int64_t end = std::min(hidden_size, elem_num_per_thread + start); + for (int64_t j = 0; j < num_tokens; ++j) { + cvt_vec_t token_scale_vec(a_scale[j]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[j] * static_cast(azp[j]); + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + int64_t k = start; + const float* input_ptr = input + j * hidden_size; + scalar_t* output_ptr = output + j * hidden_size; + for (; k < end - vec_elem_num; k += vec_elem_num) { + cvt_vec_t elems_fp32(input_ptr + k); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + k); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + k); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + k); + } + if (k < end) { + cvt_vec_t elems_fp32(input_ptr + k); + elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + cvt_vec_t azp_adj_fp32(azp_adj + k); + elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec; + } + if constexpr (Bias) { + load_vec_t bias_vec(bias + k); + cvt_vec_t bias_vec_fp32(bias_vec); + elems_fp32 = elems_fp32 + bias_vec_fp32; + } + load_vec_t elems_out(elems_fp32); + elems_out.save(output_ptr + k, end - k); + } + } + } + } +} +} // namespace + +int64_t create_onednn_scaled_mm_handler( + const torch::Tensor& b, // [IC, OC], column-major + const 
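// Illustrative sketch (not part of the diff): scalar reference for
// dynamic_quant_epilogue above. The handler has already applied the weight
// scales inside oneDNN, so the fp32 accumulator only needs the per-token
// activation scale, the zero-point correction and the bias:
//   out[i][j] = a_scale[i] * acc[i][j] - a_scale[i] * azp[i] * azp_adj[j] + bias[j]
#include <cstdint>
#include <vector>

void epilogue_reference(const std::vector<float>& acc,      // [tokens * hidden]
                        const std::vector<float>& a_scale,  // [tokens]
                        const std::vector<int32_t>& azp,    // [tokens]
                        const std::vector<float>& azp_adj,  // [hidden]
                        const std::vector<float>& bias,     // [hidden]
                        int64_t tokens, int64_t hidden, std::vector<float>& out) {
  out.resize(acc.size());
  for (int64_t i = 0; i < tokens; ++i) {
    const float zp_scale = a_scale[i] * static_cast<float>(azp[i]);
    for (int64_t j = 0; j < hidden; ++j) {
      float v = a_scale[i] * acc[i * hidden + j];
      v -= zp_scale * azp_adj[j];  // remove the zero-point contribution
      v += bias[j];
      out[i * hidden + j] = v;
    }
  }
}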
torch::Tensor& b_scales, // [1] or [OC] + at::ScalarType output_type, bool dynamic_act_quant, bool use_azp, + int64_t primitive_cache_size) { + TORCH_CHECK(b.dim() == 2); + TORCH_CHECK(b.stride(0) == 1); // Column-major + TORCH_CHECK(b_scales.is_contiguous()); + + W8A8MatMulPrimitiveHandler::Args args; + args.primitive_cache_size = primitive_cache_size; + + if (b_scales.numel() == 1) { + args.b_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR; + } else { + TORCH_CHECK_EQ(b_scales.numel(), b.size(1)); + args.b_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_OUTPUT_CHANNEL; + } + args.b_scales_ptr = b_scales.data_ptr(); + args.b_k_size = b.size(0); + args.b_k_stride = b.stride(0); + args.b_n_size = b.size(1); + args.b_n_stride = b.stride(1); + args.b_ptr = b.data_ptr(); + + if (dynamic_act_quant) { + // dynamic per-token, bias, A scales and A zps will be applied in outside. + args.a_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN; + args.use_a_zero_point = false; + } else { + // static per-tensor + args.a_quantization_strategy = + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR; + args.use_a_zero_point = use_azp; + } + + VLLM_DISPATCH_FLOATING_TYPES(output_type, "create_onednn_scaled_mm_handler", + [&] { + if (dynamic_act_quant) { + args.c_type = get_dnnl_type(); + } else { + args.c_type = get_dnnl_type(); + } + }); + + return reinterpret_cast(new W8A8MatMulPrimitiveHandler(args)); +} + +void onednn_scaled_mm( + torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const torch::Tensor& a_scales, // [M] or [1] + const std::optional& azp, // [M] or [1] + const std::optional& azp_adj, // [M] or [1] + const std::optional& bias, // [N] + int64_t handler) { + CPU_KERNEL_GUARD_IN(onednn_scaled_mm) + TORCH_CHECK(a.dim() == 2); + TORCH_CHECK(a.is_contiguous()); + TORCH_CHECK(c.is_contiguous()); + W8A8MatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + const int32_t* azp_ptr = nullptr; + if (azp.has_value()) { + azp_ptr = azp->data_ptr(); + } + if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) { + TORCH_CHECK_EQ(a_scales.numel(), 1); + } + + W8A8MatMulPrimitiveHandler::ExecArgs exec_args; + exec_args.a_ptr = a.data_ptr(); + exec_args.a_m_size = a.size(0); + exec_args.bias_ptr = nullptr; + exec_args.bias_type = get_dnnl_type(); + exec_args.use_bias = false; + exec_args.a_scales_ptr = nullptr; + exec_args.a_zero_points_ptr = nullptr; + + VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "onednn_scaled_mm", [&] { + if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) { + if (bias.has_value()) { + exec_args.bias_ptr = bias->data_ptr(); + exec_args.bias_type = get_dnnl_type(); + exec_args.use_bias = true; + } + exec_args.a_scales_ptr = a_scales.data_ptr(); + exec_args.a_zero_points_ptr = azp_ptr; + exec_args.c_ptr = c.data_ptr(); + ptr->execute(exec_args); + } else if (ptr->get_input_scale_strategy() == + W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN) { + torch::Tensor tmp_fp32_out = + torch::empty_like(c, ::at::ScalarType::Float); + exec_args.c_ptr = tmp_fp32_out.data_ptr(); + ptr->execute(exec_args); + if (bias.has_value()) { + if (azp.has_value()) { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(), + bias->data_ptr(), c.size(0), c.size(1)); + } else { + 
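// Illustrative sketch (not part of the diff): intended calling pattern for the
// non-quantized path added in this change (create_onednn_mm_handler /
// onednn_mm / release_dnnl_matmul_handler, declared in torch_bindings.cpp
// below). The handler is created once per weight and reused for every forward
// pass. Shapes, dtype and the cache size are illustrative; this only builds
// inside the vLLM CPU extension.
#include <cstdint>
#include <optional>
#include <torch/all.h>

int64_t create_onednn_mm_handler(const torch::Tensor& b, int64_t primitive_cache_size);
void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
               const std::optional<torch::Tensor>& bias, int64_t handler);
void release_dnnl_matmul_handler(int64_t handler);

void linear_layer_sketch() {
  const int64_t M = 8, K = 64, N = 32;
  torch::Tensor b = torch::randn({K, N}, torch::kFloat32);  // weight, set up once
  int64_t handler = create_onednn_mm_handler(b, /*primitive_cache_size=*/128);

  torch::Tensor a = torch::randn({M, K}, torch::kFloat32);  // per-step activation
  torch::Tensor c = torch::empty({M, N}, torch::kFloat32);
  onednn_mm(c, a, /*bias=*/std::nullopt, handler);          // c = a @ b

  release_dnnl_matmul_handler(handler);  // frees the prepacked weight copy
}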
dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, nullptr, + bias->data_ptr(), c.size(0), c.size(1)); + } + } else { + if (azp.has_value()) { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(), + (scalar_t*)nullptr, c.size(0), c.size(1)); + } else { + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), azp_ptr, nullptr, (scalar_t*)nullptr, + c.size(0), c.size(1)); + } + } + } else { + TORCH_CHECK(false, "invalid act quant type."); + } + }); +} + +// static-per-tensor quantization. +void static_scaled_int8_quant( + torch::Tensor& out, // [batch, hidden_size] + const torch::Tensor& input, // [batch, hidden_size] + const torch::Tensor& scale, std::optional const& azp) { + CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK_EQ(input.dim(), 2); + TORCH_CHECK_EQ(input.stride(1), 1); + TORCH_CHECK(scale.numel() == 1); + TORCH_CHECK(!azp.has_value() || azp->numel() == 1); + + const int64_t stride = input.stride(0); + const int64_t hidden_size = input.size(1); + const int64_t num_tokens = input.size(0); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "static_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + static_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + stride, hidden_size); + } else { + static_scaled_int8_quant_impl(input.data_ptr(), + out.data_ptr(), + scale.data_ptr(), nullptr, + num_tokens, stride, hidden_size); + } + }); +} + +// dynamic-per-token quantization. +void dynamic_scaled_int8_quant( + torch::Tensor& out, // [batch, hidden_size] + const torch::Tensor& input, // [batch, hidden_size] + torch::Tensor& scale, // [batch, 1] + std::optional const& azp) { + CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK_EQ(input.dim(), 2); + TORCH_CHECK_EQ(input.stride(1), 1); + + const int64_t hidden_size = input.size(1); + const int64_t num_tokens = input.size(0); + const int64_t stride = input.stride(0); + VLLM_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { + if (azp.has_value()) { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + stride, hidden_size); + } else { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), nullptr, num_tokens, stride, + hidden_size); + } + }); +} + +int64_t create_onednn_mm_handler(const torch::Tensor& b, + int64_t primitive_cache_size) { + TORCH_CHECK(b.dim() == 2); + + MatMulPrimitiveHandler::Args args; + args.primitive_cache_size = primitive_cache_size; + + args.b_k_size = b.size(0); + args.b_k_stride = b.stride(0); + args.b_n_size = b.size(1); + args.b_n_stride = b.stride(1); + args.b_ptr = b.data_ptr(); + + VLLM_DISPATCH_FLOATING_TYPES(b.scalar_type(), "create_onednn_mm_handler", + [&] { + args.c_type = get_dnnl_type(); + args.ab_type = get_dnnl_type(); + }); + + return reinterpret_cast(new MatMulPrimitiveHandler(args)); +} + +void onednn_mm(torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const std::optional& bias, int64_t handler) { + CPU_KERNEL_GUARD_IN(onednn_mm) + TORCH_CHECK(a.dim() == 2); + TORCH_CHECK(a.stride(-1) == 1); + TORCH_CHECK(c.stride(-1) == 1); + MatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + +// ACL matmuls expect contiguous 
source tensors +#ifdef VLLM_USE_ACL + torch::Tensor a_contig = a.contiguous(); +#endif + + MatMulPrimitiveHandler::ExecArgs exec_args; + +#ifdef VLLM_USE_ACL + exec_args.a_m_size = a_contig.size(0); + exec_args.a_m_stride = a_contig.stride(0); +#else + exec_args.a_m_size = a.size(0); + exec_args.a_m_stride = a.stride(0); +#endif + VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] { + if (bias.has_value()) { + exec_args.use_bias = true; + exec_args.bias_type = get_dnnl_type(); +#ifdef VLLM_USE_ACL + // ACL matmuls in oneDNN do not support a bias. + // We handle a matmul with bias by doing: c = bias; c += matmul(a, b) + c.copy_(bias.value()); +#else + exec_args.bias_ptr = bias->data_ptr(); +#endif + } else { + exec_args.use_bias = false; + exec_args.bias_type = get_dnnl_type(); + exec_args.bias_ptr = nullptr; + } +#ifdef VLLM_USE_ACL + exec_args.a_ptr = a_contig.data_ptr(); +#else + exec_args.a_ptr = a.data_ptr(); + +#endif + exec_args.c_ptr = c.data_ptr(); + + ptr->execute(exec_args); + }); +} diff --git a/csrc/cpu/float_convert.hpp b/csrc/cpu/float_convert.hpp new file mode 100644 index 000000000000..c792bf131ccd --- /dev/null +++ b/csrc/cpu/float_convert.hpp @@ -0,0 +1,106 @@ + +static float bf16_to_float(uint16_t bf16) { + uint32_t bits = static_cast(bf16) << 16; + float fp32; + std::memcpy(&fp32, &bits, sizeof(fp32)); + return fp32; +} + +static uint16_t float_to_bf16(float fp32) { + uint32_t bits; + std::memcpy(&bits, &fp32, sizeof(fp32)); + return static_cast(bits >> 16); +} + +/************************************************ + * Copyright (c) 2015 Princeton Vision Group + * Licensed under the MIT license. + * Codes below copied from + * https://github.com/PrincetonVision/marvin/tree/master/tools/tensorIO_matlab + *************************************************/ +static uint16_t float_to_fp16(float fp32) { + uint16_t fp16; + + unsigned x; + unsigned u, remainder, shift, lsb, lsb_s1, lsb_m1; + unsigned sign, exponent, mantissa; + + std::memcpy(&x, &fp32, sizeof(fp32)); + u = (x & 0x7fffffff); + + // Get rid of +NaN/-NaN case first. + if (u > 0x7f800000) { + fp16 = 0x7fffU; + return fp16; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + fp16 = sign | 0x7c00U; + return fp16; + } + if (u < 0x33000001) { + fp16 = (sign | 0x0000); + return fp16; + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. + remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + fp16 = (sign | (exponent << 10) | mantissa); + + return fp16; +} + +static float fp16_to_float(uint16_t fp16) { + unsigned sign = ((fp16 >> 15) & 1); + unsigned exponent = ((fp16 >> 10) & 0x1f); + unsigned mantissa = ((fp16 & 0x3ff) << 13); + int temp; + float fp32; + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? 
(sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + temp = ((sign << 31) | (exponent << 23) | mantissa); + std::memcpy(&fp32, &temp, sizeof(temp)); + return fp32; +} diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp deleted file mode 100644 index c1f7c64ea2f4..000000000000 --- a/csrc/cpu/quant.cpp +++ /dev/null @@ -1,949 +0,0 @@ -#include "cpu_types.hpp" -#include "dnnl_helper.hpp" - -namespace { -template -struct KernelVecType { - using load_vec_type = void; - using azp_adj_load_vec_type = void; - using cvt_vec_type = void; -}; - -template <> -struct KernelVecType { - using load_vec_type = vec_op::FP32Vec16; - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -template <> -struct KernelVecType { - using load_vec_type = vec_op::BF16Vec16; - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -template <> -struct KernelVecType { -#if defined(__powerpc64__) || defined(__s390x__) - // Power architecture-specific vector type - using load_vec_type = vec_op::FP32Vec16; -#else - // Fallback for other architectures - using load_vec_type = vec_op::FP16Vec16; -#endif - using azp_adj_load_vec_type = vec_op::INT32Vec16; - using cvt_vec_type = vec_op::FP32Vec16; -}; - -#if defined(__AVX512F__) || defined(__aarch64__) -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t inv_scale(1.0 / *scale); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - cvt_vec_t zp_vec; - if constexpr (AZP) { - zp_vec = cvt_vec_t(static_cast(*azp)); - } - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } -} - -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - 
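// Illustrative sketch (not part of the diff): round-trip checks for the
// conversion helpers in float_convert.hpp above. float_to_bf16 keeps the top
// 16 bits of the fp32 pattern (truncation, not rounding), so values that are
// exactly representable in bf16, such as 1.5f, survive a round trip; the fp16
// helpers round-trip anything that fits a 10-bit mantissa and 5-bit exponent.
#include <cassert>
#include <cstdint>
#include <cstring>  // float_convert.hpp uses std::memcpy, so include <cstring> first
#include "float_convert.hpp"

int main() {
  // bf16: 1.5f is 0x3FC00000, so the upper half is 0x3FC0.
  assert(float_to_bf16(1.5f) == 0x3FC0);
  assert(bf16_to_float(float_to_bf16(1.5f)) == 1.5f);

  // fp16: 1.5f is sign 0, biased exponent 15, mantissa 0x200 -> 0x3E00.
  assert(float_to_fp16(1.5f) == 0x3E00);
  assert(fp16_to_float(float_to_fp16(1.5f)) == 1.5f);
  return 0;
}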
static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_value(std::numeric_limits::lowest()); - cvt_vec_t min_value(std::numeric_limits::max()); - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - - if (j + vec_elem_num == hidden_size) { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } else { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32, hidden_size - j); - min_value = min_value.min(elems_fp32, hidden_size - j); - } else { - max_value = max_value.max(elems_fp32.abs(), hidden_size - j); - } - } - } - - float scale_val, azp_val; - if constexpr (AZP) { - float max_scalar = max_value.reduce_max(); - float min_scalar = min_value.reduce_min(); - scale_val = (max_scalar - min_scalar) / 255.0f; - azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); - azp[i] = static_cast(azp_val); - scale[i] = scale_val; - } else { - scale_val = max_value.reduce_max() / 127.0f; - scale[i] = scale_val; - } - - const cvt_vec_t inv_scale(1.0 / scale_val); - const cvt_vec_t azp_vec(azp_val); - - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } - } -} - -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t a_scale_vec(a_scale); - cvt_vec_t b_scale_vec(*b_scale); - cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; - - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - 
scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} - -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - cvt_vec_t token_scale_vec(a_scale[i]); - cvt_vec_t token_zp_scale_vec; - if constexpr (AZP) { - float zp_scale_val = a_scale[i] * static_cast(azp[i]); - if constexpr (!PerChannel) { - zp_scale_val *= *b_scale; - } - token_zp_scale_vec = cvt_vec_t(zp_scale_val); - } - - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -#elif defined(__powerpc64__) -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - - const cvt_vec_t inv_scale(1.0 / *scale); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - cvt_vec_t zp_vec; - if constexpr (AZP) { - zp_vec = 
cvt_vec_t(static_cast(*azp)); - } - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = elems_fp32 * inv_scale; - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + zp_vec; - } - - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } -} -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - using load_vec_t = typename KernelVecType::load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - constexpr float i8_min = - static_cast(std::numeric_limits::min()); - constexpr float i8_max = - static_cast(std::numeric_limits::max()); - const cvt_vec_t i8_min_vec(i8_min); - const cvt_vec_t i8_max_vec(i8_max); - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_value(std::numeric_limits::lowest()); - cvt_vec_t min_value(std::numeric_limits::max()); - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - - if (j + vec_elem_num == hidden_size) { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32); - min_value = min_value.min(elems_fp32); - } else { - max_value = max_value.max(elems_fp32.abs()); - } - } else { - if constexpr (AZP) { - max_value = max_value.max(elems_fp32, hidden_size - j); - min_value = min_value.min(elems_fp32, hidden_size - j); - } else { - max_value = max_value.max(elems_fp32.abs(), hidden_size - j); - } - } - } - - float scale_val, azp_val; - if constexpr (AZP) { - float max_scalar = max_value.reduce_max(); - float min_scalar = min_value.reduce_min(); - scale_val = (max_scalar - min_scalar) / 255.0f; - azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); - azp[i] = static_cast(azp_val); - scale[i] = scale_val; - } else { - scale_val = max_value.reduce_max() / 127.0f; - scale[i] = scale_val; - } - - const cvt_vec_t inv_scale(1.0 / scale_val); - const cvt_vec_t azp_vec(azp_val); - - { - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j); - } - - load_vec_t elems(input + i * hidden_size + j); - cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale); - - if constexpr (AZP) { - elems_fp32 = elems_fp32 + 
azp_vec; - } - elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); - elems_int8.save(output + i * hidden_size + j, hidden_size - j); - } - } -} -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t a_scale_vec(a_scale); - cvt_vec_t b_scale_vec(*b_scale); - cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; - - int j = 0; - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - - if constexpr (PerChannel) { - b_scale_vec = cvt_vec_t(b_scale + j); - scale_vec = b_scale_vec * a_scale_vec; - } - - elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) - using load_vec_t = typename KernelVecType::load_vec_type; - using azp_adj_load_vec_t = - typename KernelVecType::azp_adj_load_vec_type; - using cvt_vec_t = typename KernelVecType::cvt_vec_type; - constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; - - #pragma omp parallel for - for (int i = 0; i < num_tokens; ++i) { - int j = 0; - cvt_vec_t token_scale_vec(a_scale[i]); - cvt_vec_t token_zp_scale_vec; - if constexpr (AZP) { - float zp_scale_val = a_scale[i] * static_cast(azp[i]); - if constexpr (!PerChannel) { - zp_scale_val *= *b_scale; - } - token_zp_scale_vec = cvt_vec_t(zp_scale_val); - } - - for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j); - } - - cvt_vec_t elems_fp32(input + i * hidden_size + j); - elems_fp32 = elems_fp32 * token_scale_vec; - - if constexpr (AZP) { - 
azp_adj_load_vec_t azp_adj_vec(azp_adj + j); - cvt_vec_t azp_adj_fp32(azp_adj_vec); - azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; - - if constexpr (PerChannel) { - cvt_vec_t b_scale_vec(b_scale + j); - azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; - } - - elems_fp32 = elems_fp32 - azp_adj_fp32; - } - - if constexpr (Bias) { - load_vec_t bias_vec(bias + j); - cvt_vec_t bias_vec_fp32(bias_vec); - elems_fp32 = elems_fp32 + bias_vec_fp32; - } - - load_vec_t elems_out(elems_fp32); - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } -} -#else -template -void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int32_t* azp, - const int num_tokens, - const int hidden_size) { - TORCH_CHECK(false, - "static_scaled_int8_quant_impl requires AVX512/powerpc64/AArch64 " - "support.") -} - -template -void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, int32_t* azp, - const int num_tokens, - const int hidden_size) { - TORCH_CHECK(false, - "dynamic_scaled_int8_quant_impl requires " - "AVX512/powerpc64/AArch64 support.") -} - -template -void static_quant_epilogue(const float* input, scalar_t* output, - const float a_scale, const float* b_scale, - const int32_t* azp_with_adj, const int num_tokens, - const int hidden_size) { - TORCH_CHECK( - false, "static_quant_epilogue requires AVX512/powerpc64/AArch64 support.") -} - -template -void dynamic_quant_epilogue(const float* input, scalar_t* output, - const float* a_scale, const float* b_scale, - const int32_t* azp, const int32_t* azp_with_adj, - const scalar_t* bias, const int num_tokens, - const int hidden_size) { - TORCH_CHECK( - false, - "dynamic_quant_epilogue requires AVX512/powerpc64/AArch64 support.") -} -#endif -} // namespace - -void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm only supports INT8 inputs.") - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && - bias->dim() == 1); - } - - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] { - if (a_scales.numel() != 1) { - // per-token - // Note: oneDNN doesn't support per-token activation quantization - // Ideally we want to fuse the GEMM and the scale procedure with oneDNN - // JIT, the intermediate data is cached in registers or L1. But for now - // the oneDNN GEMM code generation only supports two quantization - // patterns: per-tensor or per-output-channel of weight. - // So we have to apply the per-token scale with a 'epilogue'. 
In C=s_a * - // s_b * (A@B) + bias, the C_inter = s_b * (A@B) is computed by oneDNN - // GEMM, then the per-token scale (and bias) is applied with the epilogue - // C=s_a * C_inter + bias. - torch::Tensor tmp_fp32_out = - torch::empty_like(c, ::at::ScalarType::Float); - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter + bias - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Compute C=s_a * C_inter - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, - c.size(0), c.size(1)); - } - } else { - // per-tensor - if (bias.has_value()) { - // Compute C=s_a * s_b * (A@B) + bias - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), c.data_ptr(), - bias->data_ptr(), a.size(0), b.size(1), a.size(1), - a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } else { - // Compute C=s_a * s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), c.data_ptr(), - nullptr, a.size(0), b.size(1), a.size(1), - a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } - } - }); -} - -void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, // [1] or [M] - const torch::Tensor& b_scales, // [1] or [OC] - const torch::Tensor& azp_adj, // [OC] - const std::optional& azp, // [1] or [M] - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm_azp only supports INT8 inputs.") - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous()); - } - if (azp) { - TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous()); - } - TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous()); - - // azp & bias types - TORCH_CHECK(azp_adj.dtype() == torch::kInt32); - TORCH_CHECK(!azp || azp->dtype() == torch::kInt32); - TORCH_CHECK(!bias || bias->dtype() == c.dtype(), - "currently bias dtype must match output dtype ", c.dtype()); - - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] { - torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - if (a_scales.numel() != 1) { - // per-token - // Note: oneDNN doesn't support per-token activation quantization - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - 
tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias - if (b_scales.numel() != 1) { - // Per-Channel - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Per-Tensor - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), - bias->data_ptr(), c.size(0), c.size(1)); - } - } else { - // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj - if (b_scales.numel() != 1) { - // Per-Channel - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), nullptr, - c.size(0), c.size(1)); - } else { - // Per-Tensor - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), b_scales.data_ptr(), - azp->data_ptr(), azp_adj.data_ptr(), nullptr, - c.size(0), c.size(1)); - } - } - } else { - // per-tensor - if (bias.has_value()) { - // Compute C_inter=s_a * s_b * (A@B) + bias - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), bias->data_ptr(), - a.size(0), b.size(1), a.size(1), a_scales.data_ptr(), - b_scales.data_ptr(), a_scales.numel(), b_scales.numel()); - } else { - // Compute C_inter=s_a * s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), a_scales.data_ptr(), b_scales.data_ptr(), - a_scales.numel(), b_scales.numel()); - } - - // Compute C=C_inter - s_a * s_b * azp_adj - if (b_scales.numel() != 1) { - // Per-Channel - static_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - *a_scales.data_ptr(), b_scales.data_ptr(), - azp_adj.data_ptr(), a.size(0), b.size(1)); - } else { - // Per-Tensor - static_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - *a_scales.data_ptr(), b_scales.data_ptr(), - azp_adj.data_ptr(), a.size(0), b.size(1)); - } - } - }); -} - -// static-per-tensor quantization. -void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - const torch::Tensor& scale, - std::optional const& azp) { - CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(scale.numel() == 1); - TORCH_CHECK(!azp.has_value() || azp->numel() == 1); - - const int hidden_size = input.size(-1); - const int num_tokens = input.numel() / hidden_size; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "static_scaled_int8_quant_impl", [&] { - if (azp.has_value()) { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), num_tokens, - hidden_size); - } else { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), nullptr, num_tokens, hidden_size); - } - }); -} - -// dynamic-per-token quantization. 
-void dynamic_scaled_int8_quant( - torch::Tensor& out, // [..., hidden_size] - const torch::Tensor& input, // [..., hidden_size] - torch::Tensor& scale, // [..., 1] - std::optional const& azp) { - CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - - int const hidden_size = input.size(-1); - int const num_tokens = input.numel() / hidden_size; - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { - if (azp.has_value()) { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), num_tokens, - hidden_size); - } else { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), nullptr, num_tokens, hidden_size); - } - }); -} - -#if defined(__powerpc64__) -void int8_scaled_mm_ppc64le(torch::Tensor& c, // [M, OC], row-major - const torch::Tensor& a, // [M, IC], row-major - const torch::Tensor& b, // [IC, OC], column-major - const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias // [OC] -) { - CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, - "int8_scaled_mm_ppc64le only supports INT8 inputs."); - TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); - TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && - b.size(1) == c.size(1)); - // We dont need this - TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); - TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major - TORCH_CHECK(b.stride(0) == 1); // Column-major - TORCH_CHECK(c.stride(0) % 16 == 0 && - b.stride(1) % 16 == 0); // 16 Byte Alignment - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - - if (bias) { - TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() && - bias->dim() == 1); - } - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_ppc64le", [&] { - torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - // Compute C_inter=s_b * (A@B) - DNNLPrimitiveHelper::gemm_s8s8_jit( - a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), - a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); - if (bias.has_value()) { - // Compute C=s_a * C_inter + bias - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, - bias->data_ptr(), c.size(0), c.size(1)); - } else { - // Compute C=s_a * C_inter - dynamic_quant_epilogue( - tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, - c.size(0), c.size(1)); - } - }); -} - -#endif diff --git a/csrc/cpu/sgl-kernels/moe.cpp b/csrc/cpu/sgl-kernels/moe.cpp index beeccff783ea..94b24c2f13a0 100644 --- a/csrc/cpu/sgl-kernels/moe.cpp +++ b/csrc/cpu/sgl-kernels/moe.cpp @@ -215,7 +215,7 @@ int moe_align_block_size( offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M); } }); - // TODO: do we need to vecterize this ? + // TODO: do we need to vectorize this ? 
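// The loop below is a running prefix sum (offsets[mb + 1] += offsets[mb])
// that turns per-block counts into cumulative offsets; as written it is
// inherently serial, so vectorizing it would mean a parallel scan. Since
// num_token_blocks is typically small, the scalar loop is unlikely to matter.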
for (int mb = 0; mb < num_token_blocks; ++mb) { offsets[mb + 1] += offsets[mb]; } diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index f1738aee980b..9df19d1ac392 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -6,25 +6,28 @@ std::string init_cpu_threads_env(const std::string& cpu_ids); -void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias); - -void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const torch::Tensor& azp_adj, - const std::optional& azp, - const std::optional& bias); - -#if defined(__powerpc64__) -void int8_scaled_mm_ppc64le(torch::Tensor& c, const torch::Tensor& a, - const torch::Tensor& b, - const torch::Tensor& a_scales, - const torch::Tensor& b_scales, - const std::optional& bias); -#endif +void release_dnnl_matmul_handler(int64_t handler); + +int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b, + const torch::Tensor& b_scales, + at::ScalarType output_type, + bool dynamic_act_quant, bool use_azp, + int64_t primitive_cache_size); + +void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a, + const torch::Tensor& a_scales, + const std::optional& azp, + const std::optional& azp_adj, + const std::optional& bias, + int64_t handler); + +int64_t create_onednn_mm_handler(const torch::Tensor& b, + int64_t primitive_cache_size); + +void onednn_mm(torch::Tensor& c, const torch::Tensor& a, + const std::optional& bias, int64_t handler); + +bool is_onednn_acl_supported(); void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, torch::Tensor& kv_cache, double scale, @@ -87,8 +90,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); + ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1); + ops.def( + "dynamic_4bit_int_moe(" + "Tensor x, Tensor topk_ids, Tensor topk_weights," + "Tensor w13_packed, Tensor w2_packed, int H, int I, int I2," + "int group_size, bool apply_router_weight_on_input, int activation_kind" + ") -> Tensor"); + + ops.impl("dynamic_4bit_int_moe", torch::kCPU, &dynamic_4bit_int_moe_cpu); + // PagedAttention V2. ops.def( "paged_attention_v2(" @@ -151,67 +164,55 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); // Quantization -#if defined(__AVX512F__) || defined(__aarch64__) +#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \ + defined(__powerpc64__) at::Tag stride_tag = at::Tag::needs_fixed_stride_order; + // Helper function to release oneDNN handlers + ops.def("release_dnnl_matmul_handler(int handler) -> ()", + &release_dnnl_matmul_handler); - // Compute int8 quantized tensor for given scaling factor. + // Create oneDNN GEMM handler ops.def( - "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()", - {stride_tag}); - ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); + "create_onednn_mm_handler(Tensor b, int " + "primitive_cache_size) -> int", + &create_onednn_mm_handler); - // Compute int8 quantized tensor and scaling factor + // oneDNN GEMM ops.def( - "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? 
azp) -> ()", - {stride_tag}); - ops.impl("dynamic_scaled_int8_quant", torch::kCPU, - &dynamic_scaled_int8_quant); - // W8A8 GEMM, supporting symmetric per-tensor or per-row/column - // quantization. + "onednn_mm(Tensor! c, Tensor a, Tensor? bias, " + "int handler) -> ()"); + ops.impl("onednn_mm", torch::kCPU, &onednn_mm); + + // Check if oneDNN was built with ACL backend + ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported); + + // Create oneDNN W8A8 handler ops.def( - "cutlass_scaled_mm(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); - ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); - // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column - // quantization. + "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType " + "output_type, bool dynamic_act_quant, bool use_azp, int " + "primitive_cache_size) -> int", + &create_onednn_scaled_mm_handler); + + // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization ops.def( - "cutlass_scaled_mm_azp(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()", - {stride_tag}); - ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); -#elif defined(__powerpc64__) + "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, " + "Tensor? azp_adj, Tensor? bias, int handler) -> ()"); + ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm); + // Compute int8 quantized tensor for given scaling factor. ops.def( "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()"); + "Tensor? azp) -> ()", + {stride_tag}); ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); // Compute int8 quantized tensor and scaling factor ops.def( "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? azp) -> ()"); + "Tensor!? azp) -> ()", + {stride_tag}); ops.impl("dynamic_scaled_int8_quant", torch::kCPU, &dynamic_scaled_int8_quant); - // W8A8 GEMM, supporting symmetric quantization. - ops.def( - "cutlass_scaled_mm(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()"); - ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm_ppc64le); - // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column - // quantization. - ops.def( - "cutlass_scaled_mm_azp(Tensor! out, Tensor a," - " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? 
bias) -> ()"); - ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); #endif // SHM CCL diff --git a/csrc/cub_helpers.h b/csrc/cub_helpers.h new file mode 100644 index 000000000000..18e4e343ad8b --- /dev/null +++ b/csrc/cub_helpers.h @@ -0,0 +1,18 @@ +#pragma once + +#ifndef USE_ROCM + #include + #if CUB_VERSION >= 200800 + #include +using CubAddOp = cuda::std::plus<>; +using CubMaxOp = cuda::maximum<>; + #else // if CUB_VERSION < 200800 +using CubAddOp = cub::Sum; +using CubMaxOp = cub::Max; + #endif // CUB_VERSION +#else + #include +namespace cub = hipcub; +using CubAddOp = hipcub::Sum; +using CubMaxOp = hipcub::Max; +#endif // USE_ROCM diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h index affa051c7595..d7d589db62cf 100644 --- a/csrc/cuda_compat.h +++ b/csrc/cuda_compat.h @@ -4,8 +4,35 @@ #include #endif -#if defined(USE_ROCM) && defined(__GFX9__) - #define WARP_SIZE 64 +#ifdef USE_ROCM +struct Utils { + static __host__ int get_warp_size() { + static bool is_cached = false; + static int result; + + if (!is_cached) { + int device_id; + cudaDeviceProp deviceProp; + cudaGetDevice(&device_id); + cudaGetDeviceProperties(&deviceProp, device_id); + + result = deviceProp.warpSize; + is_cached = true; + } + + return result; + } + + static __device__ constexpr int get_warp_size() { + #ifdef __GFX9__ + return 64; + #else + return 32; + #endif + } +}; + + #define WARP_SIZE Utils::get_warp_size() #else #define WARP_SIZE 32 #endif diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 44709b459776..58926f6429dd 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -15,6 +15,8 @@ typedef __hip_bfloat16 nv_bfloat16; #include #include #include +#include +#include namespace vllm { #define CUDACHECK(cmd) \ @@ -555,22 +557,47 @@ class CustomAllreduce { size /= d; auto bytes = size * sizeof(typename packed_t::P); int blocks = std::min(block_limit, (size + threads - 1) / threads); + + // Check environment variable once + const char* env_algo = std::getenv("VLLM_CUSTOM_ALLREDUCE_ALGO"); + bool force_1stage = false; + bool force_2stage = false; + if (env_algo != nullptr) { + if (std::strcmp(env_algo, "1stage") == 0 || + std::strcmp(env_algo, "oneshot") == 0) { + force_1stage = true; + } else if (std::strcmp(env_algo, "2stage") == 0 || + std::strcmp(env_algo, "twoshot") == 0) { + force_2stage = true; + } else { + throw std::runtime_error( + "Invalid VLLM_CUSTOM_ALLREDUCE_ALGO: " + std::string(env_algo) + + ". 
Valid values: 1stage, oneshot, 2stage, twoshot"); + } + } + #define KL(ngpus, name) \ name<<>>(ptrs, sg_, self_sg_, output, \ rank_, size); -#define REDUCE_CASE(ngpus) \ - case ngpus: { \ - if (world_size_ == 2) { \ - KL(ngpus, cross_device_reduce_1stage); \ - } else if (fully_connected_) { \ - if ((world_size_ <= 4 && bytes < 512 * 1024) || \ - (world_size_ <= 8 && bytes < 256 * 1024)) { \ - KL(ngpus, cross_device_reduce_1stage); \ - } else { \ - KL(ngpus, cross_device_reduce_2stage); \ - } \ - } \ - break; \ +#define REDUCE_CASE(ngpus) \ + case ngpus: { \ + if (force_1stage) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else if (force_2stage) { \ + KL(ngpus, cross_device_reduce_2stage); \ + } else { \ + if (world_size_ == 2) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else if (fully_connected_) { \ + if ((world_size_ <= 4 && bytes < 512 * 1024) || \ + (world_size_ <= 8 && bytes < 256 * 1024)) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else { \ + KL(ngpus, cross_device_reduce_2stage); \ + } \ + } \ + } \ + break; \ } switch (world_size_) { diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 195872e8edd3..f2c1dcf69f69 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -60,3 +60,13 @@ struct enable_sm100_only : Kernel { #endif } }; + +template +struct enable_sm120_only : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1200 + Kernel::operator()(std::forward(args)...); +#endif + } +}; diff --git a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp deleted file mode 100644 index ec75c29e54f4..000000000000 --- a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp +++ /dev/null @@ -1,123 +0,0 @@ -// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl -// clang-format off -#pragma once - -#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl" - -#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp" - - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// GMMA_TMA_WS_SS (BlockScaled Builders) -template < - class ElementA, - class GmemLayoutATag, - int AlignmentA, - class ElementB, - class GmemLayoutBTag, - int AlignmentB, - class ElementAccumulator, - class TileShape_MNK, - class ClusterShape_MNK, - class StageCountType, - int ScaleGranularityM -> -struct CollectiveBuilder< - arch::Sm90, - arch::OpClassTensorOp, - ElementA, - GmemLayoutATag, - AlignmentA, - ElementB, - GmemLayoutBTag, - AlignmentB, - ElementAccumulator, - TileShape_MNK, - ClusterShape_MNK, - StageCountType, - KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum, - cute::enable_if_t< - not detail::is_use_rmem_A()> -> { - using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum; - - static_assert(is_static::value); - static_assert(is_static::value); -#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED - static_assert(cutlass::detail::dependent_false, "Unsupported Toolkit for SM90 Collective Builder\n"); -#endif - static_assert(detail::is_aligned(), - "Should meet TMA alignment requirement\n"); - - static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v); - 
static constexpr bool IsFP8Input = detail::is_input_fp8(); - static_assert((!IsFP8Input || !IsArrayOfPointersGemm), - "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now."); - - // For fp32 types, map to tf32 MMA value type - using ElementAMma = cute::conditional_t, tfloat32_t, ElementA>; - using ElementBMma = cute::conditional_t, tfloat32_t, ElementB>; - - static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A(); - static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B(); - - static constexpr bool IsCooperative = cute::is_any_of_v>; - using AtomLayoutMNK = cute::conditional_t>, Layout>>; - - using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector< - ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{})); - - using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); - using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{}))); - - using SmemLayoutAtomA = decltype(detail::ss_smem_selector< - GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); - using SmemLayoutAtomB = decltype(detail::ss_smem_selector< - GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); - - static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; - static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); - - static constexpr int PipelineStages = detail::compute_stage_count_or_override(StageCountType{}); - using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; - - using SmemCopyAtomA = void; - using SmemCopyAtomB = void; - - using CollectiveOp = CollectiveMma< - DispatchPolicy, - TileShape_MNK, - ElementA, - TagToStrideA_t, - ElementB, - TagToStrideB_t, - TiledMma, - GmemTiledCopyA, - SmemLayoutAtomA, - SmemCopyAtomA, - cute::identity, - GmemTiledCopyB, - SmemLayoutAtomB, - SmemCopyAtomB, - cute::identity - >; -}; - - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp deleted file mode 100644 index 13b90e998625..000000000000 --- a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp +++ /dev/null @@ -1,183 +0,0 @@ -// clang-format off -// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp - -/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#pragma once - -#include "cute/algorithm/clear.hpp" -#include "cute/tensor.hpp" - -////////////////////////////////////////////////////////////////////////////// -///////////////////////////////////FP8 Accumulation/////////////////////////// -////////////////////////////////////////////////////////////////////////////// -/// This class provides API to promote (add) or scale (multiply_add) the results -/// from the tensor core accumulators to the main accumulators when the number -/// of MMAs reaches the max number of MMA interval specified by user, after that -/// the tensor core accumulators are zeroed. -////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { - -template < - class EngineAccum, - class LayoutAccum> -struct GmmaFP8AccumulationWithScale { - using TensorAccum = cute::Tensor; - using ElementAccumulator = typename EngineAccum::value_type; - - static_assert(is_static::value, "Accumulator Layout should be static"); - static_assert(is_rmem::value , "Accumulator tensor must be rmem resident."); - -private: - TensorAccum& accum_; - TensorAccum accum_temp_; - - uint32_t accum_promotion_interval_; // defines the max num of executed MMAs after which accum should be promoted. - uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop - uint32_t mma_count_; // current executed MMAs - uint32_t reset_accum_flag_; // accum needs to be zeroed or not. - - // promote or `add` the partial accumulators to main accumulator (FADD). - CUTLASS_DEVICE - void promote_core() { - warpgroup_wait<0>(); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(accum_); ++i) { - accum_(i) += accum_temp_(i); - } - } - - // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA). 
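// Concretely, scale_core below performs acc(i) += temp(i) * scale(i) for each
// accumulator element (one FFMA per element), while promote_core above is the
// plain acc(i) += temp(i) FADD path; both wait on warpgroup_wait<0>() and then
// drain the temporary tensor-core accumulator into the main accumulator.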
- template < - class EngineScale, - class LayoutScale> - CUTLASS_DEVICE - void scale_core(const cute::Tensor &scale) { - using TensorScale = cute::Tensor; - - static_assert(is_static::value, "Scale Layout should be static"); - static_assert(is_rmem::value , "Scale tensor must be rmem resident."); - - static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape."); - - warpgroup_wait<0>(); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(accum_); ++i) { - accum_(i) += accum_temp_(i) * scale(i); - } - } - -public: - CUTLASS_DEVICE - GmmaFP8AccumulationWithScale( - TensorAccum &accum, - uint32_t accum_promotion_interval, - uint32_t mma_count_per_mainloop_iteration) - : accum_(accum), - accum_promotion_interval_(accum_promotion_interval), - mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration), - mma_count_(0), - reset_accum_flag_(0) - { - accum_temp_ = cute::make_fragment_like(accum); - } - - // - // Methods (Common) - // - - CUTLASS_DEVICE - TensorAccum& operator()() { - return accum_temp_; - } - - /// prepare the MMA accumulators when initialization or zeroing is required. - CUTLASS_DEVICE - bool prepare_if_needed() { - return reset_accum_flag_; - } - - // - // Methods (for FADD version) - // - - /// promote (add) the results from the MMA accumulators to main accumulator if needed. - CUTLASS_DEVICE - void promote_if_needed() { - mma_count_ += mma_count_per_mainloop_iteration_; - reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); - if (reset_accum_flag_) { - promote_core(); - mma_count_ = 0; - } - } - - /// promote (add) the residue results from the MMA accumulators to main accumulator if needed. - CUTLASS_DEVICE - void promote_residue_if_needed() { - if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { - promote_core(); - } - } - - // - // Methods (for FFMA version) - // - - /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed. - template < - class EngineScale, - class LayoutScale> - CUTLASS_DEVICE - void scale_if_needed(const cute::Tensor &scale) { - mma_count_ += mma_count_per_mainloop_iteration_; - reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0); - if (reset_accum_flag_) { - scale_core(scale); - mma_count_ = 0; - } - } - - /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed. - template < - class EngineScale, - class LayoutScale> - CUTLASS_DEVICE - void scale_residue_if_needed(const cute::Tensor &scale) { - if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) { - scale_core(scale); - } - } -}; - -} // namespace cutlass::gemm::collective diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp deleted file mode 100644 index ce7f47cf7233..000000000000 --- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ /dev/null @@ -1,729 +0,0 @@ -// clang-format off -// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp - -/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/gemm/dispatch_policy.hpp" -#include "cutlass/trace.h" -#include "cutlass/numeric_types.h" - -#include "cute/arch/cluster_sm90.hpp" -#include "cute/arch/copy_sm80.hpp" -#include "cute/arch/copy_sm90.hpp" -#include "cute/algorithm/functional.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cute/algorithm/gemm.hpp" -#include "cute/numeric/arithmetic_tuple.hpp" - -#include "cutlass_extensions/gemm/dispatch_policy.hpp" -#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { -using namespace cute; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// WarpSpecialized Mainloop -template < - int Stages, - class ClusterShape, - class KernelSchedule, - int ScaleGranularityM_, - class TileShape_, - class ElementA_, - class StrideA_, - class ElementB_, - class StrideB_, - class TiledMma_, - class GmemTiledCopyA_, - class SmemLayoutAtomA_, - class SmemCopyAtomA_, - class TransformA_, - class GmemTiledCopyB_, - class SmemLayoutAtomB_, - class SmemCopyAtomB_, - class TransformB_> -struct CollectiveMma< - MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8, - TileShape_, - ElementA_, - StrideA_, - ElementB_, - StrideB_, - TiledMma_, - GmemTiledCopyA_, - SmemLayoutAtomA_, - SmemCopyAtomA_, - TransformA_, - GmemTiledCopyB_, - SmemLayoutAtomB_, - SmemCopyAtomB_, - TransformB_> -{ - // - // Type Aliases - // - using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8; - using TileShape = TileShape_; - using ElementA = ElementA_; - using StrideA = StrideA_; - using ElementB = ElementB_; - using StrideB = StrideB_; - using TiledMma = TiledMma_; - using ElementAccumulator 
= typename TiledMma::ValTypeC; - using ElementBlockScale = ElementAccumulator; - using GmemTiledCopyA = GmemTiledCopyA_; - using GmemTiledCopyB = GmemTiledCopyB_; - using SmemLayoutAtomA = SmemLayoutAtomA_; - using SmemLayoutAtomB = SmemLayoutAtomB_; - using SmemCopyAtomA = SmemCopyAtomA_; - using SmemCopyAtomB = SmemCopyAtomB_; - using TransformA = TransformA_; - using TransformB = TransformB_; - using ArchTag = typename DispatchPolicy::ArchTag; - - using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{})); - using MainloopPipeline = cutlass::PipelineTmaAsync; - using PipelineState = cutlass::PipelineState; - using PipelineParams = typename MainloopPipeline::Params; - - // Two threads per CTA are producers (1 for operand tile and 32 for scales) - static constexpr int NumProducerThreadEvents = 33; - - static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_; - static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; - - static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); - static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - - static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); - static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape."); - - static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M."); - - // Tile along modes in a way that maximizes the TMA box size. - using SmemLayoutA = decltype(tile_to_shape( - SmemLayoutAtomA{}, - make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int{}), - cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); - using SmemLayoutB = decltype(tile_to_shape( - SmemLayoutAtomB{}, - make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), - cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); - - // Block scaling gmem-to-smem copy atom - using SmemBlockScalingCopyAtomA = Copy_Atom, ElementBlockScale>; - using SmemBlockScalingCopyAtomB = Copy_Atom, ElementBlockScale>; - - // Block scaling smem layout - using SmemLayoutScaleA = Layout, Int>>; - using SmemLayoutScaleB = Layout>, Stride<_1>>; // `ScaleNsPerTile` is always 1. 
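// Per pipeline stage this amounts to ScaleMsPerTile block-scale values for A
// (one per ScaleGranularityM rows of the M tile) and a single value for B
// (one per K tile), matching the smem_scale_A / smem_scale_B buffers declared
// in SharedStorage below.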
- - static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); - static_assert(cute::is_base_of::value && - cute::is_base_of::value, - "MMA atom must source both A and B operand from smem_desc for this mainloop."); - static_assert(cute::is_same_v || cute::is_same_v, - "GmemTiledCopy - invalid SM90 TMA copy atom specified."); - static_assert(cute::is_same_v || cute::is_same_v, - "GmemTiledCopy - invalid SM90 TMA copy atom specified."); - static_assert(cute::is_same_v, - "ElementAccumulator and ElementBlockScale should be same datatype"); - - struct SharedStorage - { - struct TensorStorage : cute::aligned_struct<128> { - cute::array_aligned> smem_A; // mxk - cute::array_aligned> smem_B; // nxk - cute::array_aligned> smem_scale_A; // ScaleMsPerTile x k - cute::array_aligned> smem_scale_B; // 1xk - } tensors; - - using PipelineStorage = typename MainloopPipeline::SharedStorage; - PipelineStorage pipeline; - }; - using TensorStorage = typename SharedStorage::TensorStorage; - using PipelineStorage = typename SharedStorage::PipelineStorage; - - // Host side kernel arguments - struct Arguments { - ElementA const* ptr_A; - StrideA dA; - ElementB const* ptr_B; - StrideB dB; - ElementBlockScale const* ptr_scale_A; - ElementBlockScale const* ptr_scale_B; - }; - - // Device side kernel params - struct Params { - // Assumption: StrideA is congruent with Problem_MK - using TMA_A = decltype(make_tma_copy_A_sm90( - GmemTiledCopyA{}, - make_tensor(static_cast(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), - SmemLayoutA{}(_,_,0), - TileShape{}, - ClusterShape{})); - // Assumption: StrideB is congruent with Problem_NK - using TMA_B = decltype(make_tma_copy_B_sm90( - GmemTiledCopyB{}, - make_tensor(static_cast(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), - SmemLayoutB{}(_,_,0), - TileShape{}, - ClusterShape{})); - TMA_A tma_load_a; - TMA_B tma_load_b; - uint32_t tma_transaction_bytes = TmaTransactionBytes; - uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK; - uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK; - // Block scaling factors for A and B - ElementBlockScale const* ptr_scale_A; - ElementBlockScale const* ptr_scale_B; - }; - - // - // Methods - // - - template - static constexpr Params - to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { - (void) workspace; - - // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) - auto problem_shape_MNKL = append<4>(problem_shape, 1); - auto [M,N,K,L] = problem_shape_MNKL; - - auto ptr_A = reinterpret_cast(args.ptr_A); - auto ptr_B = reinterpret_cast(args.ptr_B); - - Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA)); - Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB)); - typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90( - GmemTiledCopyA{}, - tensor_a, - SmemLayoutA{}(_,_,cute::Int<0>{}), - TileShape{}, - ClusterShape{}); - typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90( - GmemTiledCopyB{}, - tensor_b, - SmemLayoutB{}(_,_,cute::Int<0>{}), - TileShape{}, - ClusterShape{}); - uint32_t transaction_bytes_mk = TmaTransactionBytesMK; - uint32_t transaction_bytes_nk = TmaTransactionBytesNK; - uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk; - - return { - tma_load_a, - tma_load_b, - transaction_bytes, - transaction_bytes_mk, - transaction_bytes_nk, - args.ptr_scale_A, - args.ptr_scale_B - }; - } - - 
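// Note that can_implement below only re-checks the 128-bit TMA alignment of A
// and B; the block-scale tensors are loaded with cp.async rather than TMA, so
// they carry no extra alignment requirement here.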
template - static bool - can_implement( - ProblemShape const& problem_shape, - [[maybe_unused]] Arguments const& args) { - constexpr int tma_alignment_bits = 128; - auto problem_shape_MNKL = append<4>(problem_shape, 1); - auto [M,N,K,L] = problem_shape_MNKL; - - bool implementable = true; - constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; - implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{}); - constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; - implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), StrideB{}); - - if (!implementable) { - CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); - } - return implementable; - } - - static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; - static constexpr int K_PIPE_MMAS = 1; - static constexpr uint32_t TmaTransactionBytesMK = - cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof_bits::value)); - static constexpr uint32_t TmaTransactionBytesNK = - cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof_bits::value)); - static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK; - - /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance - CUTLASS_DEVICE - static void prefetch_tma_descriptors(Params const& mainloop_params) - { - cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); - cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); - } - - /// Set up the data needed by this collective for load and mma. - /// Returns a tuple of tensors. 
The collective and the kernel layer have the contract - /// Returned tuple must contain at least two elements, with the first two elements being: - /// gA_mkl - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k,l) - /// gB_nkl - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k,l) - template - CUTLASS_DEVICE auto - load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const { - using X = Underscore; - // Separate out problem shape for convenience - auto [M,N,K,L] = problem_shape_MNKL; - - // TMA requires special handling of strides to deal with coord codomain mapping - // Represent the full tensors -- get these from TMA - Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L)); // (m,k,l) - Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L)); // (n,k,l) - - // Make tiled views, defer the slice - Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k,l) - Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k,l) - - constexpr auto scales_m = Int{}; - auto tM = get<2>(gA_mkl.shape()); - auto tN = get<2>(gB_nkl.shape()); - auto tK = get<3>(gA_mkl.shape()); - - // Make the tiled views of scale tensors - auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l) - auto scaleA_layout = make_ordered_layout(scaleA_shape, Step<_0, _1, _2>{}); - auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l) - auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{}); - - // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and - // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl. 
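// Concretely, A carries one scale per ScaleGranularityM rows and per K tile
// (shape M/ScaleGranularityM x k_tiles x L), while B carries one scale per
// (N tile, K tile) pair (shape n_tiles x k_tiles x L), so only the per-CTA
// slicing in load() remains to be done.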
- Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l) - Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l) - - return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl); - } - - /// Perform a collective-scoped matrix multiply-accumulate - /// Producer Perspective - template < - class TensorA, class TensorB, - class TensorScaleA, class TensorScaleB, - class KTileIterator, class BlockCoord - > - CUTLASS_DEVICE void - load( - Params const& mainloop_params, - MainloopPipeline pipeline, - PipelineState smem_pipe_write, - cute::tuple const& load_inputs, - BlockCoord const& blk_coord, - KTileIterator k_tile_iter, int k_tile_count, - int thread_idx, - uint32_t block_rank_in_cluster, - TensorStorage& shared_tensors) { - int lane_predicate = cute::elect_one_sync(); - - // Blockscaling: Tma loads for load_input and CpAsync for load_scale - Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) - Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) - Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k) - Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) - - // - // Prepare the TMA loads for A and B - // - - constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); - uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; - - Tensor gA_mkl = get<0>(load_inputs); - Tensor gB_nkl = get<1>(load_inputs); - - auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); - auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); - - // Partition the inputs based on the current block coordinates. 
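// blk_coord is this CTA's (m, n, k, l) output-tile coordinate; the slices
// below keep the k-tile mode open (the remaining `_`), so gA and gB become
// (BLK_M,BLK_K,k) and (BLK_N,BLK_K,k) stacks of K tiles for this output tile.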
- auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; - Tensor gA = gA_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) - Tensor gB = gB_nkl(_,_,n_coord,_,l_coord); // (BLK_N,BLK_K,k) - - - // Block scaling: load_scale has scaling tensors in global memory which are not tiled - Tensor mScaleA_mkl = get<2>(load_inputs); - Tensor mScaleB_nkl = get<3>(load_inputs); - auto scales_m = get<0>(mScaleA_mkl.shape()); - - Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape()); - - Tensor gScaleA = local_tile( - mScaleA_mkl, make_tile(Int{}), - make_coord(m_coord,_,l_coord)); // (ScaleMsPerTile,k,1) - Tensor cScaleA = local_tile( - cScaleA_mkl, make_tile(Int{}), - make_coord(m_coord,_,l_coord)); - Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord); // (1,k,1) - - // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128 - TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, - Layout>{}, Layout>{}); // (1,1,1) - TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, - Layout>{}, Layout>{}); // (1,1,1) - ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); - ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x); - - Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA); - Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA); - Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA); - - Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB); - Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB); - - // Applies the mapping from block_tma_a - Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) - Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) - - Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) - Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) - - uint16_t mcast_mask_a = 0; - uint16_t mcast_mask_b = 0; - - // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors - // Maps the tile -> block, value - if constexpr (cute::is_same_v) { - auto block_layout = Layout{}; // (m,n) -> block_id - for (int n = 0; n < size<1>(block_layout); ++n) { - mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); - } - } - - if constexpr (cute::is_same_v) { - auto block_layout = Layout{}; // (m,n) -> block_id - for (int m = 0; m < size<0>(block_layout); ++m) { - mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); - } - } - - // Allocate predicate tensors for a_scales (since we can't guarantee that - // all scales are valid, since we could have a partial tiles along M) - Tensor tApA_ScaleA = make_tensor(shape(tAsA_ScaleA(_,_,0))); - #pragma unroll - for (int i = 0; i < size(tApA_ScaleA); ++i) { - tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m; - } - - // Mainloop - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) { - // LOCK smem_pipe_write for _writing_ - pipeline.producer_acquire(smem_pipe_write); - - // - // Copy gmem to smem for *k_tile_iter - // - int write_stage = smem_pipe_write.index(); - using BarrierType = typename MainloopPipeline::ProducerBarrierType; - BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); - - // Copy operands A and B from global memory to shared memory - if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); - if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); - - 
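// Only the single elected lane issues the TMA copies above; the scale copies
// below are ordinary cp.async tiled copies in which the producer lanes take
// part directly (cf. NumProducerThreadEvents = 33: one arrival for the
// operand TMA plus 32 for the scale loads).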
// Copy scale tensors from global memory to shared memory - copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage)); - copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage)); - pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc); - - ++k_tile_iter; - - // Advance smem_pipe_write - ++smem_pipe_write; - } - } - - /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster - CUTLASS_DEVICE void - load_tail( - MainloopPipeline pipeline, - PipelineState smem_pipe_write) { - int lane_predicate = cute::elect_one_sync(); - - // Issue the epilogue waits - if (lane_predicate) { - /* This helps avoid early exit of blocks in Cluster - * Waits for all stages to either be released (all - * Consumer UNLOCKs), or if the stage was never used - * then would just be acquired since the phase was - * still inverted from make_producer_start_state - */ - pipeline.producer_tail(smem_pipe_write); - } - } - - /// Perform a collective-scoped matrix multiply-accumulate - /// Consumer Perspective - template < - class FrgTensorC - > - CUTLASS_DEVICE void - mma(MainloopPipeline pipeline, - PipelineState smem_pipe_read, - FrgTensorC& accum, - int k_tile_count, - int thread_idx, - TensorStorage& shared_tensors, - Params const& mainloop_params) { - - - static_assert(is_rmem::value, "C tensor must be rmem resident."); - static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3."); - static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3."); - static_assert(cute::is_void_v, - "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); - static_assert(cute::is_void_v, - "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions."); - - Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) - Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) - - // Block scaling - Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), - Layout< - Shape, Int>, cute::tuple_element_t<1, TileShape>, Int>, - Stride, _0, Int> - >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k) - Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k) - - // - // Define C accumulators and A/B partitioning - // - - // Layout of warp group to thread mapping - - static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and - stride<0>(typename TiledMma::BLayout{}) == 0 and - size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and - size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, - "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup"); - - constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup; - Layout warp_group_thread_layout = make_layout(Int{}, - Int{}); - - int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0); - - TiledMma tiled_mma; - auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); - - Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC); // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C. 
- - Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) - Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) - - // Allocate "fragments/descriptors" - Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) - Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) - - CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M - CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N - CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K - CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE - CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE - CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE - - // - // PIPELINED MAIN LOOP - // - static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), - "ERROR : Incorrect number of MMAs in flight"); - - // We release buffers to producer warps(dma load) with some mmas in flight - PipelineState smem_pipe_release = smem_pipe_read; - - // Per block scale values for operand A and B - - using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout. - using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above - - Tensor tCrScaleAViewAsC = make_tensor(RegLayoutScaleAViewAsC{}); // (MMA,MMA_M,MMA_N) - ElementBlockScale scale_b; - - // Prologue GMMAs - int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - - GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA)); - warpgroup_fence_operand(accumulation()); - CUTLASS_PRAGMA_UNROLL - for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) - { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); - pipeline.consumer_wait(smem_pipe_read, barrier_token); - - if (accumulation.prepare_if_needed()) { - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - - int read_stage = smem_pipe_read.index(); - - // Load per block scale values from shared memory to registers. - scale_b = sScaleB[read_stage]; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); - } - if constexpr (ScaleMsPerTile == 1) { - static_assert(size(RegLayoutScaleAEssential{}) == 1); - tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
- } else { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; - } - } - - warpgroup_arrive(); - // Unroll the K mode manually to set scale D to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M,K) x (V,N,K) => (V,M,N) - cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); - tiled_mma.accumulate_ = GMMA::ScaleOut::One; - } - warpgroup_commit_batch(); - - // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` - accumulation.scale_if_needed(tCrScaleAViewAsC); - - ++smem_pipe_read; - } - - warpgroup_fence_operand(accumulation()); - // Mainloop GMMAs - k_tile_count -= prologue_mma_count; - - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) - { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); - pipeline.consumer_wait(smem_pipe_read, barrier_token); - - // - // Compute on k_tile - // - - int read_stage = smem_pipe_read.index(); - - // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) - scale_b = sScaleB[read_stage]; - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{})); - } - if constexpr (ScaleMsPerTile == 1) { - static_assert(size(RegLayoutScaleAEssential{}) == 1); - tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`. 
- } else { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { - tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b; - } - } - - if (accumulation.prepare_if_needed()) { - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - - warpgroup_fence_operand(accumulation()); - warpgroup_arrive(); - // Unroll the K mode manually to set scale D to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M,K) x (V,N,K) => (V,M,N) - cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); - tiled_mma.accumulate_ = GMMA::ScaleOut::One; - } - warpgroup_commit_batch(); - - /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed - warpgroup_wait(); - warpgroup_fence_operand(accumulation()); - - // Block scale the accumulators with reg tensor `tCrScaleAViewAsC` - accumulation.scale_if_needed(tCrScaleAViewAsC); - - pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it - - // Advance smem_pipe_read and smem_pipe_release - ++smem_pipe_read; - ++smem_pipe_release; - } - - accumulation.scale_residue_if_needed(tCrScaleAViewAsC); - - warpgroup_fence_operand(accumulation()); - } - - /// Perform a Consumer Epilogue to release all buffers - CUTLASS_DEVICE void - mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { - // Prologue GMMAs - int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - k_tile_count -= prologue_mma_count; - - smem_pipe_release.advance(k_tile_count); - - // Wait on all GMMAs to complete - warpgroup_wait<0>(); - - for (int count = 0; count < prologue_mma_count; ++count) { - pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it - ++smem_pipe_release; - } - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp deleted file mode 100644 index df809e27a3ef..000000000000 --- a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "cutlass/gemm/dispatch_policy.hpp" - -namespace cutlass::gemm { - -////////////////////////////////////////////////////////////////////////////// - -// FP8 related policies (including Blocked Scaled Accumulation) -// `ScaleGranularityM` specifies scaling granularity along M, while zero-value -// `ScaleGranularityM` indicates that scaling granularity is -// `size<0>(TileShape_MNK{})` along M. -template -struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum - : KernelTmaWarpSpecializedCooperative {}; - -// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp -// specialized dynamic schedule For FP8 kernels with Block Scaling -template , - class KernelSchedule = KernelTmaWarpSpecialized, - int ScaleGranularityM = - 0 // `ScaleGranularityM` specifies scaling granularity along M, - // while zero-value `ScaleGranularityM` indicates that scaling - // granularity is `size<0>(TileShape_MNK{})` along M. 
- > -struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8 - : MainloopSm90TmaGmmaWarpSpecialized { - static_assert( - cute::is_same_v< - KernelSchedule, - KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< - ScaleGranularityM>>, - "KernelSchedule must be one of the warp specialized policies"); -}; - -////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm \ No newline at end of file diff --git a/csrc/cutlass_extensions/vllm_collective_builder.cuh b/csrc/cutlass_extensions/vllm_collective_builder.cuh index e7fbba4cd4b0..085ee1290031 100644 --- a/csrc/cutlass_extensions/vllm_collective_builder.cuh +++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh @@ -1,6 +1,6 @@ #pragma once -#include "cutlass_extensions/gemm/collective/collective_builder.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" namespace cutlass::gemm::collective { using namespace cute; diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index 1dd7101acc27..34fb64c413db 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum -from typing import Union from cutlass_library import * @@ -22,31 +21,31 @@ class MixedInputKernelScheduleType(enum.Enum): TmaWarpSpecializedCooperative = enum_auto() -VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = { **DataTypeNames, # type: ignore **{ VLLMDataType.u4b8: "u4b8", VLLMDataType.u8b128: "u8b128", - } + }, } -VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = { **DataTypeTag, # type: ignore **{ VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t", - } + }, } -VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { +VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = { **DataTypeSize, # type: ignore **{ VLLMDataType.u4b8: 4, VLLMDataType.u8b128: 8, - } + }, } -VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u8b128: "vllm::kU8B128", DataType.u4: "vllm::kU4", @@ -57,7 +56,7 @@ class MixedInputKernelScheduleType(enum.Enum): DataType.bf16: "vllm::kBfloat16", } -VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = { DataType.u8: "at::ScalarType::Byte", DataType.s8: "at::ScalarType::Char", DataType.e4m3: "at::ScalarType::Float8_e4m3fn", @@ -67,15 +66,11 @@ class MixedInputKernelScheduleType(enum.Enum): DataType.f32: "at::ScalarType::Float", } -VLLMKernelScheduleTag: dict[Union[ - MixedInputKernelScheduleType, KernelScheduleType], str] = { - **KernelScheduleTag, # type: ignore - **{ - MixedInputKernelScheduleType.TmaWarpSpecialized: - "cutlass::gemm::KernelTmaWarpSpecialized", - MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: - "cutlass::gemm::KernelTmaWarpSpecializedPingpong", - MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: - "cutlass::gemm::KernelTmaWarpSpecializedCooperative", - } - } +VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = { + **KernelScheduleTag, # type: ignore 
+ **{ + MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501 + MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong", # noqa: E501 + MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative", # noqa: E501 + }, +} diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index f7b75c48373f..995374a50b03 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -19,6 +19,13 @@ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) +#define VLLM_DISPATCH_CASE_HALF_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_HALF_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_HALF_TYPES(__VA_ARGS__)) + // ROCm devices might use either fn or fnuz, so set up dispatch table for both. // A host-based check at runtime will create a preferred FP8 type for ROCm // such that the correct kernel is dispatched. diff --git a/csrc/launch_bounds_utils.h b/csrc/launch_bounds_utils.h new file mode 100644 index 000000000000..92d7ef802f97 --- /dev/null +++ b/csrc/launch_bounds_utils.h @@ -0,0 +1,64 @@ +#pragma once + +#include +#include + +// maximum blocks per SM cap +#ifndef VLLM_LAUNCH_BLOCKS_CAP + #define VLLM_LAUNCH_BLOCKS_CAP 4 +#endif + +// Compile-time estimate of max threads per SM for launch bounds. +// Families: 1024, 1536, 2048 threads/SM. +#ifndef VLLM_MAX_THREADS_PER_SM + #ifdef __CUDA_ARCH__ + + /* 1024 thr/SM: Turing (sm_75) */ + #if (__CUDA_ARCH__ == 750) + #define VLLM_MAX_THREADS_PER_SM 1024 + + /* 1536 thr/SM: Ampere GA10x (sm_86/87), Ada (sm_89), + GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */ + #elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \ + (__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) || \ + (__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \ + (__CUDA_ARCH__ == 1210) + #define VLLM_MAX_THREADS_PER_SM 1536 + + /* 2048 thr/SM: Volta (sm_70/72), Ampere GA100 (sm_80), + Hopper (sm_90), Blackwell (sm_100/103) */ + #elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \ + (__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) || \ + (__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030) + #define VLLM_MAX_THREADS_PER_SM 2048 + + /* Fallback: use 2048 for unknown future CCs */ + #else + #define VLLM_MAX_THREADS_PER_SM 2048 + #endif + + #else + /* Host pass (no __CUDA_ARCH__): neutral default */ + #define VLLM_MAX_THREADS_PER_SM 2048 + #endif +#endif + +// compute the number of blocks per SM to request in __launch_bounds__ +#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL)) +#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \ + (((VAL) <= 0) \ + ? 1 \ + : (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP)) +#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \ + VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS)) + +// runtime-time helper to compute blocks/SM +static inline int vllm_runtime_blocks_per_sm(int block_threads) { + int device = -1; + cudaGetDevice(&device); + int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM; + cudaDeviceGetAttribute(&max_threads_per_sm, + cudaDevAttrMaxThreadsPerMultiProcessor, device); + int blocks = (block_threads > 0) ? 
(max_threads_per_sm / block_threads) : 1; + return VLLM_CLAMP_BLOCKS_PER_SM(blocks); +} diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index d073dd6d2dee..8cfcf9f41283 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -1,35 +1,44 @@ #include "type_convert.cuh" #include "dispatch_utils.h" +#include "cub_helpers.h" +#include "core/batch_invariant.hpp" +#include "quantization/vectorization_utils.cuh" #include #include -#ifndef USE_ROCM - #include -#else - #include -#endif - namespace vllm { // TODO(woosuk): Further optimize this kernel. template __global__ void rms_norm_kernel( - scalar_t* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride, const scalar_t* __restrict__ weight, // [hidden_size] const float epsilon, const int num_tokens, const int hidden_size) { __shared__ float s_variance; float variance = 0.0f; - - for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - const float x = (float)input[blockIdx.x * hidden_size + idx]; + const scalar_t* input_row = input + blockIdx.x * input_stride; + + constexpr int VEC_SIZE = 8; + auto vec_op = [&variance](const vec_n_t& vec) { +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + float x = static_cast(vec.val[i]); + variance += x * x; + } + }; + auto scalar_op = [&variance](const scalar_t& val) { + float x = static_cast(val); variance += x * x; - } + }; + vllm::vectorize_read_with_alignment( + input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op); using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); @@ -37,7 +46,7 @@ __global__ void rms_norm_kernel( __syncthreads(); for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - float x = (float)input[blockIdx.x * hidden_size + idx]; + float x = (float)input[blockIdx.x * input_stride + idx]; out[blockIdx.x * hidden_size + idx] = ((scalar_t)(x * s_variance)) * weight[idx]; } @@ -50,7 +59,8 @@ __global__ void rms_norm_kernel( template __global__ std::enable_if_t<(width > 0) && _typeConvert::exists> fused_add_rms_norm_kernel( - scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride, scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] const float epsilon, const int num_tokens, const int hidden_size) { @@ -59,6 +69,7 @@ fused_add_rms_norm_kernel( static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); const int vec_hidden_size = hidden_size / width; + const int64_t vec_input_stride = input_stride / width; __shared__ float s_variance; float variance = 0.0f; /* These and the argument pointers are all declared `restrict` as they are @@ -73,7 +84,8 @@ fused_add_rms_norm_kernel( for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { int id = blockIdx.x * vec_hidden_size + idx; - _f16Vec temp = input_v[id]; + int64_t strided_id = blockIdx.x * vec_input_stride + idx; + _f16Vec temp = input_v[strided_id]; temp += residual_v[id]; variance += temp.sum_squares(); residual_v[id] = temp; @@ -81,7 +93,7 @@ 
fused_add_rms_norm_kernel( using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); @@ -90,10 +102,11 @@ fused_add_rms_norm_kernel( for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { int id = blockIdx.x * vec_hidden_size + idx; + int64_t strided_id = blockIdx.x * vec_input_stride + idx; _f16Vec temp = residual_v[id]; temp *= s_variance; temp *= weight_v[idx]; - input_v[id] = temp; + input_v[strided_id] = temp; } } @@ -103,7 +116,8 @@ fused_add_rms_norm_kernel( template __global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> fused_add_rms_norm_kernel( - scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride, scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] const float epsilon, const int num_tokens, const int hidden_size) { @@ -111,7 +125,7 @@ fused_add_rms_norm_kernel( float variance = 0.0f; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - scalar_t z = input[blockIdx.x * hidden_size + idx]; + scalar_t z = input[blockIdx.x * input_stride + idx]; z += residual[blockIdx.x * hidden_size + idx]; float x = (float)z; variance += x * x; @@ -120,7 +134,7 @@ fused_add_rms_norm_kernel( using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); @@ -129,7 +143,7 @@ fused_add_rms_norm_kernel( for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { float x = (float)residual[blockIdx.x * hidden_size + idx]; - input[blockIdx.x * hidden_size + idx] = + input[blockIdx.x * input_stride + idx] = ((scalar_t)(x * s_variance)) * weight[idx]; } } @@ -141,38 +155,52 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] double epsilon) { TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(input.stride(-1) == 1); TORCH_CHECK(weight.is_contiguous()); int hidden_size = input.size(-1); - int num_tokens = input.numel() / hidden_size; + + // We cannot just use `input.stride(-2)` if the tensor is not row-major. + // Instead, we use a 2d view to get the second-innermost stride. + // That way the dimensions (except the last one) can be arbitrarily permuted. 
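The relaxed check here (only input.stride(-1) == 1 instead of full contiguity) lets rms_norm accept inputs whose leading dimensions are strided, for example a column slice of a larger buffer. Below is a minimal PyTorch reference sketch of the semantics the strided path has to preserve; it is illustrative only, not the CUDA kernel, and the helper name rms_norm_ref is made up here.

import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # Only the innermost dimension must be contiguous; leading dims may be
    # strided, matching the relaxed TORCH_CHECK(input.stride(-1) == 1).
    assert x.stride(-1) == 1
    xf = x.float()
    var = xf.pow(2).mean(dim=-1, keepdim=True)
    return (xf * torch.rsqrt(var + eps)).to(x.dtype) * weight

hidden = 128
buf = torch.randn(8, 2 * hidden, dtype=torch.float16)
x = buf[:, :hidden]                       # stride(-2) == 2 * hidden, stride(-1) == 1
w = torch.ones(hidden, dtype=torch.float16)
# Strided and contiguous inputs must give the same result.
torch.testing.assert_close(rms_norm_ref(x, w, 1e-6),
                           rms_norm_ref(x.contiguous(), w, 1e-6))
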
+ torch::Tensor input_view = input.view({-1, hidden_size}); + + int num_tokens = input_view.numel() / hidden_size; + int64_t input_stride = input_view.stride(-2); dim3 grid(num_tokens); dim3 block(std::min(hidden_size, 1024)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { - vllm::rms_norm_kernel<<>>( - out.data_ptr(), input.data_ptr(), - weight.data_ptr(), epsilon, num_tokens, hidden_size); - }); + VLLM_DISPATCH_FLOATING_TYPES( + input_view.scalar_type(), "rms_norm_kernel", [&] { + vllm::rms_norm_kernel<<>>( + out.data_ptr(), input_view.data_ptr(), + input_stride, weight.data_ptr(), epsilon, num_tokens, + hidden_size); + }); } -#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ - vllm::fused_add_rms_norm_kernel \ - <<>>(input.data_ptr(), \ - residual.data_ptr(), \ - weight.data_ptr(), epsilon, \ - num_tokens, hidden_size); \ +#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ + vllm::fused_add_rms_norm_kernel \ + <<>>( \ + input.data_ptr(), input_stride, \ + residual.data_ptr(), weight.data_ptr(), \ + epsilon, num_tokens, hidden_size); \ }); void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size] torch::Tensor& residual, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] double epsilon) { + TORCH_CHECK(weight.scalar_type() == input.scalar_type()); + TORCH_CHECK(input.scalar_type() == residual.scalar_type()); + TORCH_CHECK(residual.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); int hidden_size = input.size(-1); + int64_t input_stride = input.stride(-2); int num_tokens = input.numel() / hidden_size; dim3 grid(num_tokens); @@ -194,9 +222,18 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size] auto inp_ptr = reinterpret_cast(input.data_ptr()); auto res_ptr = reinterpret_cast(residual.data_ptr()); auto wt_ptr = reinterpret_cast(weight.data_ptr()); - bool ptrs_are_aligned = - inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; - if (ptrs_are_aligned && hidden_size % 8 == 0) { + constexpr int vector_width = 8; + constexpr int req_alignment_bytes = + vector_width * 2; // vector_width * sizeof(bfloat16 or float16) (float32 + // falls back to non-vectorized version anyway) + bool ptrs_are_aligned = inp_ptr % req_alignment_bytes == 0 && + res_ptr % req_alignment_bytes == 0 && + wt_ptr % req_alignment_bytes == 0; + bool offsets_are_multiple_of_vector_width = + hidden_size % vector_width == 0 && input_stride % vector_width == 0; + bool batch_invariant_launch = vllm::vllm_is_batch_invariant(); + if (ptrs_are_aligned && offsets_are_multiple_of_vector_width && + !batch_invariant_launch) { LAUNCH_FUSED_ADD_RMS_NORM(8); } else { LAUNCH_FUSED_ADD_RMS_NORM(0); diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu index d595b9e889c8..0f7f034ee180 100644 --- a/csrc/layernorm_quant_kernels.cu +++ b/csrc/layernorm_quant_kernels.cu @@ -6,39 +6,49 @@ */ #include "type_convert.cuh" -#include "quantization/fp8/common.cuh" +#include "quantization/w8a8/fp8/common.cuh" #include "dispatch_utils.h" +#include "cub_helpers.h" +#include "core/batch_invariant.hpp" +#include "quantization/vectorization_utils.cuh" #include #include -#ifndef 
USE_ROCM - #include -#else - #include -#endif - namespace vllm { // TODO(woosuk): Further optimize this kernel. template __global__ void rms_norm_static_fp8_quant_kernel( - fp8_type* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] + fp8_type* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const int input_stride, const scalar_t* __restrict__ weight, // [hidden_size] const float* __restrict__ scale, // [1] const float epsilon, const int num_tokens, const int hidden_size) { __shared__ float s_variance; float variance = 0.0f; - for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - const float x = (float)input[blockIdx.x * hidden_size + idx]; + const scalar_t* input_row = input + blockIdx.x * input_stride; + + constexpr int VEC_SIZE = 8; + auto vec_op = [&variance](const vec_n_t& vec) { +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + float x = static_cast(vec.val[i]); + variance += x * x; + } + }; + auto scalar_op = [&variance](const scalar_t& val) { + float x = static_cast(val); variance += x * x; - } + }; + vllm::vectorize_read_with_alignment( + input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op); using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); @@ -49,7 +59,7 @@ __global__ void rms_norm_static_fp8_quant_kernel( float const scale_inv = 1.0f / *scale; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - float x = (float)input[blockIdx.x * hidden_size + idx]; + float x = (float)input[blockIdx.x * input_stride + idx]; float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; out[blockIdx.x * hidden_size + idx] = scaled_fp8_conversion(out_norm, scale_inv); @@ -63,8 +73,9 @@ __global__ void rms_norm_static_fp8_quant_kernel( template __global__ std::enable_if_t<(width > 0) && _typeConvert::exists> fused_add_rms_norm_static_fp8_quant_kernel( - fp8_type* __restrict__ out, // [..., hidden_size] - scalar_t* __restrict__ input, // [..., hidden_size] + fp8_type* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + const int input_stride, scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] const float* __restrict__ scale, // [1] @@ -74,6 +85,7 @@ fused_add_rms_norm_static_fp8_quant_kernel( static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); const int vec_hidden_size = hidden_size / width; + const int vec_input_stride = input_stride / width; __shared__ float s_variance; float variance = 0.0f; /* These and the argument pointers are all declared `restrict` as they are @@ -87,8 +99,9 @@ fused_add_rms_norm_static_fp8_quant_kernel( reinterpret_cast*>(weight); for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int stride_id = blockIdx.x * vec_input_stride + idx; int id = blockIdx.x * vec_hidden_size + idx; - _f16Vec temp = input_v[id]; + _f16Vec temp = input_v[stride_id]; temp += residual_v[id]; variance += temp.sum_squares(); residual_v[id] = temp; @@ -96,7 +109,7 @@ fused_add_rms_norm_static_fp8_quant_kernel( using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = 
BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); @@ -125,8 +138,9 @@ fused_add_rms_norm_static_fp8_quant_kernel( template __global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> fused_add_rms_norm_static_fp8_quant_kernel( - fp8_type* __restrict__ out, // [..., hidden_size] - scalar_t* __restrict__ input, // [..., hidden_size] + fp8_type* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + const int input_stride, scalar_t* __restrict__ residual, // [..., hidden_size] const scalar_t* __restrict__ weight, // [hidden_size] const float* __restrict__ scale, // [1] @@ -135,7 +149,7 @@ fused_add_rms_norm_static_fp8_quant_kernel( float variance = 0.0f; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - scalar_t z = input[blockIdx.x * hidden_size + idx]; + scalar_t z = input[blockIdx.x * input_stride + idx]; z += residual[blockIdx.x * hidden_size + idx]; float x = (float)z; variance += x * x; @@ -144,7 +158,7 @@ fused_add_rms_norm_static_fp8_quant_kernel( using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage reduceStore; - variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + variance = BlockReduce(reduceStore).Reduce(variance, CubAddOp{}, blockDim.x); if (threadIdx.x == 0) { s_variance = rsqrtf(variance / hidden_size + epsilon); @@ -169,7 +183,9 @@ void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] torch::Tensor& scale, // [1] double epsilon) { + TORCH_CHECK(out.is_contiguous()); int hidden_size = input.size(-1); + int input_stride = input.stride(-2); int num_tokens = input.numel() / hidden_size; dim3 grid(num_tokens); @@ -183,8 +199,9 @@ void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] vllm::rms_norm_static_fp8_quant_kernel <<>>( out.data_ptr(), input.data_ptr(), - weight.data_ptr(), scale.data_ptr(), - epsilon, num_tokens, hidden_size); + input_stride, weight.data_ptr(), + scale.data_ptr(), epsilon, num_tokens, + hidden_size); }); }); } @@ -198,7 +215,7 @@ void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] width, fp8_t> \ <<>>( \ out.data_ptr(), input.data_ptr(), \ - residual.data_ptr(), \ + input_stride, residual.data_ptr(), \ weight.data_ptr(), scale.data_ptr(), \ epsilon, num_tokens, hidden_size); \ }); \ @@ -210,7 +227,12 @@ void fused_add_rms_norm_static_fp8_quant( torch::Tensor& weight, // [hidden_size] torch::Tensor& scale, // [1] double epsilon) { + TORCH_CHECK(out.is_contiguous()); + TORCH_CHECK(residual.is_contiguous()); + TORCH_CHECK(residual.scalar_type() == input.scalar_type()); + TORCH_CHECK(weight.scalar_type() == input.scalar_type()); int hidden_size = input.size(-1); + int input_stride = input.stride(-2); int num_tokens = input.numel() / hidden_size; dim3 grid(num_tokens); @@ -234,7 +256,9 @@ void fused_add_rms_norm_static_fp8_quant( auto wt_ptr = reinterpret_cast(weight.data_ptr()); bool ptrs_are_aligned = inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; - if (ptrs_are_aligned && hidden_size % 8 == 0) { + bool batch_invariant_launch = vllm::vllm_is_batch_invariant(); + if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 && + !batch_invariant_launch) { LAUNCH_FUSED_ADD_RMS_NORM(8); } else { LAUNCH_FUSED_ADD_RMS_NORM(0); diff --git 
a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h index 563d2fe4ef65..13c6178941cf 100644 --- a/csrc/mamba/mamba_ssm/selective_scan.h +++ b/csrc/mamba/mamba_ssm/selective_scan.h @@ -45,6 +45,9 @@ struct SSMParamsBase { index_t out_d_stride; index_t out_z_batch_stride; index_t out_z_d_stride; + index_t ssm_states_batch_stride; + index_t ssm_states_dim_stride; + index_t ssm_states_dstate_stride; // Common data pointers. void *__restrict__ A_ptr; diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index 5766fbab4e87..d534e138d26d 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -27,11 +27,12 @@ template + bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_, typename state_t_> struct Selective_Scan_fwd_kernel_traits { static_assert(kNItems_ % 4 == 0); using input_t = input_t_; using weight_t = weight_t_; + using state_t = state_t_; static constexpr int kNThreads = kNThreads_; // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy. static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3; @@ -132,8 +133,10 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { input_t *Bvar = reinterpret_cast(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride; weight_t *C = reinterpret_cast(params.C_ptr) + dim_id * kNRows * params.C_d_stride; input_t *Cvar = reinterpret_cast(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride; - input_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate; - + typename Ktraits::state_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + + cache_index * params.ssm_states_batch_stride + + dim_id * kNRows * params.ssm_states_dim_stride; + float D_val[kNRows] = {0}; if (params.D_ptr != nullptr) { #pragma unroll @@ -248,7 +251,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { } // Initialize running total - scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx]): 0.0); + scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx * params.ssm_states_dstate_stride]): 0.0); SSMScanPrefixCallbackOp prefix_op(running_prefix); typename Ktraits::BlockScanT(smem_scan).InclusiveScan( @@ -259,7 +262,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { if (threadIdx.x == 0) { smem_running_prefix[state_idx] = prefix_op.running_prefix; if (chunk == n_chunks - 1) { - ssm_states[state_idx] = input_t(prefix_op.running_prefix.y); + ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y); } } #pragma unroll @@ -308,7 +311,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { } } -template +template void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block // processing 1 row. 
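With the new state_t template parameter threaded through the launch path, the cached ssm_states no longer have to share the activation dtype; the dispatch macro further down allows a float32 state for half and bfloat16 inputs. A small sketch of the accepted (input, state) dtype pairs, mirroring that macro and the relaxed TORCH_CHECK (the helper name is illustrative):

import torch

# input dtype -> allowed ssm_states dtypes, mirroring
# DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16 below.
ALLOWED_STATE_DTYPES = {
    torch.float16:  {torch.float16, torch.float32},
    torch.bfloat16: {torch.bfloat16, torch.float32},
    torch.float32:  {torch.float32},
}

def check_selective_scan_dtypes(u: torch.Tensor, ssm_states: torch.Tensor) -> None:
    allowed = ALLOWED_STATE_DTYPES[u.dtype]
    if ssm_states.dtype not in allowed:
        raise TypeError(
            f"ssm_states dtype {ssm_states.dtype} not supported for input "
            f"{u.dtype}; expected one of {allowed}")

# ok: bf16 activations with an fp32 carried state
check_selective_scan_dtypes(torch.empty(2, 4, dtype=torch.bfloat16),
                            torch.empty(2, 4, 16, dtype=torch.float32))
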
@@ -319,7 +322,7 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] { BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] { BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] { - using Ktraits = Selective_Scan_fwd_kernel_traits; + using Ktraits = Selective_Scan_fwd_kernel_traits; constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t); dim3 grid(params.batch, params.dim / kNRows); auto kernel = &selective_scan_fwd_kernel; @@ -339,59 +342,78 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { }); } -template +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream) { #ifndef USE_ROCM if (params.seqlen <= 128) { - selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 256) { - selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 512) { - selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<32, 16, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 1024) { - selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream); } else { - selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream); } #else if (params.seqlen <= 256) { - selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 512) { - selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 1024) { - selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream); } else { - selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream); } #endif } -template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); -template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); -template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") -#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) \ +#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, STYPE, NAME, ...) 
\ if (ITYPE == at::ScalarType::Half) { \ using input_t = at::Half; \ using weight_t = float; \ - __VA_ARGS__(); \ + if (STYPE == at::ScalarType::Half) { \ + using state_t = at::Half; \ + __VA_ARGS__(); \ + } else if (STYPE == at::ScalarType::Float) { \ + using state_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \ + } \ } else if (ITYPE == at::ScalarType::BFloat16) { \ using input_t = at::BFloat16; \ using weight_t = float; \ - __VA_ARGS__(); \ + if (STYPE == at::ScalarType::BFloat16) { \ + using state_t = at::BFloat16; \ + __VA_ARGS__(); \ + } else if (STYPE == at::ScalarType::Float) { \ + using state_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \ + } \ } else if (ITYPE == at::ScalarType::Float) { \ using input_t = float; \ using weight_t = float; \ + using state_t = float; \ __VA_ARGS__(); \ } else { \ AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \ } -template +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); void set_ssm_params_fwd(SSMParamsBase ¶ms, @@ -481,6 +503,10 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, params.out_batch_stride = out.stride(1); params.out_d_stride = out.stride(0); + params.ssm_states_batch_stride = ssm_states.stride(0); + params.ssm_states_dim_stride = ssm_states.stride(1); + params.ssm_states_dstate_stride = ssm_states.stride(2); + } else{ if (!is_variable_B) { @@ -509,6 +535,10 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, } params.out_batch_stride = out.stride(0); params.out_d_stride = out.stride(1); + + params.ssm_states_batch_stride = ssm_states.stride(0); + params.ssm_states_dim_stride = ssm_states.stride(1); + params.ssm_states_dstate_stride = ssm_states.stride(2); } } @@ -638,7 +668,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout at::Tensor out = delta; - TORCH_CHECK(ssm_states.scalar_type() == input_type); + // ssm_states can now be either the same as input_type or float32 + auto state_type = ssm_states.scalar_type(); + TORCH_CHECK(state_type == input_type || state_type == at::ScalarType::Float); TORCH_CHECK(ssm_states.is_cuda()); TORCH_CHECK(ssm_states.stride(-1) == 1); @@ -660,7 +692,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, const at::cuda::OptionalCUDAGuard device_guard(device_of(u)); auto stream = at::cuda::getCurrentCUDAStream().stream(); - DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] { - selective_scan_fwd_cuda(params, stream); + DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), ssm_states.scalar_type(), "selective_scan_fwd", [&] { + selective_scan_fwd_cuda(params, stream); }); } diff --git a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp new file mode 100644 index 000000000000..1d06fc6b5b0a --- /dev/null +++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp @@ -0,0 +1,156 @@ +#include +#include +#include + +// _dyn_quant_matmul_4bit is only available on AArch64. 
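Before the AArch64-only matmul is reached, dynamic_4bit_int_moe_cpu groups the flattened (token, k) routing assignments by expert with a counting sort (the counts / offsets / cursor arrays below). A rough Python sketch of that bucketing step follows; it is illustrative only, and unlike the C++ (which keeps token ids and gate values in separate flat arrays) it stores (token, k) pairs for brevity.

def bucket_by_expert(topk_ids, num_experts):
    # topk_ids: nested list of shape [T][K]; returns per-expert offsets and the
    # flat (token, k) positions grouped by expert, preserving original order.
    flat = [(t, k, e) for t, row in enumerate(topk_ids) for k, e in enumerate(row)]
    counts = [0] * num_experts
    for _, _, e in flat:
        counts[e] += 1
    offsets = [0] * (num_experts + 1)
    for e in range(num_experts):
        offsets[e + 1] = offsets[e] + counts[e]
    buckets = [None] * offsets[num_experts]
    cursor = [0] * num_experts
    for t, k, e in flat:
        buckets[offsets[e] + cursor[e]] = (t, k)
        cursor[e] += 1
    return offsets, buckets

offsets, buckets = bucket_by_expert([[0, 2], [2, 1], [0, 0]], num_experts=3)
# offsets == [0, 3, 4, 6]
# buckets == [(0, 0), (2, 0), (2, 1), (1, 1), (0, 1), (1, 0)]
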
+#if defined(__aarch64__) + #include +#endif + +inline torch::Tensor mm(const torch::Tensor& a, const torch::Tensor& packed_w, + int64_t group_size_eff, int64_t in_features, + int64_t out_features) { +#if defined(__aarch64__) + return at::_ops::_dyn_quant_matmul_4bit::call(a, packed_w, group_size_eff, + in_features, out_features); +#else + TORCH_CHECK(false, + "dynamic 4-bit int MoE path requires AArch64 (ARM64); " + "_dyn_quant_matmul_4bit is unavailable on this architecture"); + return {}; +#endif +} + +enum ActivationKind : int64_t { + SwiGLU_Gu = 0, // act = SiLU(g) * u + SwiGLUOAI = 1, // act = SiLU(u) * g + SiLU = 2 // SiLU +}; + +torch::Tensor dynamic_4bit_int_moe_cpu( + torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights, + torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I, + int64_t I2, int64_t group_size, bool apply_router_weight_on_input, + int64_t activation_kind) { + TORCH_CHECK(x.dim() == 2, "x must be 2D"); + TORCH_CHECK(topk_ids.dim() == 2 && topk_weights.dim() == 2, + "topk tensors must be [T, K]"); + TORCH_CHECK( + w13_packed.size(0) == w2_packed.size(0), + "w13_packed and w2_packed must have same number of experts in dim 0"); + TORCH_CHECK(I2 == 2 * I, "I2 must equal 2*I"); + + const int64_t T = x.size(0); + const int64_t K = topk_ids.size(1); + const int64_t E = w13_packed.size(0); + const int64_t N = T * K; + + auto x_c = x.contiguous(); + auto ids_c = topk_ids.contiguous(); + auto gates_c = topk_weights.to(at::kFloat).contiguous(); + + // bucketing tokens -> experts + c10::SmallVector counts( + E, 0); // Small vector uses stack allocation + { + const auto* ids_ptr = ids_c.data_ptr(); + for (int64_t i = 0; i < N; ++i) { + const int64_t e_id = ids_ptr[i]; + TORCH_CHECK(0 <= e_id && e_id < E, "expert id out of range"); + counts[e_id]++; + } + } + c10::SmallVector offsets(E + 1, 0); // ( E +1 ) + for (int64_t e = 0; e < E; ++e) offsets[e + 1] = offsets[e] + counts[e]; + + auto expert_tokens = at::empty({offsets[E]}, ids_c.options()); + auto expert_gates = at::empty({offsets[E]}, gates_c.options()); + { + c10::SmallVector cursor(E, 0); + const auto* ids_ptr = ids_c.data_ptr(); + const auto* gts_ptr = gates_c.data_ptr(); + auto* tok_ptr = expert_tokens.data_ptr(); + auto* gate_ptr = expert_gates.data_ptr(); + + for (int64_t t = 0; t < T; ++t) { + const int64_t base = t * K; + for (int64_t k = 0; k < K; ++k) { + const int64_t idx = base + k; + const int64_t e = ids_ptr[idx]; + const int64_t p = offsets[e] + (cursor[e]++); + tok_ptr[p] = t; + gate_ptr[p] = gts_ptr[idx]; + } + } + } + + const int64_t g_eff_13 = (group_size != -1) ? group_size : H; + const int64_t g_eff_2 = (group_size != -1) ? 
group_size : I; + + // Per-expert outputs filled in parallel + std::vector y_list(E); + y_list.resize(E); + + at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) { + for (int64_t e = e_begin; e < e_end; ++e) { + const int64_t te = counts[e]; + if (te == 0) { + y_list[e] = at::empty({0, H}, x_c.options()); + continue; + } + + const int64_t start = offsets[e]; + + auto sel_tokens = + expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); + auto gates_e = + expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); + + auto x_e = x_c.index_select(/*dim=*/0, sel_tokens); + + if (apply_router_weight_on_input) { + x_e = x_e.mul(gates_e.unsqueeze(1)); + } + + auto w13_e = w13_packed.select(/*dim=*/0, e); + auto w2_e = w2_packed.select(/*dim=*/0, e); + + // W13 + auto y13 = + mm(x_e, w13_e, g_eff_13, /*in_features=*/H, /*out_features=*/I2); + + auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I); + auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I); + + torch::Tensor act; + if (activation_kind == ActivationKind::SwiGLUOAI) { // SwiGLUOAI + constexpr double kAlpha = 1.702; // GPT-OSS default + constexpr double kLimit = 7.0; // GPT-OSS default + auto gate_c = at::clamp_max(g_part, kLimit); + auto up_c = at::clamp(u_part, -kLimit, kLimit); + auto glu = gate_c.mul(at::sigmoid(gate_c.mul(kAlpha))); + act = up_c.add(1.0).mul(glu); + } else { // SiLU , SwiGLU_GU, vLLM maps silu to SiluAndMul() + act = at::silu(g_part).mul(u_part); + } + + // W2 + auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H); + + if (!apply_router_weight_on_input) { + y = y.mul(gates_e.unsqueeze(1)); + } + + // Store per-expert result + y_list[e] = y; + } + }); + + // Concatenate all expert outputs to match expert_tokens order + auto Y_all = at::cat(y_list, /*dim=*/0); + auto out = at::zeros({T, H}, x.options()); + out = + at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all); + + return out; +} diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu new file mode 100644 index 000000000000..c93f9d54d780 --- /dev/null +++ b/csrc/moe/grouped_topk_kernels.cu @@ -0,0 +1,770 @@ +/* + * Adapted from + * https://github.com/NVIDIA/TensorRT-LLM/blob/v0.21.0/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu + * Copyright (c) 2025, The vLLM team. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +namespace cg = cooperative_groups; + +namespace vllm { +namespace moe { + +constexpr unsigned FULL_WARP_MASK = 0xffffffff; +constexpr int32_t WARP_SIZE = 32; +constexpr int32_t BLOCK_SIZE = 512; +constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE; + +namespace warp_topk { + +template +__host__ __device__ constexpr T round_up_to_multiple_of(T len) { + if (len == 0) { + return 0; + } + return ((len - 1) / size + 1) * size; +} + +template +constexpr __host__ __device__ bool isPowerOf2(T v) { + return (v && !(v & (v - 1))); +} + +template +__forceinline__ __device__ bool is_better_than(T val, T baseline) { + return (val > baseline && greater) || (val < baseline && !greater); +} + +template +__forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index, + idxT baseline_index) { + bool res = (val > baseline && greater) || (val < baseline && !greater); + if (val == baseline) { + res = (index < baseline_index && greater) || + (index < baseline_index && !greater); + } + return res; +} + +template +int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) { + int64_t cache_topk = (sizeof(T) + sizeof(idxT)) * num_of_warp * k; + int64_t n = std::max(num_of_warp / 2 * k, num_of_warp * WARP_SIZE); + return max(cache_topk, + round_up_to_multiple_of<256>(n * sizeof(T)) + n * sizeof(idxT)); +} + +template +struct BitonicMerge { + // input should be a bitonic sequence, and sort it to be a monotonic sequence + __device__ static void merge(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + static_assert(isPowerOf2(size)); + static_assert(size >= 2 * WARP_SIZE); + constexpr int arr_len = size / WARP_SIZE; + + constexpr int stride = arr_len / 2; + for (int i = 0; i < stride; ++i) { + int const other_i = i + stride; + T& val = val_arr[i]; + T& other_val = val_arr[other_i]; + bool is_better; + if constexpr (is_stable) { + is_better = is_better_than(val, other_val, idx_arr[i], + idx_arr[other_i]); + } else { + is_better = is_better_than(val, other_val); + } + + if (is_better) { + T tmp = val; + val = other_val; + other_val = tmp; + + idxT tmp2 = idx_arr[i]; + idx_arr[i] = idx_arr[other_i]; + idx_arr[other_i] = tmp2; + } + } + + BitonicMerge::merge( + val_arr, idx_arr); + BitonicMerge::merge( + val_arr + arr_len / 2, idx_arr + arr_len / 2); + } +}; + +template +struct BitonicSort { + __device__ static void sort(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + static_assert(isPowerOf2(size)); + static_assert(size >= 2 * WARP_SIZE); + constexpr int arr_len = size / WARP_SIZE; + + BitonicSort::sort(val_arr, idx_arr); + BitonicSort::sort( + val_arr + arr_len / 2, idx_arr + arr_len / 2); + BitonicMerge::merge( + val_arr, idx_arr); + } +}; + +template +struct BitonicSort<32, ascending, T, idxT, is_stable> { + __device__ static void sort(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + int const lane = threadIdx.x % WARP_SIZE; + + // ascending doesn't matter before merging since all we need is a bitonic + // sequence + for (int stage = 0; stage < 4; ++stage) { + for (int stride = (1 << stage); stride > 0; stride /= 2) { + bool reverse = (lane >> stage) & 2; + bool is_second = lane & stride; + + T other = __shfl_xor_sync(FULL_WARP_MASK, *val_arr, stride); + idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, *idx_arr, stride); + + bool is_better; + if constexpr (is_stable) { + if constexpr (ascending) { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr < other_idx))) != 
+ (reverse != is_second); + } else { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr > other_idx))) != + (reverse != is_second); + } + } else { + is_better = (*val_arr != other && + (*val_arr > other) != (reverse != is_second)); + } + if (is_better) { + *val_arr = other; + *idx_arr = other_idx; + } + } + } + + BitonicMerge<32, ascending, ascending, T, idxT, is_stable>::merge(val_arr, + idx_arr); + } +}; + +template +struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> { + __device__ static void merge(T* __restrict__ val_arr, + idxT* __restrict__ idx_arr) { + int const lane = threadIdx.x % WARP_SIZE; + for (int stride = WARP_SIZE / 2; stride > 0; stride /= 2) { + bool is_second = lane & stride; + T& val = *val_arr; + T other = __shfl_xor_sync(FULL_WARP_MASK, val, stride); + idxT& idx = *idx_arr; + idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, idx, stride); + + bool is_better; + if constexpr (is_stable) { + if constexpr (ascending) { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr < other_idx))) == + (reverse != is_second); // for min + } else { + is_better = ((*val_arr > other) || + ((*val_arr == other) && (*idx_arr > other_idx))) == + (reverse != is_second); // for max + } + } else { + is_better = + (val != other && ((val > other) == (ascending != is_second))); + } + + if (is_better) { + val = other; + idx = other_idx; + } + } + } +}; + +template +class WarpSort { + public: + __device__ WarpSort(idxT k, T dummy) + : lane_(threadIdx.x % WARP_SIZE), k_(k), dummy_(dummy) { + static_assert(capacity >= WARP_SIZE && isPowerOf2(capacity)); + + for (int i = 0; i < max_arr_len_; ++i) { + val_arr_[i] = dummy_; + idx_arr_[i] = 0; + } + } + + // load and merge k sorted values + __device__ void load_sorted(T const* __restrict__ in, + idxT const* __restrict__ in_idx, idxT start) { + idxT idx = start + WARP_SIZE - 1 - lane_; + for (int i = max_arr_len_ - 1; i >= 0; --i, idx += WARP_SIZE) { + if (idx < start + k_) { + T t = in[idx]; + bool is_better; + if constexpr (is_stable) { + is_better = + is_better_than(t, val_arr_[i], in_idx[idx], idx_arr_[i]); + } else { + is_better = is_better_than(t, val_arr_[i]); + } + if (is_better) { + val_arr_[i] = t; + idx_arr_[i] = in_idx[idx]; + } + } + } + + BitonicMerge::merge( + val_arr_, idx_arr_); + } + + __device__ void dump(T* __restrict__ out, idxT* __restrict__ out_idx) const { + for (int i = 0; i < max_arr_len_; ++i) { + idxT out_i = i * WARP_SIZE + lane_; + if (out_i < k_) { + out[out_i] = val_arr_[i]; + out_idx[out_i] = idx_arr_[i]; + } + } + } + + __device__ void dumpIdx(idxT* __restrict__ out_idx) const { + for (int i = 0; i < max_arr_len_; ++i) { + idxT out_i = i * WARP_SIZE + lane_; + if (out_i < k_) { + out_idx[out_i] = idx_arr_[i]; + } + } + } + + protected: + static constexpr int max_arr_len_ = capacity / WARP_SIZE; + + T val_arr_[max_arr_len_]; + idxT idx_arr_[max_arr_len_]; + + int const lane_; + idxT const k_; + T const dummy_; + +}; // end class WarpSort + +template +class WarpSelect : public WarpSort { + public: + __device__ WarpSelect(idxT k, T dummy) + : WarpSort(k, dummy), + k_th_(dummy), + k_th_lane_((k - 1) % WARP_SIZE) { + extern __shared__ char smem_buf[]; // extern __shared__ T smem_buf[]; + + int const num_of_warp = blockDim.x / WARP_SIZE; + int const warp_id = threadIdx.x / WARP_SIZE; + val_smem_ = reinterpret_cast(smem_buf); + val_smem_ += warp_id * WARP_SIZE; + idx_smem_ = reinterpret_cast( + smem_buf + + round_up_to_multiple_of<256>(num_of_warp * sizeof(T) * WARP_SIZE)); + 
idx_smem_ += warp_id * WARP_SIZE; + } + + __device__ void add(T const* in, idxT start, idxT end) { + idxT const end_for_fullwarp = + round_up_to_multiple_of(end - start) + start; + for (idxT i = start + lane_; i < end_for_fullwarp; i += WARP_SIZE) { + T val = (i < end) ? in[i] : dummy_; + add(val, i); + } + } + + __device__ void add(T val, idxT idx) { + bool do_add; + if constexpr (is_stable) { + do_add = is_better_than(val, k_th_, idx, k_th_idx_); + } else { + do_add = is_better_than(val, k_th_); + } + + uint32_t mask = __ballot_sync(FULL_WARP_MASK, do_add); + if (mask == 0) { + return; + } + + int pos = smem_buf_len_ + __popc(mask & ((0x1u << lane_) - 1)); + if (do_add && pos < WARP_SIZE) { + val_smem_[pos] = val; + idx_smem_[pos] = idx; + do_add = false; + } + smem_buf_len_ += __popc(mask); + if (smem_buf_len_ >= WARP_SIZE) { + __syncwarp(); + merge_buf_(val_smem_[lane_], idx_smem_[lane_]); + smem_buf_len_ -= WARP_SIZE; + } + if (do_add) { + pos -= WARP_SIZE; + val_smem_[pos] = val; + idx_smem_[pos] = idx; + } + __syncwarp(); + } + + __device__ void done() { + if (smem_buf_len_) { + T val = (lane_ < smem_buf_len_) ? val_smem_[lane_] : dummy_; + idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0; + merge_buf_(val, idx); + } + + // after done(), smem is used for merging results among warps + __syncthreads(); + } + + private: + __device__ void set_k_th_() { + k_th_ = __shfl_sync(FULL_WARP_MASK, val_arr_[max_arr_len_ - 1], k_th_lane_); + if constexpr (is_stable) { + k_th_idx_ = + __shfl_sync(FULL_WARP_MASK, idx_arr_[max_arr_len_ - 1], k_th_lane_); + } + } + + __device__ void merge_buf_(T val, idxT idx) { + BitonicSort::sort(&val, &idx); + + T& old = val_arr_[max_arr_len_ - 1]; + + bool is_better; + if constexpr (is_stable) { + is_better = + is_better_than(val, old, idx, idx_arr_[max_arr_len_ - 1]); + } else { + is_better = is_better_than(val, old); + } + + if (is_better) { + old = val; + idx_arr_[max_arr_len_ - 1] = idx; + } + + BitonicMerge::merge( + val_arr_, idx_arr_); + + set_k_th_(); + } + + using WarpSort::max_arr_len_; + using WarpSort::val_arr_; + using WarpSort::idx_arr_; + using WarpSort::lane_; + using WarpSort::k_; + using WarpSort::dummy_; + + T* val_smem_; + idxT* idx_smem_; + int smem_buf_len_ = 0; + + T k_th_; + idxT k_th_idx_; + int const k_th_lane_; +}; // end class WarpSelect +} // namespace warp_topk + +template +__device__ inline T_OUT cuda_cast(T_IN val) { + return val; +} + +template <> +__device__ inline float cuda_cast(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template +__device__ inline T neg_inf() { + // cuda::std::numeric_limits::infinity() returns `0` for [T=bf16 or fp16] + // so we need to cast from fp32 + return cuda_cast(-cuda::std::numeric_limits::infinity()); +} + +template +__device__ inline bool is_finite(const T val) { +#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800) + return cuda::std::isfinite(val); +#else + return isfinite(cuda_cast(val)); +#endif +} + +template +__device__ void topk_with_k2(T* output, T const* input, + cg::thread_block_tile<32> const& tile, + int32_t const lane_id, + int const num_experts_per_group) { + // Get the top2 per thread + T largest = neg_inf(); + T second_largest = neg_inf(); + + if (num_experts_per_group > WARP_SIZE) { + for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { + T value = input[i]; + if (value > largest) { + second_largest = largest; + largest = value; + } else if (value > second_largest) { + second_largest = value; + } + } + } else { + for (int i 
= lane_id; i < num_experts_per_group; i += WARP_SIZE) { + largest = input[i]; + } + } + + __syncwarp(); // Ensure all threads have valid data before reduction + // Get the top2 warpwise + T max1 = cg::reduce(tile, largest, cg::greater()); + + T max2 = max1; + bool equal_to_max1 = (max1 == largest); + + int count_max1 = __popc(__ballot_sync(FULL_WARP_MASK, equal_to_max1)); + + if (count_max1 == 1) { + largest = (largest == max1) ? second_largest : largest; + max2 = cg::reduce(tile, largest, cg::greater()); + } + + if (lane_id == 0) { + *output = max1 + max2; + } +} + +template +__global__ void topk_with_k2_kernel(T* output, T* input, + int64_t const num_tokens, + int64_t const num_cases, + int64_t const n_group, + int64_t const num_experts_per_group) { + int32_t warp_id = threadIdx.x / WARP_SIZE; + int32_t lane_id = threadIdx.x % WARP_SIZE; + + int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; + if (case_id < num_cases) { + input += case_id * num_experts_per_group; + output += case_id; + + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block); + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.wait;"); +#endif + topk_with_k2(output, input, tile, lane_id, num_experts_per_group); + } +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.launch_dependents;"); +#endif +} + +template +__global__ void group_idx_and_topk_idx_kernel( + T* scores, T const* group_scores, T* topk_values, IdxT* topk_indices, + T* scores_with_bias, int64_t const num_tokens, int64_t const n_group, + int64_t const topk_group, int64_t const topk, int64_t const num_experts, + int64_t const num_experts_per_group, bool renormalize, + double routed_scaling_factor) { + int32_t warp_id = threadIdx.x / WARP_SIZE; + int32_t lane_id = threadIdx.x % WARP_SIZE; + int32_t case_id = + blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id; // one per token + scores_with_bias += case_id * num_experts; + scores += case_id * num_experts; + group_scores += case_id * n_group; + topk_values += case_id * topk; + topk_indices += case_id * topk; + + int32_t align_num_experts_per_group = + warp_topk::round_up_to_multiple_of(num_experts_per_group); + + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block); + + extern __shared__ char smem_buf[]; // NOTE: reuse the shared memory here to + // store the target topk idx + int32_t* s_topk_idx = reinterpret_cast(smem_buf); + T* s_topk_value = + reinterpret_cast(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) + + warp_id * topk; + s_topk_idx += warp_id * topk; + + T value = neg_inf(); + T topk_group_value = neg_inf(); + int32_t num_equalto_topkth_group; + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.wait;"); // I think all prolog can be put before + // acqbulk because it's ptr arithmetic +#endif + + if (case_id < num_tokens) { + // calculate group_idx + int32_t target_num_min = WARP_SIZE - n_group + topk_group; + // The check is necessary to avoid abnormal input + if (lane_id < n_group && is_finite(group_scores[lane_id])) { + value = group_scores[lane_id]; + } + + int count_equal_to_top_value = WARP_SIZE - n_group; + int pre_count_equal_to_top_value = 0; + // Use loop to find the largset top_group + while (count_equal_to_top_value < target_num_min) { + __syncwarp(); // Ensure all threads have valid data before reduction + topk_group_value = cg::reduce(tile, value, 
cg::greater()); + if (value == topk_group_value) { + value = neg_inf(); + } + pre_count_equal_to_top_value = count_equal_to_top_value; + count_equal_to_top_value = + __popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf()))); + } + num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value; + } + __syncthreads(); + + warp_topk::WarpSelect + queue((int32_t)topk, neg_inf()); + + int count_equalto_topkth_group = 0; + bool if_proceed_next_topk = topk_group_value != neg_inf(); + if (case_id < num_tokens && if_proceed_next_topk) { + for (int i_group = 0; i_group < n_group; i_group++) { + if ((group_scores[i_group] > topk_group_value) || + ((group_scores[i_group] == topk_group_value) && + (count_equalto_topkth_group < num_equalto_topkth_group))) { + int32_t offset = i_group * num_experts_per_group; + for (int32_t i = lane_id; i < align_num_experts_per_group; + i += WARP_SIZE) { + T candidates = (i < num_experts_per_group) && + is_finite(scores_with_bias[offset + i]) + ? scores_with_bias[offset + i] + : neg_inf(); + queue.add(candidates, offset + i); + } + if (group_scores[i_group] == topk_group_value) { + count_equalto_topkth_group++; + } + } + } + queue.done(); + __syncwarp(); + // Get the topk_idx + queue.dumpIdx(s_topk_idx); + __syncwarp(); + } + + // Load the valid score value + // Calculate the summation + float topk_sum = 1e-20; + if (case_id < num_tokens && if_proceed_next_topk) { + for (int i = lane_id; + i < warp_topk::round_up_to_multiple_of(topk); + i += WARP_SIZE) { + T value = + i < topk + ? scores[s_topk_idx[i]] + : cuda_cast(0.0f); // Load the valid value of expert + if (i < topk) { + s_topk_value[i] = value; + } + topk_sum += + cg::reduce(tile, cuda_cast(value), cg::plus()); + } + } + + __syncthreads(); + + if (case_id < num_tokens) { + if (if_proceed_next_topk) { + for (int i = lane_id; i < topk; i += WARP_SIZE) { + float value; + if (renormalize) { + value = cuda_cast(s_topk_value[i]) / topk_sum * + routed_scaling_factor; + } else { + value = cuda_cast(s_topk_value[i]) * routed_scaling_factor; + } + topk_indices[i] = s_topk_idx[i]; + topk_values[i] = cuda_cast(value); + } + } else { + for (int i = lane_id; i < topk; i += WARP_SIZE) { + topk_indices[i] = i; + topk_values[i] = cuda_cast(1.0f / topk); + } + } + // Note: when if_proceed_next_topk==false, choose the first 8 experts as the + // default result. 
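For reference, the routing this kernel accelerates can be written in a few lines of eager PyTorch. The sketch below is simplified: the tie handling, the non-finite guards, and the first-k fallback noted just above are omitted, and the function name is invented for illustration.

import torch

def grouped_topk_ref(scores, scores_with_bias, n_group, topk_group, topk,
                     renormalize, routed_scaling_factor):
    # group score = sum of the top-2 biased scores in each group; keep the best
    # topk_group groups, then take topk experts inside them; returned weights
    # come from the unbiased scores.
    T, E = scores_with_bias.shape
    grouped = scores_with_bias.view(T, n_group, E // n_group)
    group_scores = grouped.topk(2, dim=-1).values.sum(dim=-1)        # [T, n_group]
    group_idx = group_scores.topk(topk_group, dim=-1).indices        # [T, topk_group]
    mask = torch.zeros_like(group_scores, dtype=torch.bool).scatter_(1, group_idx, True)
    masked = scores_with_bias.masked_fill(
        ~mask.unsqueeze(-1).expand_as(grouped).reshape(T, E), float("-inf"))
    topk_idx = masked.topk(topk, dim=-1).indices                     # [T, topk]
    vals = scores.gather(1, topk_idx)
    if renormalize:
        vals = vals / vals.sum(dim=-1, keepdim=True).clamp_min(1e-20)
    return vals * routed_scaling_factor, topk_idx

scores = torch.rand(4, 64)
swb = scores + 0.01 * torch.randn(4, 64)
vals, idx = grouped_topk_ref(scores, swb, n_group=8, topk_group=4, topk=8,
                             renormalize=True, routed_scaling_factor=2.5)
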
+ } +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + asm volatile("griddepcontrol.launch_dependents;"); +#endif +} + +template +void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values, + IdxT* topk_indices, T* scores_with_bias, + int64_t const num_tokens, int64_t const num_experts, + int64_t const n_group, int64_t const topk_group, + int64_t const topk, bool const renormalize, + double const routed_scaling_factor, bool enable_pdl = false, + cudaStream_t const stream = 0) { + int64_t num_cases = num_tokens * n_group; + int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1; + auto* kernel_instance1 = &topk_with_k2_kernel; + cudaLaunchConfig_t config; + config.gridDim = topk_with_k2_num_blocks; + config.blockDim = BLOCK_SIZE; + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias, + num_tokens, num_cases, n_group, num_experts / n_group); + + int64_t topk_with_k_group_num_blocks = + (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1; + size_t dynamic_smem_in_bytes = + warp_topk::calc_smem_size_for_block_wide(NUM_WARPS_PER_BLOCK, + topk); + auto* kernel_instance2 = &group_idx_and_topk_idx_kernel; + config.gridDim = topk_with_k_group_num_blocks; + config.blockDim = BLOCK_SIZE; + config.dynamicSmemBytes = dynamic_smem_in_bytes; + config.stream = stream; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores, + topk_values, topk_indices, scores_with_bias, num_tokens, + n_group, topk_group, topk, num_experts, + num_experts / n_group, renormalize, routed_scaling_factor); +} + +#define INSTANTIATE_NOAUX_TC(T, IdxT) \ + template void invokeNoAuxTc( \ + T * scores, T * group_scores, T * topk_values, IdxT * topk_indices, \ + T * scores_with_bias, int64_t const num_tokens, \ + int64_t const num_experts, int64_t const n_group, \ + int64_t const topk_group, int64_t const topk, bool const renormalize, \ + double const routed_scaling_factor, bool enable_pdl, \ + cudaStream_t const stream); + +INSTANTIATE_NOAUX_TC(float, int32_t); +INSTANTIATE_NOAUX_TC(half, int32_t); +INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t); +} // end namespace moe +} // namespace vllm + +std::tuple grouped_topk( + torch::Tensor const& scores, torch::Tensor const& scores_with_bias, + int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize, + double routed_scaling_factor) { + auto data_type = scores_with_bias.scalar_type(); + auto input_size = scores_with_bias.sizes(); + int64_t num_tokens = input_size[0]; + int64_t num_experts = input_size[1]; + TORCH_CHECK(input_size.size() == 2, "scores_with_bias must be a 2D Tensor"); + TORCH_CHECK(num_experts % n_group == 0, + "num_experts should be divisible by n_group"); + TORCH_CHECK(n_group <= 32, + "n_group should be smaller than or equal to 32 for now"); + TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now"); + + torch::Tensor group_scores = torch::empty( + {num_tokens, n_group}, torch::dtype(data_type).device(torch::kCUDA)); + torch::Tensor topk_values = torch::empty( + {num_tokens, topk}, torch::dtype(data_type).device(torch::kCUDA)); + 
torch::Tensor topk_indices = torch::empty( + {num_tokens, topk}, torch::dtype(torch::kInt32).device(torch::kCUDA)); + + auto stream = c10::cuda::getCurrentCUDAStream(scores_with_bias.get_device()); + + switch (data_type) { + case torch::kFloat16: + // Handle Float16 + vllm::moe::invokeNoAuxTc( + reinterpret_cast(scores.mutable_data_ptr()), + reinterpret_cast(group_scores.mutable_data_ptr()), + reinterpret_cast(topk_values.mutable_data_ptr()), + reinterpret_cast(topk_indices.mutable_data_ptr()), + reinterpret_cast(scores_with_bias.data_ptr()), num_tokens, + num_experts, n_group, topk_group, topk, renormalize, + routed_scaling_factor, false, stream); + break; + case torch::kFloat32: + // Handle Float32 + vllm::moe::invokeNoAuxTc( + reinterpret_cast(scores.mutable_data_ptr()), + reinterpret_cast(group_scores.mutable_data_ptr()), + reinterpret_cast(topk_values.mutable_data_ptr()), + reinterpret_cast(topk_indices.mutable_data_ptr()), + reinterpret_cast(scores_with_bias.data_ptr()), num_tokens, + num_experts, n_group, topk_group, topk, renormalize, + routed_scaling_factor, false, stream); + break; + case torch::kBFloat16: + // Handle BFloat16 + vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>( + reinterpret_cast<__nv_bfloat16*>(scores.mutable_data_ptr()), + reinterpret_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()), + reinterpret_cast<__nv_bfloat16*>(topk_values.mutable_data_ptr()), + reinterpret_cast(topk_indices.mutable_data_ptr()), + reinterpret_cast<__nv_bfloat16*>(scores_with_bias.data_ptr()), + num_tokens, num_experts, n_group, topk_group, topk, renormalize, + routed_scaling_factor, false, stream); + break; + default: + // Handle other data types + throw std::invalid_argument( + "Invalid dtype, only supports float16, float32, and bfloat16"); + break; + } + return {topk_values, topk_indices}; +} diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py index 49f33718a21e..be5b68cc53e6 100644 --- a/csrc/moe/marlin_moe_wna16/generate_kernels.py +++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -17,24 +17,30 @@ namespace MARLIN_NAMESPACE_NAME { """.strip() -TEMPLATE = ("template __global__ void Marlin<" - "{{scalar_t}}, " - "{{w_type_id}}, " - "{{threads}}, " - "{{thread_m_blocks}}, " - "{{thread_n_blocks}}, " - "{{thread_k_blocks}}, " - "{{'true' if m_block_size_8 else 'false'}}, " - "{{stages}}, " - "{{group_blocks}}, " - "{{'true' if is_zp_float else 'false'}}>" - "( MARLIN_KERNEL_PARAMS );") +TEMPLATE = ( + "template __global__ void Marlin<" + "{{scalar_t}}, " + "{{w_type_id}}, " + "{{s_type_id}}, " + "{{threads}}, " + "{{thread_m_blocks}}, " + "{{thread_n_blocks}}, " + "{{thread_k_blocks}}, " + "{{'true' if m_block_size_8 else 'false'}}, " + "{{stages}}, " + "{{group_blocks}}, " + "{{'true' if is_zp_float else 'false'}}>" + "( MARLIN_KERNEL_PARAMS );" +) # int8 with zero point case (vllm::kU8) is also supported, # we don't add it to reduce wheel size. 
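To see the shape of one instantiation emitted by the generator now that the scale type id is part of the kernel signature, here is a quick jinja2 render of a slightly condensed copy of the TEMPLATE above; the parameter values are chosen for illustration and are not an entry from the real sweep.

import jinja2

TEMPLATE = (
    "template __global__ void Marlin<"
    "{{scalar_t}}, {{w_type_id}}, {{s_type_id}}, {{threads}}, "
    "{{thread_m_blocks}}, {{thread_n_blocks}}, {{thread_k_blocks}}, "
    "{{'true' if m_block_size_8 else 'false'}}, {{stages}}, {{group_blocks}}, "
    "{{'true' if is_zp_float else 'false'}}>( MARLIN_KERNEL_PARAMS );"
)
print(jinja2.Template(TEMPLATE).render(
    scalar_t="half", w_type_id="vllm::kU4B8.id()", s_type_id="vllm::kFloat16.id()",
    threads=256, thread_m_blocks=1, thread_n_blocks=16, thread_k_blocks=4,
    m_block_size_8=False, stages=4, group_blocks=-1, is_zp_float=False))
# -> template __global__ void Marlin<half, vllm::kU4B8.id(), vllm::kFloat16.id(), 256, 1, 16, 4, false, 4, -1, false>( MARLIN_KERNEL_PARAMS );
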
SCALAR_TYPES = [ - "vllm::kU4", "vllm::kU4B8", "vllm::kU8B128", "vllm::kFE4M3fn", - "vllm::kFE2M1f" + "vllm::kU4", + "vllm::kU4B8", + "vllm::kU8B128", + "vllm::kFE4M3fn", + "vllm::kFE2M1f", ] THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)] @@ -57,11 +63,12 @@ def generate_new_kernels(): all_template_str_list = [] for group_blocks, m_blocks, thread_configs in itertools.product( - GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS): - + GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS + ): # act order case only support gptq-int4 and gptq-int8 if group_blocks == 0 and scalar_type not in [ - "vllm::kU4B8", "vllm::kU8B128" + "vllm::kU4B8", + "vllm::kU8B128", ]: continue if thread_configs[2] == 256: @@ -77,6 +84,7 @@ def generate_new_kernels(): if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]: continue # nvfp4 only supports group_size == 16 + # mxfp4 only supports group_size == 32 if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]: continue # other quantization methods don't support group_size = 16 @@ -89,9 +97,22 @@ def generate_new_kernels(): c_dtype = "half" if dtype == "fp16" else "nv_bfloat16" + if scalar_type == "vllm::kFE2M1f" and group_blocks == 1: + s_type = "vllm::kFE4M3fn" + elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2: + s_type = "vllm::kFE8M0fnu" + if dtype == "fp16": + # we cannot safely dequantize e8m0 to fp16, so skip this + continue + elif dtype == "fp16": + s_type = "vllm::kFloat16" + elif dtype == "bf16": + s_type = "vllm::kBFloat16" + template_str = jinja2.Template(TEMPLATE).render( scalar_t=c_dtype, w_type_id=scalar_type + ".id()", + s_type_id=s_type + ".id()", threads=threads, thread_m_blocks=max(m_blocks, 1), thread_n_blocks=n_blocks, diff --git a/csrc/moe/marlin_moe_wna16/kernel.h b/csrc/moe/marlin_moe_wna16/kernel.h index 537282aba8c8..6190f7ee21ec 100644 --- a/csrc/moe/marlin_moe_wna16/kernel.h +++ b/csrc/moe/marlin_moe_wna16/kernel.h @@ -7,23 +7,25 @@ #include "quantization/gptq_marlin/marlin_dtypes.cuh" #include "core/scalar_type.hpp" -#define MARLIN_KERNEL_PARAMS \ - const int4 *__restrict__ A, const int4 *__restrict__ B, \ - int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ - const int4 *__restrict__ scales_ptr, \ - const uint16_t *__restrict__ scale2_ptr, \ - const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \ - const int32_t *__restrict__ sorted_token_ids_ptr, \ - const int32_t *__restrict__ expert_ids_ptr, \ - const int32_t *__restrict__ num_tokens_past_padded_ptr, \ - const float *__restrict__ topk_weights_ptr, int top_k, \ - bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \ - int prob_n, int prob_k, int *locks, bool use_atomic_add, \ +#define MARLIN_KERNEL_PARAMS \ + const int4 *__restrict__ A, const int4 *__restrict__ B, \ + int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ + const int4 *__restrict__ b_bias_ptr, \ + const int4 *__restrict__ scales_ptr, \ + const uint16_t *__restrict__ scale2_ptr, \ + const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \ + const int32_t *__restrict__ sorted_token_ids_ptr, \ + const int32_t *__restrict__ expert_ids_ptr, \ + const int32_t *__restrict__ num_tokens_past_padded_ptr, \ + const float *__restrict__ topk_weights_ptr, int top_k, \ + bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \ + int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \ bool use_fp32_reduce, int max_shared_mem namespace MARLIN_NAMESPACE_NAME { template ::value) { + static_assert(s_type == vllm::kBFloat16); + } else if 
constexpr (std::is_same::value) { + static_assert(s_type == vllm::kFloat16); + } + constexpr bool has_zp = w_type == vllm::kU4 || w_type == vllm::kU8; constexpr bool is_int_type = w_type == vllm::kU4 || w_type == vllm::kU8 || w_type == vllm::kU4B8 || w_type == vllm::kU8B128; // see comments of dequant.h for more details constexpr bool dequant_skip_flop = - !is_int_type || + w_type == vllm::kFE4M3fn || + w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn || has_zp && !is_zp_float && !std::is_same::value || has_zp && !is_zp_float && !(w_type == vllm::kU8); @@ -365,6 +379,7 @@ __global__ void Marlin( const int zp_expert_stride = is_zp_float ? prob_n * prob_k / group_size / 8 : prob_n * prob_k / group_size / (pack_factor * 4); + const int b_bias_expert_stride = prob_n / 8; // parallel: num valid moe blocks int num_tokens_past_padded = num_tokens_past_padded_ptr[0]; @@ -475,7 +490,7 @@ __global__ void Marlin( for (int i = 0; i < 4; i++) { int idx = tid4 * 4 + i; idx = idx < block_num_valid_tokens ? idx : 0; - if constexpr (w_type == vllm::kFE2M1f) { + if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) { sh_block_topk_weights[idx] = __hmul2( global_scale, Dtype::num2num2(Dtype::float2num( topk_weights_ptr[sh_block_sorted_ids[idx]]))); @@ -513,7 +528,7 @@ __global__ void Marlin( expert_id = expert_ids_ptr[block_id]; } - if constexpr (w_type == vllm::kFE2M1f) { + if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) { uint16_t val = scale2_ptr[expert_id]; global_scale = Dtype::num2num2(*reinterpret_cast(&val)); } @@ -526,6 +541,9 @@ __global__ void Marlin( if constexpr (has_act_order) { g_idx += (expert_id - old_expert_id) * prob_k; } + if (has_bias) { + b_bias_ptr += (expert_id - old_expert_id) * b_bias_expert_stride; + } read_moe_block_data(block_id); }; @@ -721,7 +739,7 @@ __global__ void Marlin( s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4; - s_sh_rd = s_sh_rd * 2 + warp_row % 2; + s_sh_rd = s_sh_rd * 2 + (warp_row / group_blocks) % 2; } else if constexpr (group_blocks != -1) s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + @@ -734,6 +752,18 @@ __global__ void Marlin( s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) % 4; + int bias_sh_rd; + if constexpr (m_block_size_8) { + bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 8; + } else { + bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + } + + int bias_sh_wr = threadIdx.x; + int bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x; + // Zero-points have the same read layout as the scales // (without column-wise case) constexpr int num_col_threads = 8; @@ -793,7 +823,19 @@ __global__ void Marlin( constexpr int sh_b_size = stages * b_sh_stage; int4* sh_b = sh_new; int4* sh_red = sh_new; - int4* sh_g_idx = sh_b + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + + constexpr int sh_size_b_red_min = + (sh_red_size < sh_b_size ? sh_red_size : sh_b_size); + constexpr int sh_size_b_red_max = + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + constexpr int sh_bias_size = (thread_n_blocks * 16 / 8); + constexpr int sh_b_red_bias_size = + sh_size_b_red_max > (sh_size_b_red_min + sh_bias_size) + ? sh_size_b_red_max + : (sh_size_b_red_min + sh_bias_size); + + int4* sh_bias = sh_new + sh_size_b_red_min; + int4* sh_g_idx = sh_new + sh_b_red_bias_size; int4* sh_zp = sh_g_idx + (stages * g_idx_stage); constexpr int sh_s_size = has_act_order ? 
(act_s_max_num_groups * s_sh_stride) : (stages * s_sh_stage); @@ -803,9 +845,9 @@ __global__ void Marlin( static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <= stages * b_sh_stage); int4* sh_a = sh_s + sh_s_size; - constexpr int shm_size_used = - moe_block_size + stages * (g_idx_stage + zp_sh_stage) + sh_s_size + - (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + constexpr int shm_size_used = moe_block_size + + stages * (g_idx_stage + zp_sh_stage) + + sh_s_size + sh_b_red_bias_size; // all remaining shared memory is used to cache A (input) // sh_a_max_row is at least ` stages * 16 * thread_m_blocks ` @@ -816,7 +858,8 @@ __global__ void Marlin( FragA frag_a[2][thread_m_blocks]; I4 frag_b_quant[2][b_thread_vecs]; FragC frag_c[thread_m_blocks][4][2]; - FragS frag_s[2][4]; // No act-order + FragS frag_s[2][4]; // No act-order + FragS frag_bias[2][4]; FragS act_frag_s[2][4][4]; // For act-order int frag_qzp[2][num_ints_per_thread]; // Zero-points FragZP frag_zp; // Zero-points in fp16 @@ -1065,10 +1108,15 @@ __global__ void Marlin( if constexpr (w_type_id != vllm::kFE2M1f.id()) { reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; - } else { + } else if constexpr (group_blocks == 1 || thread_k_blocks > 4) { reinterpret_cast(&frag_s[k % 2])[0] = reinterpret_cast( sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)]; + } else { + reinterpret_cast(&frag_s[k % 2])[0] = + reinterpret_cast( + sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride) + + k % 2]; } } } @@ -1281,9 +1329,9 @@ __global__ void Marlin( int s_quant_0 = reinterpret_cast(frag_s[k2])[0]; int s_quant_1 = reinterpret_cast(frag_s[k2])[1]; - dequant_fp8_scales(s_quant_0, - reinterpret_cast(&frag_s[k2])); - dequant_fp8_scales( + dequant_fp8_scales( + s_quant_0, reinterpret_cast(&frag_s[k2])); + dequant_fp8_scales( s_quant_1, reinterpret_cast(&frag_s[k2]) + 2); } @@ -1566,7 +1614,7 @@ __global__ void Marlin( // Write out the reduce final result in the correct layout. We only actually // reshuffle matrix fragments in this step, the reduction above is performed // in fragment layout. 
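[reviewer note] On the shared-memory layout introduced above: sh_b and sh_red alias the same base pointer, and the new bias tile (sh_bias) is placed at offset min(sh_b_size, sh_red_size), so the combined footprint is max(max(b, red), min(b, red) + bias) rather than their sum. The same arithmetic is mirrored on the host side by tmp_size in get_kernel_cache_size (ops.cu). A small sketch of the formula, with illustrative names:

def b_red_bias_smem_size(sh_b_size, sh_red_size, sh_bias_size):
    # Mirrors sh_b_red_bias_size in kernel.h and tmp_size in
    # get_kernel_cache_size: the bias tile starts right after the smaller
    # of the aliased B / reduction buffers, so the reserved space is
    # max(larger, smaller + bias) instead of a plain sum.
    smaller = min(sh_b_size, sh_red_size)
    larger = max(sh_b_size, sh_red_size)
    return max(larger, smaller + sh_bias_size)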
- auto write_result = [&]() { + auto write_result = [&](bool last) { int c_gl_stride = prob_n / 8; constexpr int c_sh_stride = 2 * thread_n_blocks + 1; int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); @@ -1592,7 +1640,7 @@ __global__ void Marlin( // We first reorder in shared memory to guarantee the most efficient final // global write patterns - auto write = [&](int idx, float c0, float c1, FragS& s) { + auto write = [&](int idx, float c0, float c1, FragS& s, FragS& b_bias) { scalar_t2 res = Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); @@ -1601,14 +1649,27 @@ __global__ void Marlin( if constexpr (!has_act_order && group_blocks == -1 && w_type.size_bits() == 4 && (has_zp && dequant_skip_flop || !has_zp)) { - res = __hmul2(res, s[0]); + scalar_t2 tmp_scale = s[0]; + if constexpr (m_block_size_8) { + tmp_scale = Dtype::num2num2( + reinterpret_cast(&s[0])[(threadIdx.x % 8) / 4]); + } + res = __hmul2(res, tmp_scale); } - if constexpr (w_type == vllm::kFE2M1f) { + if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) { if (!mul_topk_weights) { res = __hmul2(res, global_scale); } } + if (has_bias && last) { + scalar_t2 tmp_bias = b_bias[0]; + if constexpr (m_block_size_8) { + tmp_bias = Dtype::num2num2( + reinterpret_cast(&b_bias[0])[(threadIdx.x % 8) / 4]); + } + res = __hadd2(res, tmp_bias); + } if constexpr (m_block_size_8) { ((scalar_t*)sh_red)[idx] = res.x; @@ -1626,19 +1687,25 @@ __global__ void Marlin( if constexpr (m_block_size_8) { int wr = c_sh_wr + 16 * j; write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1], - frag_s[j / 2][2 * (j % 2) + 0]); + frag_s[j / 2][2 * (j % 2) + 0], + frag_bias[j / 2][2 * (j % 2) + 0]); write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3], - frag_s[j / 2][2 * (j % 2) + 1]); + frag_s[j / 2][2 * (j % 2) + 1], + frag_bias[j / 2][2 * (j % 2) + 1]); } else { int wr = c_sh_wr + 8 * j; write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], - frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0], + frag_bias[j / 2][2 * (j % 2) + 0]); write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], - frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0], + frag_bias[j / 2][2 * (j % 2) + 0]); write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], - frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1], + frag_bias[j / 2][2 * (j % 2) + 1]); write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], - frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1], + frag_bias[j / 2][2 * (j % 2) + 1]); } } c_sh_wr += 16 * (4 * c_sh_stride); @@ -1805,6 +1872,14 @@ __global__ void Marlin( } thread_block_reduce(); + + if (has_bias && last) { + __syncthreads(); + cp_async4_pred(&sh_bias[bias_sh_wr], &b_bias_ptr[bias_gl_rd], + threadIdx.x < 16 * thread_n_blocks / 8); + cp_async_fence(); + } + if constexpr (!has_act_order && group_blocks == -1 && (has_zp && dequant_skip_flop || !has_zp)) { if (w_type.size_bits() == 8 || (last || use_atomic_add)) { @@ -1867,11 +1942,20 @@ __global__ void Marlin( } barrier_release(&locks[locks_off], last); } + + if (has_bias && last) { + cp_async_wait<0>(); + __syncthreads(); + reinterpret_cast(&frag_bias)[0] = sh_bias[bias_sh_rd]; + reinterpret_cast(&frag_bias)[1] = sh_bias[bias_sh_rd + 4]; + __syncthreads(); + } + if (use_atomic_add && slice_count > 1 && slice_idx != 0) 
wait_negative_and_add(&locks[locks_off]); if (last || use_atomic_add) // only the last block in a slice actually writes the result - write_result(); + write_result(last); int old_slice_row = slice_row; slice_row = 0; slice_col_par++; @@ -1904,6 +1988,7 @@ __global__ void Marlin( for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; } + bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x; // Update slice k/n for scales loading if constexpr (has_act_order) { slice_k_start = tb_k * slice_row; diff --git a/csrc/moe/marlin_moe_wna16/ops.cu b/csrc/moe/marlin_moe_wna16/ops.cu index 2cff04f699b0..601e2aa6f991 100644 --- a/csrc/moe/marlin_moe_wna16/ops.cu +++ b/csrc/moe/marlin_moe_wna16/ops.cu @@ -51,8 +51,9 @@ __global__ void permute_cols_kernel( } // namespace marlin torch::Tensor moe_wna16_marlin_gemm( - torch::Tensor& a, std::optional const& c_or_none, - torch::Tensor& b_q_weight, torch::Tensor& b_scales, + torch::Tensor& a, std::optional c_or_none, + torch::Tensor& b_q_weight, + std::optional const& b_bias_or_none, torch::Tensor& b_scales, std::optional const& b_zeros_or_none, std::optional const& g_idx_or_none, std::optional const& perm_or_none, torch::Tensor& workspace, @@ -212,7 +213,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8, // Get B size int tb_k = th_config.thread_k; int tb_n = th_config.thread_n; - int tb_m = thread_m_blocks * (m_block_size_8 ? 8 : 16); + int tb_m = thread_m_blocks * 16; // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32) @@ -220,6 +221,11 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8, int sh_a_size = pipe_stages * (tb_m * tb_k) * 2; int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; int sh_red_size = tb_m * (tb_n + 8) * 2; + int sh_bias_size = tb_n * 2; + int tmp_size = + (sh_b_size > sh_red_size ? 
sh_red_size : sh_b_size) + sh_bias_size; + tmp_size = max(max(sh_b_size, sh_red_size), tmp_size); + int sh_s_size = get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full); @@ -234,8 +240,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8, sh_zp_size = sh_s_size / 2; } - int total_size = max(sh_b_size, sh_red_size) + sh_a_size + sh_s_size + - sh_zp_size + sh_g_idx_size + sh_block_meta_size; + int total_size = tmp_size + sh_a_size + sh_s_size + sh_zp_size + + sh_g_idx_size + sh_block_meta_size; return total_size; } @@ -270,20 +276,25 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8, int cache_size = get_kernel_cache_size( th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float); - return cache_size <= max_shared_mem; + return cache_size + 512 <= max_shared_mem; } - #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ - else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ - thread_n_blocks == THREAD_N_BLOCKS && \ - thread_k_blocks == THREAD_K_BLOCKS && \ - m_block_size_8 == M_BLOCK_SIZE_8 && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ - is_zp_float == IS_ZP_FLOAT) { \ - kernel = Marlin; \ + #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ + M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ + else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + m_block_size_8 == M_BLOCK_SIZE_8 && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ + is_zp_float == IS_ZP_FLOAT) { \ + constexpr auto S_TYPE = \ + W_TYPE == vllm::kFE2M1f \ + ? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu) \ + : (std::is_same::value ? 
vllm::kFloat16 \ + : vllm::kBFloat16); \ + kernel = Marlin; \ } // COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false) @@ -335,30 +346,44 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8, _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ - \ _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) - #define FP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + #define BIGGROUP_GET_IF(W_TYPE) \ + BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \ + BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \ + BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \ + BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128) + + #define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) - #define FP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + #define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \ _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) - #define FP4_GET_IF(W_TYPE) \ - FP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ - FP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ - FP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ - FP4_GET_IF_M234(W_TYPE, 8, 4, 128) + #define NVFP4_GET_IF(W_TYPE) \ + NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ + NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ + NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ + NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128) - #define BIGGROUP_GET_IF(W_TYPE) \ - BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \ - BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \ - BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \ - BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128) + #define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) + + #define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) + + #define MXFP4_GET_IF(W_TYPE) \ + MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ + MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ + MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ + MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128) // We currently have 4-bit models only with group_blocks == 4 #define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ @@ -408,12 +433,17 @@ MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type, COMMON_GET_IF(vllm::kU4B8) COMMON_GET_IF(vllm::kU8B128) - BIGGROUP_GET_IF(vllm::kFE4M3fn) + NVFP4_GET_IF(vllm::kFE2M1f) - FP4_GET_IF(vllm::kFE2M1f) + BIGGROUP_GET_IF(vllm::kFE4M3fn) ACT_GET_IF(vllm::kU4B8) ACT_GET_IF(vllm::kU8B128) + if (std::is_same::value) { + if (false) { + } + MXFP4_GET_IF(vllm::kFE2M1f) + } return kernel; } @@ -482,16 +512,16 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m, } template -void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, - void* s2, void* zp, void* g_idx, void* perm, void* a_tmp, - void* sorted_token_ids, void* expert_ids, +void 
marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias, + void* s, void* s2, void* zp, void* g_idx, void* perm, + void* a_tmp, void* sorted_token_ids, void* expert_ids, void* num_tokens_past_padded, void* topk_weights, int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep, int prob_m, int prob_n, int prob_k, void* workspace, - vllm::ScalarType const& q_type, bool has_act_order, - bool is_k_full, bool has_zp, int num_groups, int group_size, - int dev, cudaStream_t stream, int thread_k, int thread_n, - int sms, bool use_atomic_add, bool use_fp32_reduce, + vllm::ScalarType const& q_type, bool has_bias, + bool has_act_order, bool is_k_full, bool has_zp, int num_groups, + int group_size, int dev, cudaStream_t stream, int thread_k, + int thread_n, int sms, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) { int thread_m_blocks = div_ceil(moe_block_size, 16); bool m_block_size_8 = moe_block_size == 8; @@ -538,6 +568,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, const int4* B_ptr = (const int4*)B; int4* C_ptr = (int4*)C; int4* C_tmp_ptr = (int4*)C_tmp; + const int4* bias_ptr = (const int4*)b_bias; const int4* s_ptr = (const int4*)s; const uint16_t* s2_ptr = (const uint16_t*)s2; const int4* zp_ptr = (const int4*)zp; @@ -648,10 +679,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, // avoid ">>>" being formatted to "> > >" // clang-format off kernel<<>>( - A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr, + A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr, sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr, topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m, - prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce, max_shared_mem); + prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce, max_shared_mem); // clang-format on } @@ -659,7 +690,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, torch::Tensor moe_wna16_marlin_gemm( torch::Tensor& a, std::optional const& c_or_none, - torch::Tensor& b_q_weight, torch::Tensor& b_scales, + torch::Tensor& b_q_weight, + std::optional const& b_bias_or_none, torch::Tensor& b_scales, std::optional const& global_scale_or_none, std::optional const& b_zeros_or_none, std::optional const& g_idx_or_none, @@ -766,7 +798,6 @@ torch::Tensor moe_wna16_marlin_gemm( num_groups = b_scales.size(1); torch::Tensor g_idx, perm, a_tmp; - ; if (g_idx_or_none.has_value() && perm_or_none.has_value()) { g_idx = g_idx_or_none.value(); perm = perm_or_none.value(); @@ -815,12 +846,24 @@ torch::Tensor moe_wna16_marlin_gemm( torch::Tensor global_scale; if (global_scale_or_none.has_value()) { global_scale = global_scale_or_none.value(); - TORCH_CHECK(b_q_type == vllm::kFE2M1f, - "global_scale can only be used for float4_e2m1f."); + TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16, + "global_scale can only be used for nvfp4 format."); } else { global_scale = torch::empty({0}, options); - TORCH_CHECK(!(b_q_type == vllm::kFE2M1f), - "the global_scale parameter must be passed for float4_e2m1f."); + TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16), + "the global_scale parameter must be passed for nvfp4 format."); + } + + bool has_bias = b_bias_or_none.has_value(); + torch::Tensor b_bias; + if (has_bias) { + b_bias = b_bias_or_none.value(); + TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU"); + TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not 
contiguous"); + TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n"); + TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1"); + } else { + b_bias = torch::empty({0}, options); } torch::Tensor b_zeros; @@ -832,7 +875,6 @@ torch::Tensor moe_wna16_marlin_gemm( b_zeros = torch::empty({0}, options); } bool has_zp = b_zeros.size(-1) > 0; - if (has_zp) { TORCH_CHECK( b_q_type == vllm::kU4 || b_q_type == vllm::kU8, @@ -890,41 +932,58 @@ torch::Tensor moe_wna16_marlin_gemm( if (a.scalar_type() == at::ScalarType::Half) { void* scales_ptr; if (b_q_type == vllm::kFE2M1f) { - scales_ptr = b_scales.data_ptr(); + if (group_size == 16) + scales_ptr = b_scales.data_ptr(); + else if (group_size == 32) + scales_ptr = b_scales.data_ptr(); + else + TORCH_CHECK(false, + "float4_e2m1f only supports group_size == 16 (NVFP4) ", + "and group_size == 32 (MXFP4)"); } else { scales_ptr = b_scales.data_ptr(); } MARLIN_NAMESPACE_NAME::marlin_mm( a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - c_tmp.data_ptr(), scales_ptr, global_scale.data_ptr(), - b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), - a_tmp.data_ptr(), sorted_token_ids.data_ptr(), - expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(), - topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep, - size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order, - is_k_full, has_zp, num_groups, group_size, dev, + c_tmp.data_ptr(), b_bias.data_ptr(), scales_ptr, + global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), + perm.data_ptr(), a_tmp.data_ptr(), + sorted_token_ids.data_ptr(), expert_ids.data_ptr(), + num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(), + moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k, + workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full, + has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float); } else if (a.scalar_type() == at::ScalarType::BFloat16) { void* scales_ptr; if (b_q_type == vllm::kFE2M1f) { - scales_ptr = b_scales.data_ptr(); + if (group_size == 16) + scales_ptr = b_scales.data_ptr(); + else if (group_size == 32) + scales_ptr = b_scales.data_ptr(); + else + TORCH_CHECK(false, + "float4_e2m1f only supports group_size == 16 (NVFP4) ", + "and group_size == 32 (MXFP4)"); } else { scales_ptr = b_scales.data_ptr(); } MARLIN_NAMESPACE_NAME::marlin_mm( a.data_ptr(), b_q_weight.data_ptr(), - c.data_ptr(), c_tmp.data_ptr(), scales_ptr, + c.data_ptr(), c_tmp.data_ptr(), + b_bias.data_ptr(), scales_ptr, global_scale.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), sorted_token_ids.data_ptr(), expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k, - workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, - num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float); + workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full, + has_zp, num_groups, group_size, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, + use_atomic_add, use_fp32_reduce, is_zp_float); } else { TORCH_CHECK(false, "moe_wna16_marlin_gemm only supports bfloat16 and float16"); diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 8bbcf5a673fd..b3d0c0aa58e9 100644 --- 
a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -8,12 +8,77 @@ #include "../cuda_compat.h" #include "../dispatch_utils.h" +#include "core/math.hpp" #define CEILDIV(x, y) (((x) + (y) - 1) / (y)) namespace vllm { namespace moe { +namespace batched_moe_align_block_size { + +// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel. +static constexpr int32_t num_threads = 1024; +static constexpr int32_t num_blocks = 1; +__global__ void batched_moe_align_block_size_kernel( + int32_t const num_batches, int32_t const max_tokens_per_batch, + int32_t const block_size, int32_t const* __restrict__ batch_num_tokens, + int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids, + int32_t* __restrict__ num_tokens_post_pad) { + // TODO(varun): This is a naive implementation. Could be optimized. + + size_t const batch_id = threadIdx.x; + size_t const stride = blockDim.x * gridDim.x; + int32_t const num_blocks_per_batch = + CEILDIV(max_tokens_per_batch, block_size); + int32_t const sorted_ids_size = + num_blocks_per_batch * num_batches * block_size; + int32_t const block_ids_size = sorted_ids_size / block_size; + int32_t const SENTINEL = + num_batches * max_tokens_per_batch; // To denote invalid entries. + // Intialize sorted_ids + for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) { + sorted_ids[i] = SENTINEL; + } + // Intialize expert_ids with -1 + for (size_t i = threadIdx.x; i < block_ids_size; i += stride) { + block_ids[i] = -1; + } + + int32_t b_num_tokens = 0; + if (batch_id < num_batches) { + b_num_tokens = batch_num_tokens[batch_id]; + } + int32_t const ceil_b_num_tokens = + CEILDIV(b_num_tokens, block_size) * block_size; + + // Compute prefix sum over token counts per expert + using BlockScan = cub::BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + int cumsum_val; + BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val); + __syncthreads(); + + bool const is_last_batch = batch_id == (num_batches - 1); + if (is_last_batch) { + *num_tokens_post_pad = cumsum_val + ceil_b_num_tokens; + } + + if (batch_id < num_batches) { + int32_t const batch_offset = batch_id * max_tokens_per_batch; + for (size_t i = 0; i < b_num_tokens; ++i) { + sorted_ids[cumsum_val + i] = batch_offset + i; + } + + int32_t const block_start = cumsum_val / block_size; + int32_t const num_blocks = ceil_b_num_tokens / block_size; + for (size_t i = 0; i < num_blocks; ++i) { + block_ids[block_start + i] = batch_id; + } + } +} +} // namespace batched_moe_align_block_size + template __global__ void moe_align_block_size_kernel( const scalar_t* __restrict__ topk_ids, @@ -44,6 +109,9 @@ __global__ void moe_align_block_size_kernel( for (size_t i = tid; i < numel; i += stride) { int expert_id = topk_ids[i]; + if (expert_id >= num_experts) { + continue; + } int warp_idx = expert_id / experts_per_warp; int expert_offset = expert_id % experts_per_warp; atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1); @@ -95,12 +163,15 @@ template __global__ void count_and_sort_expert_tokens_kernel( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, - size_t numel) { + size_t numel, int32_t num_experts) { const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; const size_t stride = blockDim.x * gridDim.x; for (size_t i = tid; i < numel; i += stride) { int32_t expert_id = topk_ids[i]; + if (expert_id >= num_experts) { + continue; + } int32_t rank_post_pad = 
atomicAdd(&cumsum_buffer[expert_id], 1); sorted_token_ids[rank_post_pad] = i; } @@ -269,11 +340,38 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, sort_kernel<<>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), - cumsum_buffer.data_ptr(), topk_ids.numel()); + cumsum_buffer.data_ptr(), topk_ids.numel(), num_experts); } }); } +void batched_moe_align_block_size(int64_t max_tokens_per_batch, + int64_t block_size, + torch::Tensor const& batch_num_tokens, + torch::Tensor sorted_ids, + torch::Tensor batch_ids, + torch::Tensor num_tokens_post_pad) { + namespace batched_kernel = vllm::moe::batched_moe_align_block_size; + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + int32_t const B = batch_num_tokens.size(0); + int32_t const num_blocks_per_batch = + round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size; + int32_t const num_blocks = num_blocks_per_batch * B; + int64_t const sorted_ids_size = num_blocks * block_size; + + TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size); + TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size); + TORCH_CHECK(num_tokens_post_pad.size(0) == 1); + TORCH_CHECK(B <= batched_kernel::num_threads); + + batched_kernel::batched_moe_align_block_size_kernel<<< + batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>( + B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr(), + sorted_ids.data_ptr(), batch_ids.data_ptr(), + num_tokens_post_pad.data_ptr()); +} + void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] torch::Tensor& output) // [num_tokens, hidden_size] { diff --git a/csrc/moe/moe_lora_align_sum_kernels.cu b/csrc/moe/moe_lora_align_sum_kernels.cu new file mode 100644 index 000000000000..1d25844bd526 --- /dev/null +++ b/csrc/moe/moe_lora_align_sum_kernels.cu @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../cuda_compat.h" +#include "../dispatch_utils.h" +#include "core/math.hpp" + +namespace { + +__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, + int32_t col) { + return row * total_col + col; +} + +} // namespace + +// TODO: Refactor common parts with moe_align_sum_kernels +template +__global__ void moe_lora_align_sum_kernel( + scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping, + int64_t block_size, int num_experts, int max_loras, size_t numel, + int max_num_tokens_padded, int max_num_m_blocks, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, + int topk_num, int32_t* total_tokens_post_pad) { + const size_t tokens_per_thread = div_ceil(numel, blockDim.x); + const size_t start_idx = threadIdx.x * tokens_per_thread; + + int lora_id = blockIdx.x; + extern __shared__ int32_t shared_mem[]; + int32_t* cumsum = shared_mem; + token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1); + + // Initialize sorted_token_ids with numel + for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { + sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel; + } + + // Initialize expert_ids with -1 + for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) { + expert_ids[lora_id * max_num_m_blocks + it] = -1; + } + + // Initialize total_tokens_post_pad with 0 + if (threadIdx.x == 0) { + total_tokens_post_pad[lora_id] = 0; + } + + for (int i = 0; i < num_experts; ++i) { + tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; + } + + for (int i = start_idx; i < numel && i < start_idx + 
tokens_per_thread; ++i) { + int mask = token_lora_mapping[i / topk_num] == lora_id; + int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]); + tokens_cnts[idx] += mask; + } + + __syncthreads(); + + // For each expert we accumulate the token counts from the different threads. + if (threadIdx.x < num_experts) { + tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; + for (int i = 1; i <= blockDim.x; ++i) { + tokens_cnts[index(num_experts, i, threadIdx.x)] += + tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; + } + } + + __syncthreads(); + + // We accumulate the token counts of all experts in thread 0. + if (threadIdx.x == 0) { + cumsum[0] = 0; + for (int i = 1; i <= num_experts; ++i) { + cumsum[i] = cumsum[i - 1] + + div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)], + block_size) * + block_size; + } + total_tokens_post_pad[lora_id] = static_cast(cumsum[num_experts]); + } + + __syncthreads(); + + /** + * For each expert, each thread processes the tokens of the corresponding + * blocks and stores the corresponding expert_id for each block. + */ + if (threadIdx.x < num_experts) { + for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; + i += block_size) { + expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] = + threadIdx.x; + } + } + + for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + int32_t expert_id = topk_ids[i]; + /** The cumsum[expert_id] stores the starting index of the tokens that the + * expert with expert_id needs to process, and + * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens + * processed by the expert with expert_id within the current thread's token + * shard. + */ + int32_t rank_post_pad = + tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + + cumsum[expert_id]; + + int mask = (int)token_lora_mapping[i / topk_num] == lora_id; + atomicAdd( + &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)], + (i - numel) * mask); + tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask; + } +} + +void moe_lora_align_block_size(torch::Tensor topk_ids, + torch::Tensor token_lora_mapping, + int64_t num_experts, int64_t block_size, + int64_t max_loras, + torch::Tensor sorted_token_ids, + torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad) { + const int topk_num = topk_ids.size(1); + + int max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1); + + TORCH_CHECK(block_size > 0, "block_size should be greater than 0. 
"); + max_num_tokens_padded = round_to_next_multiple_of( + max_num_tokens_padded, static_cast(block_size)); + int max_num_m_blocks = div_ceil(max_num_tokens_padded, block_size); + + int device_max_shared_mem; + auto dev = topk_ids.get_device(); + cudaDeviceGetAttribute(&device_max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + const int32_t num_thread = max((int32_t)num_experts, 128); // WARP_SIZE, + TORCH_CHECK(num_thread <= 1024, + "num_thread must be less than 1024, " + "and fallback is not implemented yet."); + const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) + + (num_experts + 1) * sizeof(int32_t); + + if (shared_mem > device_max_shared_mem) { + TORCH_CHECK(false, + "Shared memory usage exceeds device limit, and global memory " + "fallback is not implemented yet."); + } + + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] { + dim3 blockDim(num_thread); + auto kernel = moe_lora_align_sum_kernel; + AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( + (void*)kernel, shared_mem)); + kernel<<>>( + topk_ids.data_ptr(), + token_lora_mapping.data_ptr(), block_size, num_experts, + max_loras, topk_ids.numel(), max_num_tokens_padded, + max_num_m_blocks, sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), topk_num, + num_tokens_post_pad.data_ptr()); + }); +} \ No newline at end of file diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 661730c96867..45dd1824ded4 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -4,7 +4,7 @@ void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, torch::Tensor& token_expert_indices, - torch::Tensor& gating_output); + torch::Tensor& gating_output, bool renormalize); void moe_sum(torch::Tensor& input, torch::Tensor& output); @@ -12,6 +12,21 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad); + +void batched_moe_align_block_size(int64_t max_tokens_per_batch, + int64_t block_size, + torch::Tensor const& expert_num_tokens, + torch::Tensor sorted_ids, + torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad); + +void moe_lora_align_block_size(torch::Tensor topk_ids, + torch::Tensor token_lora_mapping, + int64_t num_experts, int64_t block_size, + int64_t max_loras, + torch::Tensor sorted_token_ids, + torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad); #ifndef USE_ROCM torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor b_qweight, torch::Tensor b_scales, @@ -22,6 +37,11 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor num_tokens_post_pad, int64_t top_k, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_K, int64_t bit); + +std::tuple grouped_topk( + torch::Tensor const& scores, torch::Tensor const& scores_with_bias, + int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize, + double routed_scaling_factor); #endif bool moe_permute_unpermute_supported(); diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index a77471a7f207..ca0c873f49d9 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -10,32 +10,28 @@ void moe_permute( const torch::Tensor& input, // [n_token, hidden] - const torch::Tensor& topk_weights, //[n_token, topk] - torch::Tensor& 
topk_ids, // [n_token, topk] + const torch::Tensor& topk_ids, // [n_token, topk] const torch::Tensor& token_expert_indices, // [n_token, topk] const std::optional& expert_map, // [n_expert] int64_t n_expert, int64_t n_local_expert, int64_t topk, const std::optional& align_block_size, - torch::Tensor& - permuted_input, // [topk * n_token/align_block_size_m, hidden] + torch::Tensor& permuted_input, // [permuted_size, hidden] torch::Tensor& expert_first_token_offset, // [n_local_expert + 1] - torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] + torch::Tensor& inv_permuted_idx, // [n_token, topk] + torch::Tensor& permuted_idx, // [permute_size] torch::Tensor& m_indices) { // [align_expand_m] - TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float, - "topk_weights must be float32"); TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long, "expert_first_token_offset must be int64"); TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, "topk_ids must be int32"); TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int, "token_expert_indices must be int32"); - TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, - "src_row_id2dst_row_id_map must be int32"); + TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int, + "inv_permuted_idx must be int32"); TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, "expert_first_token_offset shape != n_local_expert+1") - TORCH_CHECK( - src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(), - "token_expert_indices shape must be same as src_row_id2dst_row_id_map"); + TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(), + "token_expert_indices shape must be same as inv_permuted_idx"); auto n_token = input.sizes()[0]; auto n_hidden = input.sizes()[1]; auto align_block_size_value = @@ -46,10 +42,9 @@ void moe_permute( auto sort_workspace = torch::empty( {sorter_size}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); + auto copy_topk_ids = topk_ids.clone(); // copy topk_ids for preprocess auto permuted_experts_id = torch::empty_like(topk_ids); - auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map); - auto align_expert_first_token_offset = - torch::zeros_like(expert_first_token_offset); + auto sorted_row_idx = torch::empty_like(inv_permuted_idx); CubKeyValueSorter sorter{}; int64_t* valid_num_ptr = nullptr; @@ -67,66 +62,68 @@ void moe_permute( const int* expert_map_ptr = get_ptr(expert_map.value()); valid_num_ptr = get_ptr(expert_first_token_offset) + n_local_expert; - preprocessTopkIdLauncher(get_ptr(topk_ids), n_token * topk, + preprocessTopkIdLauncher(get_ptr(copy_topk_ids), n_token * topk, expert_map_ptr, n_expert, stream); } // expert sort topk expert id and scan expert id get expert_first_token_offset - sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indices), - get_ptr(permuted_experts_id), - get_ptr(dst_row_id2src_row_id_map), - get_ptr(expert_first_token_offset), n_token, - n_expert, n_local_expert, topk, sorter, - get_ptr(sort_workspace), stream); + sortAndScanExpert( + get_ptr(copy_topk_ids), get_ptr(token_expert_indices), + get_ptr(permuted_experts_id), get_ptr(sorted_row_idx), + get_ptr(expert_first_token_offset), n_token, n_expert, + n_local_expert, topk, sorter, get_ptr(sort_workspace), stream); // dispatch expandInputRowsKernelLauncher MOE_DISPATCH(input.scalar_type(), [&] { expandInputRowsKernelLauncher( get_ptr(input), get_ptr(permuted_input), - 
get_ptr(topk_weights), get_ptr(permuted_experts_id), - get_ptr(dst_row_id2src_row_id_map), - get_ptr(src_row_id2dst_row_id_map), + get_ptr(permuted_experts_id), get_ptr(sorted_row_idx), + get_ptr(inv_permuted_idx), get_ptr(permuted_idx), get_ptr(expert_first_token_offset), n_token, valid_num_ptr, n_hidden, topk, n_local_expert, align_block_size_value, stream); }); // get m_indices and update expert_first_token_offset with align block - getMIndices(get_ptr(expert_first_token_offset), - get_ptr(align_expert_first_token_offset), - get_ptr(m_indices), n_local_expert, align_block_size_value, - stream); + // this is only required for DeepGemm and not required for CUTLASS group gemm if (align_block_size.has_value()) { - // update align_expert_first_token_offset + auto align_expert_first_token_offset = + torch::zeros_like(expert_first_token_offset); + getMIndices(get_ptr(expert_first_token_offset), + get_ptr(align_expert_first_token_offset), + get_ptr(m_indices), n_local_expert, align_block_size_value, + stream); expert_first_token_offset.copy_(align_expert_first_token_offset); } } void moe_unpermute( - const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden] - const torch::Tensor& topk_weights, //[n_token, topk] - const torch::Tensor& topk_ids, // [n_token, topk] - const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] - const torch::Tensor& expert_first_token_offset, // [n_local_expert+1] - int64_t n_expert, int64_t n_local_expert, int64_t topk, + const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden] + const torch::Tensor& topk_weights, // [n_token, topk] + const torch::Tensor& inv_permuted_idx, // [n_token, topk] + const std::optional& + expert_first_token_offset, // [n_local_expert+1] + int64_t topk, torch::Tensor& hidden_states // [n_token, hidden] ) { - TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(), - "topk_ids shape must be same as src_row_id2dst_row_id_map"); - TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, - "topk_ids must be int32"); TORCH_CHECK( permuted_hidden_states.scalar_type() == hidden_states.scalar_type(), - "topk_ids dtype must be same as src_row_id2dst_row_id_map"); + "permuted_hidden_states dtype must be same as hidden_states"); auto n_token = hidden_states.size(0); auto n_hidden = hidden_states.size(1); auto stream = at::cuda::getCurrentCUDAStream().stream(); - const int64_t* valid_ptr = - get_ptr(expert_first_token_offset) + n_local_expert; + + int64_t const* valid_ptr = nullptr; + if (expert_first_token_offset.has_value()) { + int n_local_expert = expert_first_token_offset.value().size(0) - 1; + valid_ptr = + get_ptr(expert_first_token_offset.value()) + n_local_expert; + } + MOE_DISPATCH(hidden_states.scalar_type(), [&] { finalizeMoeRoutingKernelLauncher( get_ptr(permuted_hidden_states), get_ptr(hidden_states), get_ptr(topk_weights), - get_ptr(src_row_id2dst_row_id_map), get_ptr(topk_ids), - n_token, n_hidden, topk, valid_ptr, stream); + get_ptr(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr, + stream); }); } @@ -198,19 +195,14 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, torch::Tensor& expert_first_token_offset, torch::Tensor& src_row_id2dst_row_id_map, torch::Tensor& m_indices) { - TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); + TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0"); } -void moe_unpermute(const torch::Tensor& input, - const torch::Tensor& topk_weights, torch::Tensor& topk_ids, - const torch::Tensor& 
token_expert_indices, - const std::optional& expert_map, - int64_t n_expert, int64_t n_local_expert, int64_t topk, - const std::optional& align_block_size, - torch::Tensor& permuted_input, - torch::Tensor& expert_first_token_offset, - torch::Tensor& src_row_id2dst_row_id_map, - torch::Tensor& m_indices) { +void moe_unpermute( + const torch::Tensor& permuted_hidden_states, + const torch::Tensor& topk_weights, const torch::Tensor& inv_permuted_idx, + const std::optional& expert_first_token_offset, int64_t topk, + torch::Tensor& hidden_states) { TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0"); } @@ -227,4 +219,4 @@ bool moe_permute_unpermute_supported() { TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("moe_permute", &moe_permute); m.impl("moe_unpermute", &moe_unpermute); -} +} \ No newline at end of file diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu index de2c153882d9..2271c1bc75b1 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu @@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset, int tidx = threadIdx.x; extern __shared__ int64_t smem_expert_first_token_offset[]; for (int i = tidx; i <= num_local_expert; i += blockDim.x) { - smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i); + smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i); } __syncthreads(); auto last_token_offset = smem_expert_first_token_offset[eidx + 1]; diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h index 43c29721cd16..108091efbefa 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h @@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows, template void expandInputRowsKernelLauncher( - T const* unpermuted_input, T* permuted_output, - const float* unpermuted_scales, int* sorted_experts, + T const* unpermuted_input, T* permuted_output, int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row, - int* expanded_source_row_to_expanded_dest_row, + int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int64_t* expert_first_token_offset, int64_t const num_rows, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, int num_local_experts, const int& align_block_size, cudaStream_t stream); -// Final kernel to unpermute and scale -// This kernel unpermutes the original data, does the k-way reduction and -// performs the final skip connection. 
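[reviewer note] For orientation on the reworked moe_unpermute interface above (inv_permuted_idx replacing src_row_id2dst_row_id_map, expert_first_token_offset now optional): the finalize step gathers each token's top-k permuted rows through inv_permuted_idx, scales them by topk_weights, and sums, skipping rows at or beyond the valid-token count in the expert-parallel case. A rough PyTorch reference, assuming the row-major [n_token, topk] indexing the updated kernels use; names are illustrative.

import torch

def moe_unpermute_reference(permuted_hidden_states, topk_weights,
                            inv_permuted_idx, num_valid=None):
    # permuted_hidden_states: [n_token * topk, hidden], rows grouped by expert
    # topk_weights:           [n_token, topk]
    # inv_permuted_idx:       [n_token, topk], maps (token, k) -> permuted row
    n_token, topk = topk_weights.shape
    hidden = permuted_hidden_states.shape[1]
    out = torch.zeros(n_token, hidden, dtype=torch.float32,
                      device=permuted_hidden_states.device)
    for t in range(n_token):
        for k in range(topk):
            row = int(inv_permuted_idx[t, k])
            if num_valid is not None and row >= num_valid:
                continue  # routed to a non-local expert under EP; contributes 0
            out[t] += (topk_weights[t, k].float()
                       * permuted_hidden_states[row].float())
    return out.to(permuted_hidden_states.dtype)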
-template -__global__ void finalizeMoeRoutingKernel( - T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, - float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, - int64_t const* num_valid_ptr); - template void finalizeMoeRoutingKernelLauncher( T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const num_rows, - int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, - cudaStream_t stream); + int64_t const num_rows, int64_t const cols, int64_t const k, + int64_t const* num_valid_ptr, cudaStream_t stream); void preprocessTopkIdLauncher(int* topk_id_ptr, int size, const int* expert_map_ptr, int num_experts, diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl index ad0d390665a0..449243b92a28 100644 --- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl @@ -2,10 +2,9 @@ template __global__ void expandInputRowsKernel( - T const* unpermuted_input, T* permuted_output, - const float* unpermuted_scales, int* sorted_experts, + T const* unpermuted_input, T* permuted_output, int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row, - int* expanded_source_row_to_expanded_dest_row, + int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int64_t* expert_first_token_offset, int64_t const num_rows, int64_t const* num_dest_rows, int64_t const cols, int64_t k, int num_local_experts, int align_block_size) { @@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel( assert(expanded_dest_row <= INT32_MAX); expanded_source_row_to_expanded_dest_row[expanded_source_row] = static_cast(expanded_dest_row); + // skip non local expert token + if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { + permuted_idx[expanded_dest_row] = expanded_source_row; + } } if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { @@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel( using DataElem = cutlass::Array; // Duplicate and permute rows - int64_t const source_row = expanded_source_row % num_rows; + int64_t const source_row = expanded_source_row / k; auto const* source_row_ptr = reinterpret_cast(unpermuted_input + source_row * cols); @@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel( template void expandInputRowsKernelLauncher( - T const* unpermuted_input, T* permuted_output, - const float* unpermuted_scales, int* sorted_experts, + T const* unpermuted_input, T* permuted_output, int* sorted_experts, int const* expanded_dest_row_to_expanded_source_row, - int* expanded_source_row_to_expanded_dest_row, + int* expanded_source_row_to_expanded_dest_row, int* permuted_idx, int64_t* expert_first_token_offset, int64_t const num_rows, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, int num_local_experts, const int& align_block_size, cudaStream_t stream) { @@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher( int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1); func<<>>( - unpermuted_input, permuted_output, unpermuted_scales, sorted_experts, + unpermuted_input, permuted_output, sorted_experts, expanded_dest_row_to_expanded_source_row, - expanded_source_row_to_expanded_dest_row, expert_first_token_offset, - num_rows, 
num_valid_tokens_ptr, cols, k, num_local_experts, - align_block_size); + expanded_source_row_to_expanded_dest_row, permuted_idx, + expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k, + num_local_experts, align_block_size); } template @@ -128,11 +130,9 @@ template __global__ void finalizeMoeRoutingKernel( T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, - int64_t const* num_valid_ptr) { + int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) { assert(orig_cols % 4 == 0); int64_t const original_row = blockIdx.x; - int64_t const num_rows = gridDim.x; auto const offset = original_row * orig_cols; OutputType* reduced_row_ptr = reduced_unpermuted_output + offset; int64_t const num_valid = *num_valid_ptr; @@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel( ComputeElem thread_output; thread_output.fill(0); for (int k_idx = 0; k_idx < k; ++k_idx) { - int64_t const expanded_original_row = original_row + k_idx * num_rows; + int64_t const expanded_original_row = original_row * k + k_idx; int64_t const expanded_permuted_row = expanded_source_row_to_expanded_dest_row[expanded_original_row]; int64_t const k_offset = original_row * k + k_idx; float const row_scale = scales[k_offset]; - // Check after row_rescale has accumulated if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) { continue; } @@ -189,9 +188,8 @@ template void finalizeMoeRoutingKernelLauncher( T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, float const* scales, int const* expanded_source_row_to_expanded_dest_row, - int const* expert_for_source_row, int64_t const num_rows, - int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, - cudaStream_t stream) { + int64_t const num_rows, int64_t const cols, int64_t const k, + int64_t const* num_valid_ptr, cudaStream_t stream) { int64_t const blocks = num_rows; int64_t const threads = 256; bool const check_finished = num_valid_ptr != nullptr; @@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher( auto* const kernel = func_map[check_finished]; kernel<<>>( expanded_permuted_rows, reduced_unpermuted_output, scales, - expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k, - num_valid_ptr); + expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr); } diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index ea4ff67ef3e4..af6e6fcd482c 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -16,18 +16,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include #include #include "../cuda_compat.h" -#include +#include "../cub_helpers.h" #ifndef USE_ROCM - #include - #include + #include + #include #else - #include - #include + #include + #include + typedef __hip_bfloat16 __nv_bfloat16; + typedef __hip_bfloat162 __nv_bfloat162; #endif #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) @@ -44,16 +47,27 @@ template < /// Alignment requirement in bytes int Alignment = sizeof(T) * N > -class alignas(Alignment) AlignedArray { - float data[N]; +struct alignas(Alignment) AlignedArray { + T data[N]; }; +template +__device__ __forceinline__ float toFloat(T value) { + if constexpr (std::is_same_v) { + return value; + } else if constexpr (std::is_same_v) { + return __bfloat162float(value); + } else if constexpr (std::is_same_v) { + return __half2float(value); + } +} + // ====================== Softmax things =============================== // We have our own implementation of softmax here so we can support transposing the output // in the softmax kernel when we extend this module to support expert-choice routing. -template +template __launch_bounds__(TPB) __global__ - void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols) + void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage tmpStorage; @@ -63,7 +77,6 @@ __launch_bounds__(TPB) __global__ const int thread_row_offset = blockIdx.x * num_cols; - cuda::std::plus sum; float threadData(-FLT_MAX); // Don't touch finished rows. @@ -75,10 +88,11 @@ __launch_bounds__(TPB) __global__ for (int ii = threadIdx.x; ii < num_cols; ii += TPB) { const int idx = thread_row_offset + ii; - threadData = max(static_cast(input[idx]), threadData); + const float val = toFloat(input[idx]); + threadData = max(val, threadData); } - const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max()); + const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp()); if (threadIdx.x == 0) { float_max = maxElem; @@ -90,10 +104,11 @@ __launch_bounds__(TPB) __global__ for (int ii = threadIdx.x; ii < num_cols; ii += TPB) { const int idx = thread_row_offset + ii; - threadData += exp((static_cast(input[idx]) - float_max)); + const float val = toFloat(input[idx]); + threadData += expf(val - float_max); } - const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum); + const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp()); if (threadIdx.x == 0) { @@ -104,8 +119,9 @@ __launch_bounds__(TPB) __global__ for (int ii = threadIdx.x; ii < num_cols; ii += TPB) { const int idx = thread_row_offset + ii; - const float val = exp((static_cast(input[idx]) - float_max)) * normalizing_factor; - output[idx] = val; + const float val = toFloat(input[idx]); + const float softmax_val = expf(val - float_max) * normalizing_factor; + output[idx] = softmax_val; } } @@ -119,7 +135,8 @@ __launch_bounds__(TPB) __global__ void moeTopK( const int num_experts, const int k, const int start_expert, - const int end_expert) + const int end_expert, + const bool renormalize) { using cub_kvp = cub::KeyValuePair; @@ -134,6 +151,7 @@ __launch_bounds__(TPB) __global__ void moeTopK( const bool row_is_active = finished ? !finished[block_row] : true; const int thread_read_offset = blockIdx.x * num_experts; + float selected_sum = 0.f; for (int k_idx = 0; k_idx < k; ++k_idx) { thread_kvp.key = 0; @@ -172,9 +190,23 @@ __launch_bounds__(TPB) __global__ void moeTopK( indices[idx] = should_process_row ? (expert - start_expert) : num_experts; assert(indices[idx] >= 0); source_rows[idx] = k_idx * num_rows + block_row; + if (renormalize) { + selected_sum += result_kvp.value; + } } __syncthreads(); } + + // Renormalize the k weights for this row to sum to 1, if requested. 
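// [Illustrative aside, not part of the diff] A minimal host-side sketch of what
// the new `renormalize` flag does to one row's selected weights, assuming the
// same guard as the kernel code below (a denominator of 1 when the selected
// sum is zero). Names here are illustrative only.
#include <vector>
inline void renormalize_topk_row(std::vector<float>& w /* k selected weights */) {
  float selected_sum = 0.f;
  for (float v : w) selected_sum += v;
  const float denom = selected_sum > 0.f ? selected_sum : 1.f;
  for (float& v : w) v /= denom;
}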
+ if (renormalize) { + if (threadIdx.x == 0) { + const float denom = selected_sum > 0.f ? selected_sum : 1.f; + for (int k_idx = 0; k_idx < k; ++k_idx) { + const int idx = k * block_row + k_idx; + output[idx] = output[idx] / denom; + } + } + } } // ====================== TopK softmax things =============================== @@ -187,35 +219,44 @@ __launch_bounds__(TPB) __global__ void moeTopK( It fuses the softmax, max and argmax into a single kernel. Limitations: - 1) This implementation is intended for when the number of experts is a small power of 2. + 1) This implementation is optimized for when the number of experts is a small power of 2. + Additionally it also supports when number of experts is multiple of 64 which is still + faster than the computing softmax and topK separately (only tested on CUDA yet). 2) This implementation assumes k is small, but will work for any k. */ -template -__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ - void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices, - int* source_rows, const int k, const int start_expert, const int end_expert) +template +__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__ + void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices, + int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize) { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v, + "InputType must be float, __nv_bfloat16, or __half"); + // We begin by enforcing compile time assertions and setting up compile time constants. - static_assert(VPT == (VPT & -VPT), "VPT must be power of 2"); - static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2"); static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2"); static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16"); // Number of bytes each thread pulls in per load - static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float); + static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType); static constexpr int ELTS_PER_ROW = NUM_EXPERTS; static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT; static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG; + if constexpr (std::is_same_v || std::is_same_v) { + static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0, + "ELTS_PER_LDG must be 1 or even for 16-bit conversion"); + } + // Restrictions based on previous section. static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg"); - static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp"); + static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp"); static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2"); - static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size"); + static_assert(THREADS_PER_ROW <= WARP_SIZE_PARAM, "THREADS_PER_ROW can be at most warp size"); // We have NUM_EXPERTS elements per row. 
We specialize for small #experts - static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT; + static constexpr int ELTS_PER_WARP = WARP_SIZE_PARAM * VPT; static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW; static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP; @@ -245,27 +286,71 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the // row it will read. - const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW; + const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW; // Now, we compute the group each thread belong to in order to determine the first column to start loads. const int thread_group_idx = threadIdx.x % THREADS_PER_ROW; const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG; - const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread; - - // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory, - // this can support all powers of 2 up to 16. - // NOTE(woosuk): The original implementation uses CUTLASS aligned array here. - // We defined our own aligned array and use it here to avoid the dependency on CUTLASS. - using AccessType = AlignedArray; + const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread; // Finally, we pull in the data from global mem float row_chunk[VPT]; - AccessType* row_chunk_vec_ptr = reinterpret_cast(&row_chunk); - const AccessType* vec_thread_read_ptr = reinterpret_cast(thread_read_ptr); + + // NOTE(zhuhaoran): dispatch different input types loading, BF16/FP16 convert to float + if constexpr (std::is_same_v) { + using VecType = AlignedArray; + VecType* row_chunk_vec_ptr = reinterpret_cast(&row_chunk); + const VecType* vec_thread_read_ptr = reinterpret_cast(thread_read_ptr); #pragma unroll - for (int ii = 0; ii < LDG_PER_THREAD; ++ii) - { - row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW]; + for (int ii = 0; ii < LDG_PER_THREAD; ++ii) { + row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW]; + } + } else if constexpr (std::is_same_v) { + if constexpr (ELTS_PER_LDG >= 2) { + using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>; + float2* row_chunk_f2 = reinterpret_cast(row_chunk); + const VecType* vec_thread_read_ptr = reinterpret_cast(thread_read_ptr); +#pragma unroll + for (int ii = 0; ii < LDG_PER_THREAD; ++ii) { + VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW]; + int base_idx_f2 = ii * ELTS_PER_LDG / 2; +#pragma unroll + for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) { + row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2( + *reinterpret_cast(vec.data + jj * 2) + ); + } + } + } else { // ELTS_PER_LDG == 1 +#pragma unroll + for (int ii = 0; ii < LDG_PER_THREAD; ++ii) { + const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW; + row_chunk[ii] = __bfloat162float(*scalar_ptr); + } + } + } else if constexpr (std::is_same_v) { + if constexpr (ELTS_PER_LDG >= 2) { + using VecType = AlignedArray<__half, ELTS_PER_LDG>; + float2* row_chunk_f2 = reinterpret_cast(row_chunk); + const VecType* vec_thread_read_ptr = reinterpret_cast(thread_read_ptr); +#pragma unroll + for (int ii = 0; ii < LDG_PER_THREAD; ++ii) { + VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW]; + int base_idx_f2 = ii * ELTS_PER_LDG / 2; +#pragma unroll + for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) { + row_chunk_f2[base_idx_f2 + jj] = __half22float2( + 
*reinterpret_cast(vec.data + jj * 2) + ); + } + } + } else { // ELTS_PER_LDG == 1 +#pragma unroll + for (int ii = 0; ii < LDG_PER_THREAD; ++ii) { + const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW; + row_chunk[ii] = __half2float(*scalar_ptr); + } + } } // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just @@ -319,6 +404,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ int start_col = first_elt_read_by_thread; static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW; + float selected_sum = 0.f; for (int k_idx = 0; k_idx < k; ++k_idx) { // First, each thread does the local argmax @@ -372,6 +458,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ output[idx] = max_val; indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS; source_rows[idx] = k_idx * num_rows + thread_row; + if (renormalize) { + selected_sum += max_val; + } } // Finally, we clear the value in the thread with the current max if there is another iteration to run. @@ -389,50 +478,78 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ } } } + + // Renormalize the k weights for this row to sum to 1, if requested. + if (renormalize) { + if (thread_group_idx == 0) + { + const float denom = selected_sum > 0.f ? selected_sum : 1.f; + for (int k_idx = 0; k_idx < k; ++k_idx) + { + const int idx = k * thread_row + k_idx; + output[idx] = output[idx] / denom; + } + } + } } namespace detail { // Constructs some constants needed to partition the work across threads at compile time. -template +template struct TopkConstants { - static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float); - static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, ""); - static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE)); + static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType); + static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, ""); + static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM)); static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG; static constexpr int THREADS_PER_ROW = EXPERTS / VPT; - static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW; + static const int ROWS_PER_WARP = WARP_SIZE_PARAM / THREADS_PER_ROW; }; } // namespace detail -template -void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices, - int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream) +template +void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices, + int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize, + cudaStream_t stream) { - static constexpr std::size_t MAX_BYTES_PER_LDG = 16; - - static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS); - using Constants = detail::TopkConstants; + static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS); + using Constants = detail::TopkConstants; static constexpr int VPT = Constants::VPT; static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP; const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP; const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; - dim3 
block_dim(WARP_SIZE, WARPS_PER_TB); - topkGatingSoftmax<<>>( - input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert); + dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB); + topkGatingSoftmax<<>>( + input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize); } -#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \ - topkGatingSoftmaxLauncherHelper( \ - gating_output, nullptr, topk_weights, topk_indices, \ - token_expert_indices, num_tokens, topk, 0, num_experts, \ - stream); +#ifndef USE_ROCM +#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ + static_assert(WARP_SIZE == 32, \ + "Unsupported warp size. Only 32 is supported for CUDA"); \ + topkGatingSoftmaxLauncherHelper( \ + gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \ + num_tokens, topk, 0, num_experts, renormalize, stream); +#else +#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ + if (WARP_SIZE == 64) { \ + topkGatingSoftmaxLauncherHelper( \ + gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \ + num_tokens, topk, 0, num_experts, renormalize, stream); \ + } else if (WARP_SIZE == 32) { \ + topkGatingSoftmaxLauncherHelper( \ + gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \ + num_tokens, topk, 0, num_experts, renormalize, stream); \ + } else { \ + assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \ + } +#endif -template +template void topkGatingSoftmaxKernelLauncher( - const float* gating_output, + const InputType* gating_output, float* topk_weights, IndType* topk_indices, int* token_expert_indices, @@ -440,45 +557,76 @@ void topkGatingSoftmaxKernelLauncher( const int num_tokens, const int num_experts, const int topk, + const bool renormalize, cudaStream_t stream) { static constexpr int WARPS_PER_TB = 4; + static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16; +#ifndef USE_ROCM + // for bfloat16 dtype, we need 4 bytes loading to make sure num_experts + // elements can be loaded by a warp + static constexpr int BYTES_PER_LDG_MULTIPLE_64 = + (std::is_same_v || std::is_same_v) ? 4 : 8; +#endif switch (num_experts) { case 1: - LAUNCH_SOFTMAX(1, WARPS_PER_TB); + LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 2: - LAUNCH_SOFTMAX(2, WARPS_PER_TB); + LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 4: - LAUNCH_SOFTMAX(4, WARPS_PER_TB); + LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 8: - LAUNCH_SOFTMAX(8, WARPS_PER_TB); + LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 16: - LAUNCH_SOFTMAX(16, WARPS_PER_TB); + LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 32: - LAUNCH_SOFTMAX(32, WARPS_PER_TB); + LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 64: - LAUNCH_SOFTMAX(64, WARPS_PER_TB); + LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 128: - LAUNCH_SOFTMAX(128, WARPS_PER_TB); + LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 256: - LAUNCH_SOFTMAX(256, WARPS_PER_TB); + LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); + break; + case 512: + LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); + break; + // (CUDA only) support multiples of 64 when num_experts is not power of 2. + // ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts, + // alternatively we can test 4 bytes loading and enable it in future. 
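// [Illustrative aside, not part of the diff] Worked example of the constants
// behind the new multiple-of-64 cases, assuming WARP_SIZE == 32 (CUDA) and a
// 16-bit gating dtype, which is why BYTES_PER_LDG_MULTIPLE_64 drops to 4:
//   EXPERTS = 192, sizeof(__nv_bfloat16) = 2, BYTES_PER_LDG = 4
//   ELTS_PER_LDG    = 4 / 2          = 2
//   VECs_PER_THREAD = 192 / (2 * 32) = 3
//   VPT             = 3 * 2          = 6
//   THREADS_PER_ROW = 192 / 6        = 32   (exactly one warp per row)
// With 8- or 16-byte loads the same arithmetic either trips the divisibility
// static_assert in detail::TopkConstants or leaves THREADS_PER_ROW unable to
// tile a warp, so the narrower load width is required for these expert counts.
constexpr int kEltsPerLdg = 4 / 2;
constexpr int kVpt = (192 / (kEltsPerLdg * 32)) * kEltsPerLdg;  // 6
constexpr int kThreadsPerRow = 192 / kVpt;                      // 32
static_assert(kThreadsPerRow <= 32 && 32 % kThreadsPerRow == 0,
              "one row per warp");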
+#ifndef USE_ROCM + case 192: + LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 320: + LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 384: + LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); break; + case 448: + LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 576: + LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; +#endif default: { TORCH_CHECK(softmax_workspace != nullptr, - "softmax_workspace must be provided for num_experts that are not a power of 2."); + "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64."); static constexpr int TPB = 256; - moeSoftmax<<>>( + moeSoftmax<<>>( gating_output, nullptr, softmax_workspace, num_experts); moeTopK<<>>( softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices, - num_experts, topk, 0, num_experts); + num_experts, topk, 0, num_experts, renormalize); } } } @@ -486,11 +634,50 @@ void topkGatingSoftmaxKernelLauncher( } // namespace moe } // namespace vllm + +template +void dispatch_topk_softmax_launch( + torch::Tensor& gating_output, + torch::Tensor& topk_weights, + torch::Tensor& topk_indices, + torch::Tensor& token_expert_indices, + torch::Tensor& softmax_workspace, + int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream) +{ + if (topk_indices.scalar_type() == at::ScalarType::Int) { + vllm::moe::topkGatingSoftmaxKernelLauncher( + reinterpret_cast(gating_output.data_ptr()), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, num_experts, topk, renormalize, stream); + } else if (topk_indices.scalar_type() == at::ScalarType::UInt32) { + vllm::moe::topkGatingSoftmaxKernelLauncher( + reinterpret_cast(gating_output.data_ptr()), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, num_experts, topk, renormalize, stream); + } else { + TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long); + vllm::moe::topkGatingSoftmaxKernelLauncher( + reinterpret_cast(gating_output.data_ptr()), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, num_experts, topk, renormalize, stream); + } +} + void topk_softmax( torch::Tensor& topk_weights, // [num_tokens, topk] torch::Tensor& topk_indices, // [num_tokens, topk] torch::Tensor& token_expert_indices, // [num_tokens, topk] - torch::Tensor& gating_output) // [num_tokens, num_experts] + torch::Tensor& gating_output, // [num_tokens, num_experts] + bool renormalize) { const int num_experts = gating_output.size(-1); const auto num_tokens = gating_output.numel() / num_experts; @@ -502,45 +689,19 @@ void topk_softmax( const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options()); - - if(topk_indices.scalar_type() == at::ScalarType::Int) - { - vllm::moe::topkGatingSoftmaxKernelLauncher( - gating_output.data_ptr(), - topk_weights.data_ptr(), - topk_indices.data_ptr(), - token_expert_indices.data_ptr(), - softmax_workspace.data_ptr(), - num_tokens, - num_experts, - topk, - stream); - } - else if (topk_indices.scalar_type() == at::ScalarType::UInt32) - { - vllm::moe::topkGatingSoftmaxKernelLauncher( - 
gating_output.data_ptr(), - topk_weights.data_ptr(), - topk_indices.data_ptr(), - token_expert_indices.data_ptr(), - softmax_workspace.data_ptr(), - num_tokens, - num_experts, - topk, - stream); - } - else { - assert(topk_indices.scalar_type() == at::ScalarType::Int64); - vllm::moe::topkGatingSoftmaxKernelLauncher( - gating_output.data_ptr(), - topk_weights.data_ptr(), - topk_indices.data_ptr(), - token_expert_indices.data_ptr(), - softmax_workspace.data_ptr(), - num_tokens, - num_experts, - topk, - stream); + const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float); + torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options); + + if (gating_output.scalar_type() == at::ScalarType::Float) { + dispatch_topk_softmax_launch(gating_output, topk_weights, topk_indices, + token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream); + } else if (gating_output.scalar_type() == at::ScalarType::Half) { + dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices, + token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream); + } else if (gating_output.scalar_type() == at::ScalarType::BFloat16) { + dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices, + token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream); + } else { + TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type()); } } diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 97df311d0440..f110683af72d 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { // Apply topk softmax to the gating outputs. m.def( "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! " - "token_expert_indices, Tensor gating_output) -> ()"); + "token_expert_indices, Tensor gating_output, bool renormalize) -> ()"); m.impl("topk_softmax", torch::kCUDA, &topk_softmax); // Calculate the result of moe by summing up the partial results @@ -22,6 +22,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " Tensor! num_tokens_post_pad) -> ()"); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); + // Aligning the number of tokens to be processed by each expert such + // that it is divisible by the block size, but for the batched case. + m.def( + "batched_moe_align_block_size(int max_tokens_per_batch," + " int block_size, Tensor expert_num_tokens," + " Tensor! sorted_token_ids," + " Tensor! experts_ids," + " Tensor! num_tokens_post_pad) -> ()"); + m.impl("batched_moe_align_block_size", torch::kCUDA, + &batched_moe_align_block_size); + + // Aligning the number of tokens to be processed by each expert such + // that it is divisible by the block size. + m.def( + "moe_lora_align_block_size(Tensor topk_ids," + " Tensor token_lora_mapping," + " int num_experts," + " int block_size, int max_loras, " + " Tensor !sorted_token_ids," + " Tensor !experts_ids," + " Tensor !num_tokens_post_pad) -> () "); + m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size); + #ifndef USE_ROCM m.def( "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, " @@ -35,7 +58,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { m.def( "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none," - "Tensor! b_q_weight, Tensor! b_scales, Tensor? global_scale, Tensor? " + "Tensor! b_q_weight, Tensor? 
b_bias_or_none," + "Tensor! b_scales, Tensor? global_scale, Tensor? " "b_zeros_or_none," "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace," "Tensor sorted_token_ids," @@ -56,18 +80,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " -> Tensor"); m.def( - "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids," + "moe_permute(Tensor input, Tensor topk_ids," "Tensor token_expert_indices, Tensor? expert_map, int n_expert," "int n_local_expert," "int topk, int? align_block_size,Tensor! permuted_input, Tensor! " - "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! " - "m_indices)->()"); + "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! " + "permuted_idx, Tensor! m_indices)->()"); m.def( "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights," - "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor " - "expert_first_token_offset, int n_expert, int n_local_expert,int " - "topk, Tensor! hidden_states)->()"); + "Tensor inv_permuted_idx, Tensor? expert_first_token_offset, " + "int topk, Tensor! hidden_states)->()"); m.def("moe_permute_unpermute_supported() -> bool"); m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported); @@ -78,6 +101,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "output_tensor) -> ()"); m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows); + // Apply grouped topk routing to select experts. + m.def( + "grouped_topk(Tensor scores, Tensor scores_with_bias, int n_group, int " + "topk_group, int topk, bool renormalize, float " + "routed_scaling_factor) -> (Tensor, Tensor)"); + m.impl("grouped_topk", torch::kCUDA, &grouped_topk); #endif } diff --git a/csrc/ops.h b/csrc/ops.h index 7f3e6b6923a3..c135a1404294 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -97,6 +97,11 @@ void apply_repetition_penalties_(torch::Tensor& logits, const torch::Tensor& output_mask, const torch::Tensor& repetition_penalties); +void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts, + const torch::Tensor& rowEnds, torch::Tensor& indices, + torch::Tensor& values, int64_t numRows, int64_t stride0, + int64_t stride1); + void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, torch::Tensor& scale, double epsilon); @@ -119,17 +124,24 @@ void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, std::optional key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox); -void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query, - std::optional key, - int64_t head_size, torch::Tensor& cos_sin_cache, - bool is_neox, int64_t rot_dim, - torch::Tensor& cos_sin_cache_offsets); - void silu_and_mul(torch::Tensor& out, torch::Tensor& input); void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); +#ifndef USE_ROCM +void silu_and_mul_nvfp4_quant(torch::Tensor& out, + torch::Tensor& output_block_scale, + torch::Tensor& input, + torch::Tensor& input_global_scale); +#endif +void persistent_masked_m_silu_mul_quant( + const at::Tensor& input, // (E, T, 2*H) + const at::Tensor& counts, // (E) + at::Tensor& y_q, // (E, T, H) [OUT] + at::Tensor& y_s, // (E, T, H//group_size) [OUT] + bool use_ue8m0); + void mul_and_silu(torch::Tensor& out, torch::Tensor& input); void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); @@ -138,6 +150,8 @@ void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, double threshold); +void 
swigluoai_and_mul(torch::Tensor& out, torch::Tensor& input, + double alpha = 1.702, double limit = 7.0); void gelu_new(torch::Tensor& out, torch::Tensor& input); @@ -145,22 +159,6 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input); void gelu_quick(torch::Tensor& out, torch::Tensor& input); -void advance_step_flashattn(int64_t num_seqs, int64_t num_queries, - int64_t block_size, torch::Tensor& input_tokens, - torch::Tensor& sampled_token_ids, - torch::Tensor& input_positions, - torch::Tensor& seq_lens, - torch::Tensor& slot_mapping, - torch::Tensor& block_tables); - -void advance_step_flashinfer( - int64_t num_seqs, int64_t num_queries, int64_t block_size, - torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids, - torch::Tensor& input_positions, torch::Tensor& seq_lens, - torch::Tensor& slot_mapping, torch::Tensor& block_tables, - torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, - torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds); - void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache, @@ -170,15 +168,6 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope, torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor); #ifndef USE_ROCM -torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::vector& codebook_partition_sizes, - const std::optional& bias); - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, const torch::Tensor& codebooks, - const std::vector& codebook_partition_sizes); torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, torch::Tensor _zeros, @@ -252,6 +241,11 @@ void get_cutlass_moe_mm_data( const int64_t num_experts, const int64_t n, const int64_t k, const std::optional& blockscale_offsets); +void get_cutlass_moe_mm_problem_sizes( + const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, + const int64_t k, const std::optional& blockscale_offsets); + void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, @@ -287,6 +281,16 @@ void scaled_fp4_experts_quant( torch::Tensor const& input, torch::Tensor const& input_global_scale, torch::Tensor const& input_offset_by_experts, torch::Tensor const& output_scale_offset_by_experts); + +void per_token_group_quant_fp8(const torch::Tensor& input, + torch::Tensor& output_q, torch::Tensor& output_s, + int64_t group_size, double eps, double fp8_min, + double fp8_max, bool scale_ue8m0); + +void per_token_group_quant_int8(const torch::Tensor& input, + torch::Tensor& output_q, + torch::Tensor& output_s, int64_t group_size, + double eps, double int8_min, double int8_max); #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, @@ -326,6 +330,12 @@ void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, const std::optional& has_initial_state, const torch::Tensor& ssm_states, int64_t pad_slot_id); +torch::Tensor dynamic_4bit_int_moe_cpu( + torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights, + torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I, + int64_t I2, int64_t group_size, bool apply_router_weight_on_input, + int64_t activation_kind); + using fptr_t = int64_t; 
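// [Illustrative aside, not part of the diff] Rough host-side reference for the
// math conventionally behind per_token_group_quant_fp8 declared above: one
// scale per contiguous group of `group_size` elements, scale = max(|x|, eps) /
// fp8_max, with an optional UE8M0 mode that rounds the scale up to a power of
// two (the same rounding the silu_mul_fp8_quant_deep_gemm kernel later in this
// diff applies). This is an assumption-labelled sketch, not the kernel itself.
#include <algorithm>
#include <cmath>
#include <vector>
inline void per_group_quant_ref(const std::vector<float>& x, int group_size,
                                float eps, float fp8_min, float fp8_max,
                                bool scale_ue8m0, std::vector<float>& q,
                                std::vector<float>& s) {
  q.resize(x.size());
  s.resize(x.size() / group_size);
  for (size_t g = 0; g < s.size(); ++g) {
    float amax = eps;  // per-group absolute maximum, floored at eps
    for (int i = 0; i < group_size; ++i)
      amax = std::max(amax, std::fabs(x[g * group_size + i]));
    float scale = amax / fp8_max;
    if (scale_ue8m0) scale = std::exp2(std::ceil(std::log2(scale)));
    s[g] = scale;
    for (int i = 0; i < group_size; ++i)
      q[g * group_size + i] =
          std::clamp(x[g * group_size + i] / scale, fp8_min, fp8_max);
  }
}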
fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, torch::Tensor& rank_data, int64_t rank, @@ -345,6 +355,8 @@ std::tuple allocate_shared_buffer_and_handle( int64_t open_mem_handle(torch::Tensor& mem_handle); void free_shared_buffer(int64_t buffer); +torch::Tensor hadacore_transform(torch::Tensor& x, bool inplace); + #ifdef USE_ROCM fptr_t init_custom_qr(int64_t rank, int64_t world_size, std::optional qr_max_size = std::nullopt); @@ -354,4 +366,4 @@ void qr_open_handles(fptr_t _fa, const std::vector& handles); void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, int64_t quant_level, bool cast_bf2half = false); int64_t qr_max_size(); -#endif \ No newline at end of file +#endif diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 266f2a0667a2..b5645b33b907 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -99,35 +99,6 @@ __global__ void rotary_embedding_kernel( token_idx, query_stride, key_stride, head_stride); } -template -__global__ void batched_rotary_embedding_kernel( - const int64_t* __restrict__ positions, // [batch_size, seq_len] or - // [num_tokens] - scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, - // head_size] or [num_tokens, num_heads, - // head_size] - scalar_t* __restrict__ key, // nullptr or - // [batch_size, seq_len, num_kv_heads, - // head_size] or [num_tokens, num_kv_heads, - // head_size] - const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // - // 2] - const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] - const int rot_dim, const int64_t query_stride, const int64_t key_stride, - const int64_t head_stride, const int num_heads, const int num_kv_heads, - const int head_size) { - // Each thread block is responsible for one token. - const int token_idx = blockIdx.x; - int64_t pos = positions[token_idx]; - int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; - const scalar_t* cache_ptr = - cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; - - apply_rotary_embedding( - query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, - token_idx, query_stride, key_stride, head_stride); -} - } // namespace vllm void rotary_embedding( @@ -211,96 +182,3 @@ void rotary_embedding( } }); } - -/* -Batched version of rotary embedding, pack multiple LoRAs together -and process in batched manner. 
-*/ -void batched_rotary_embedding( - torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] - torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or - // [num_tokens, num_heads * head_size] or - // [batch_size, seq_len, num_heads, head_size] or - // [num_tokens, num_heads, head_size] - std::optional - key, // null or - // [batch_size, seq_len, num_kv_heads * head_size] or - // [num_tokens, num_kv_heads * head_size] or - // [batch_size, seq_len, num_heads, head_size] or - // [num_tokens, num_heads, head_size] - int64_t head_size, - torch::Tensor& cos_sin_cache, // [max_position, rot_dim] - bool is_neox, int64_t rot_dim, - torch::Tensor& cos_sin_cache_offsets // [num_tokens] or [batch_size] -) { - // num_tokens = batch_size * seq_len - int64_t num_tokens = cos_sin_cache_offsets.size(0); - TORCH_CHECK( - positions.size(0) == num_tokens || positions.numel() == num_tokens, - "positions must have the same num_tokens or batch_size as " - "cos_sin_cache_offsets"); - - int positions_ndim = positions.dim(); - // Make sure num_tokens dim is consistent across positions, query, and key - TORCH_CHECK( - positions_ndim == 1 || positions_ndim == 2, - "positions must have shape [num_tokens] or [batch_size, seq_len]"); - if (positions_ndim == 1) { - TORCH_CHECK(query.size(0) == positions.size(0) && - (!key.has_value() || key->size(0) == positions.size(0)), - "query, key and positions must have the same number of tokens"); - } - if (positions_ndim == 2) { - TORCH_CHECK( - query.size(0) == positions.size(0) && - (!key.has_value() || key->size(0) == positions.size(0)) && - query.size(1) == positions.size(1) && - (!key.has_value() || key->size(1) == positions.size(1)), - "query, key and positions must have the same batch_size and seq_len"); - } - - // Make sure head_size is valid for query and key - int query_hidden_size = query.numel() / num_tokens; - int key_hidden_size = key.has_value() ? key->numel() / num_tokens : 0; - TORCH_CHECK(query_hidden_size % head_size == 0); - TORCH_CHECK(key_hidden_size % head_size == 0); - - // Make sure query and key have concistent number of heads - int num_heads = query_hidden_size / head_size; - int num_kv_heads = key.has_value() ? key_hidden_size / head_size : num_heads; - TORCH_CHECK(num_heads % num_kv_heads == 0); - - int seq_dim_idx = positions_ndim - 1; - int64_t query_stride = query.stride(seq_dim_idx); - int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0; - // Determine head stride: for [*, heads, head_size] use stride of last dim; - // for flat [*, heads*head_size], heads blocks are contiguous of size - // head_size - int query_ndim = query.dim(); - int64_t head_stride = - (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size; - - dim3 grid(num_tokens); - dim3 block(std::min(num_heads * rot_dim / 2, 512)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] { - if (is_neox) { - vllm::batched_rotary_embedding_kernel - <<>>( - positions.data_ptr(), query.data_ptr(), - key.has_value() ? key->data_ptr() : nullptr, - cos_sin_cache.data_ptr(), - cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, - key_stride, head_stride, num_heads, num_kv_heads, head_size); - } else { - vllm::batched_rotary_embedding_kernel - <<>>( - positions.data_ptr(), query.data_ptr(), - key.has_value() ? 
key->data_ptr() : nullptr, - cos_sin_cache.data_ptr(), - cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, - key_stride, head_stride, num_heads, num_kv_heads, head_size); - } - }); -} diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu deleted file mode 100644 index 3d5077d9de46..000000000000 --- a/csrc/prepare_inputs/advance_step.cu +++ /dev/null @@ -1,336 +0,0 @@ -/* - * The goal of this GPU kernel is to advance input tensors on the GPU directly - * PR: https://github.com/vllm-project/vllm/pull/6338 - * Current restrictions: - * 1. Specialized for DraftModelRunner - * 2. Supports flash_attn only - */ - -#include "advance_step.cuh" - -namespace prepare_inputs { - -// -template -__global__ void advance_step_flashattn_kernel( - int num_seqs, int num_queries, int block_size, long* input_tokens_ptr, - long const* sampled_token_ids_ptr, long* input_positions_ptr, - int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr, - int64_t const block_tables_stride) { - int const n_pad = num_seqs - num_queries; - if (n_pad && blockIdx.x == 0) { - // Handle cuda graph padding - int const offset = num_queries; - for (int i = threadIdx.x; i < n_pad; i += blockDim.x) { - input_tokens_ptr[offset + i] = 0; - input_positions_ptr[offset + i] = 0; - slot_mapping_ptr[offset + i] = -1; - } - } - - int num_query_blocks = div_ceil(num_queries, num_threads); - - if (blockIdx.x >= num_query_blocks) { - return; - } - - int cur_query_id = blockIdx.x * num_threads + threadIdx.x; - - if (cur_query_id >= num_queries) { - return; - } - - // Update input_tokens - input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; - - int seq_len = seq_lens_ptr[cur_query_id]; - int next_seq_len = seq_len + 1; - int next_input_pos = next_seq_len - 1; - - // Update seq_lens - seq_lens_ptr[cur_query_id] = next_seq_len; - // Update input_positions - input_positions_ptr[cur_query_id] = next_input_pos; - - int const* seq_block_tables_ptr = - block_tables_ptr + block_tables_stride * cur_query_id; - - int block_index = next_input_pos / block_size; - int block_offset = next_input_pos % block_size; - - int slot_num = seq_block_tables_ptr[block_index] * block_size + block_offset; - // Update slot_mapping - slot_mapping_ptr[cur_query_id] = slot_num; -} - -inline void verify_tensor(std::string const& name, torch::Tensor const& t, - int64_t const size_0, int64_t const size_1, - c10::ScalarType const type) { - bool size_0_cond = true; - if (size_0 != -1) { - size_0_cond = t.size(0) == size_0; - } - - bool size_1_cond = true; - if (size_1 != -1) { - size_1_cond = t.size(1) == size_1; - } - - bool is_contiguous = t.is_contiguous(); - bool same_type = t.dtype() == type; - - bool pass = size_0_cond && size_1_cond && is_contiguous && same_type; - if (!pass) { - TORCH_CHECK(false, "tensor: name = ", name, ", shape = ", t.sizes(), - " is_cont = ", t.is_contiguous(), ", type = ", t.dtype(), - " is not as expected: shape = [", size_0, ", ", size_1, - "], type = ", type); - } -} - -/// each thread processes a block per query -__global__ void advance_step_flashinfer_kernel( - int num_threads, int num_seqs, int num_queries, int block_size, - long* input_tokens_ptr, long const* sampled_token_ids_ptr, - long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr, - int const* block_tables_ptr, int64_t const block_tables_stride, - int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) { - int const n_pad = num_seqs - num_queries; - if (n_pad && blockIdx.x == 0) { - // Handle cuda 
graph padding - int const offset = num_queries; - for (int i = threadIdx.x; i < n_pad; i += blockDim.x) { - input_tokens_ptr[offset + i] = 0; - input_positions_ptr[offset + i] = 0; - slot_mapping_ptr[offset + i] = -1; - } - } - int num_query_blocks = div_ceil(num_queries, num_threads); - - if (blockIdx.x < num_query_blocks) { - int cur_query_id = blockIdx.x * num_threads + threadIdx.x; - - if (cur_query_id < num_queries) { - // Update input_tokens - input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id]; - - int seq_len = seq_lens_ptr[cur_query_id]; - int next_seq_len = seq_len + 1; - int next_input_pos = next_seq_len - 1; - - // Update seq_lens - seq_lens_ptr[cur_query_id] = next_seq_len; - // Update input_positions - input_positions_ptr[cur_query_id] = next_input_pos; - - int const* seq_block_tables_ptr = - block_tables_ptr + block_tables_stride * cur_query_id; - - int block_index = next_input_pos / block_size; - int block_offset = next_input_pos % block_size; - - // Update paged_kv_last_page_len - paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1; - - int slot_num = - seq_block_tables_ptr[block_index] * block_size + block_offset; - // Update slot_mapping - slot_mapping_ptr[cur_query_id] = slot_num; - block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size); - } - } -} - -__global__ void advance_step_flashinfer_indptr_kernel( - int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr, - int* block_table_bound_ptr) { - int idx = blockIdx.x * num_threads + threadIdx.x; - // Update paged_kv_indptr - if (idx == 0) { - paged_kv_indptr_ptr[idx] = 0; - } - if (idx < num_queries) { - int sum = 0; - for (int i = 0; i <= idx; ++i) { - sum += block_table_bound_ptr[i]; - } - paged_kv_indptr_ptr[idx + 1] = sum; - } -} - -__global__ void advance_step_flashinfer_indices_kernel( - int num_seqs, int num_queries, int const* block_tables_ptr, - int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr, - int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { - // note: max_num_blocks_per_seq = block_tables.stride(0) - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - // when cuda graphs are enabled, paged_kv_indptr tensor - // has to be updated for the padded queries - // tid represents a query# for paged_kv_indptr tensor - if (num_queries < tid && tid <= num_seqs) { - paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries]; - } - - // each thread processes a block_ptr in block_tables - // block_tables shape: [num_queries, max_num_blocks_per_seq] - // paged_kv_indices is flattened block_tables. 
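// [Illustrative aside, not part of the diff] The removed flashinfer helpers
// above produce a CSR-style view of block_tables: paged_kv_indptr is a prefix
// sum of block_table_bound, and paged_kv_indices packs the first `bound`
// entries of each query's block_tables row back-to-back. A host-side sketch of
// the same layout (names are illustrative):
#include <vector>
inline void build_paged_kv_csr(
    const std::vector<std::vector<int>>& block_tables,
    const std::vector<int>& block_table_bound,
    std::vector<int>& paged_kv_indptr, std::vector<int>& paged_kv_indices) {
  paged_kv_indptr.assign(1, 0);
  paged_kv_indices.clear();
  for (size_t q = 0; q < block_tables.size(); ++q) {
    for (int c = 0; c < block_table_bound[q]; ++c)
      paged_kv_indices.push_back(block_tables[q][c]);
    paged_kv_indptr.push_back(static_cast<int>(paged_kv_indices.size()));
  }
}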
- for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq); - idx += (gridDim.x * blockDim.x)) { - // block_tables-row = paged_kv_indptr[queryNum] - int queryNum = idx / max_num_blocks_per_seq; - int col = idx % max_num_blocks_per_seq; - if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) { - int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col; - int block_tables_idx = queryNum * max_num_blocks_per_seq + col; - paged_kv_indices_ptr[indices_arr_idx] = - block_tables_ptr[block_tables_idx]; - } - } -} - -void advance_step_flashattn(int num_seqs, int num_queries, int block_size, - torch::Tensor& input_tokens, // type: long - torch::Tensor& sampled_token_ids, // type: long - torch::Tensor& input_positions, // type: long - torch::Tensor& seq_lens, // type: int - torch::Tensor& slot_mapping, // type: long - torch::Tensor& block_tables) { // type: int - - if (logging) { - printf("advance_step_flashattn:\n"); - printf(" num_seqs = %d\n", num_seqs); - printf(" num_queries = %d\n", num_queries); - printf(" block_size = %d\n", block_size); - } - // Verify all tensors - verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong); - verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1, - at::kLong); - verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong); - verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt); - verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong); - verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt); - - int dev = sampled_token_ids.get_device(); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev); - - int blocks; - cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); - - advance_step_flashattn_kernel - <<>>( - num_seqs, num_queries, block_size, - reinterpret_cast(input_tokens.data_ptr()), - reinterpret_cast(sampled_token_ids.data_ptr()), - reinterpret_cast(input_positions.data_ptr()), - reinterpret_cast(seq_lens.data_ptr()), - reinterpret_cast(slot_mapping.data_ptr()), - reinterpret_cast(block_tables.data_ptr()), - block_tables.stride(0)); -} - -void advance_step_flashinfer( - int num_seqs, int num_queries, int block_size, - torch::Tensor& input_tokens, // type: long - torch::Tensor& sampled_token_ids, // type: long - torch::Tensor& input_positions, // type: long - torch::Tensor& seq_lens, // type: int - torch::Tensor& slot_mapping, // type: long - torch::Tensor& block_tables, // type: int - torch::Tensor& paged_kv_indices, // type: int - torch::Tensor& paged_kv_indptr, // type: int - torch::Tensor& paged_kv_last_page_len, // type: int - torch::Tensor& block_table_bound) { // type: int - - if (logging) { - printf("advance_step_flashinfer:\n"); - printf(" num_seqs = %d\n", num_seqs); - printf(" num_queries = %d\n", num_queries); - printf(" block_size = %d\n", block_size); - printf(" block_tables.stride(0) = %zu\n", block_tables.stride(0)); - } - // Verify all tensors - verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong); - // verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1, - // at::kLong); - verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong); - verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt); - verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong); - verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt); - - verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt); - verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, 
at::kInt); - verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1, - at::kInt); - - verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt); - - int dev = sampled_token_ids.get_device(); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev); - - int blocks; - int threads; - cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); - cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - - TORCH_CHECK((blocks * threads > num_queries), - "multi-step: not enough threads to map to num_queries = ", - num_queries, " block_tables.stride(0) = ", block_tables.stride(0), - " blocks = ", blocks, " max_threads = ", threads); - if (logging) { - printf("launching kernels with %d blocks and %d threads\n", blocks, - threads); - } - advance_step_flashinfer_kernel<<>>( - threads, num_seqs, num_queries, block_size, - reinterpret_cast(input_tokens.data_ptr()), - reinterpret_cast(sampled_token_ids.data_ptr()), - reinterpret_cast(input_positions.data_ptr()), - reinterpret_cast(seq_lens.data_ptr()), - reinterpret_cast(slot_mapping.data_ptr()), - reinterpret_cast(block_tables.data_ptr()), - block_tables.stride(0), - reinterpret_cast(paged_kv_last_page_len.data_ptr()), - reinterpret_cast(block_table_bound.data_ptr())); - - advance_step_flashinfer_indptr_kernel<<>>( - threads, num_seqs, num_queries, - reinterpret_cast(paged_kv_indptr.data_ptr()), - reinterpret_cast(block_table_bound.data_ptr())); - - advance_step_flashinfer_indices_kernel<<>>( - num_seqs, num_queries, - reinterpret_cast(block_tables.data_ptr()), - block_tables.stride(0), - reinterpret_cast(paged_kv_indices.data_ptr()), - reinterpret_cast(paged_kv_indptr.data_ptr()), - reinterpret_cast(block_table_bound.data_ptr())); -} - -} // namespace prepare_inputs - -void advance_step_flashattn(int64_t num_seqs, int64_t num_queries, - int64_t block_size, torch::Tensor& input_tokens, - torch::Tensor& sampled_token_ids, - torch::Tensor& input_positions, - torch::Tensor& seq_lens, - torch::Tensor& slot_mapping, - torch::Tensor& block_tables) { - prepare_inputs::advance_step_flashattn( - num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, - input_positions, seq_lens, slot_mapping, block_tables); -} - -void advance_step_flashinfer( - int64_t num_seqs, int64_t num_queries, int64_t block_size, - torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids, - torch::Tensor& input_positions, torch::Tensor& seq_lens, - torch::Tensor& slot_mapping, torch::Tensor& block_tables, - torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr, - torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) { - prepare_inputs::advance_step_flashinfer( - num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, - input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices, - paged_kv_indptr, paged_kv_last_page_len, block_table_bound); -} diff --git a/csrc/prepare_inputs/advance_step.cuh b/csrc/prepare_inputs/advance_step.cuh deleted file mode 100644 index f21574681b1a..000000000000 --- a/csrc/prepare_inputs/advance_step.cuh +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -namespace prepare_inputs { - -static constexpr int max_threads = 256; -static constexpr bool logging = false; - -constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; } - -} // namespace prepare_inputs diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu 
index 67e9149c1379..6fcd246f63c5 100644 --- a/csrc/quantization/activation_kernels.cu +++ b/csrc/quantization/activation_kernels.cu @@ -4,11 +4,36 @@ #include #include "core/math.hpp" -#include "cuda_compat.h" +#include "../cuda_compat.h" #include "dispatch_utils.h" -#include "quantization/fp8/common.cuh" +#include "quantization/w8a8/fp8/common.cuh" +#include + +#ifndef USE_ROCM + #include + #include + #include +#else + #include + #include + #include + +typedef __hip_bfloat162 __nv_bfloat162; +typedef __hip_bfloat16 __nv_bfloat16; +typedef __hip_bfloat16_raw __nv_bfloat16_raw; + #if defined(HIP_FP8_TYPE_OCP) +typedef __hip_fp8_e4m3 __nv_fp8_e4m3; +typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3; + #else +// ROCm 6.2 fallback: only *_fnuz types exist +typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3; +typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3; + #endif +#endif + +#include "core/registration.h" namespace vllm { template @@ -87,6 +112,429 @@ __global__ void act_and_mul_quant_kernel( } } } + +__device__ __forceinline__ float silu(float x) { + return __fdividef(x, (1.f + expf(-x))); +} + +__device__ __forceinline__ float2 silu2(float2 x) { + return make_float2(silu(x.x), silu(x.y)); +} + +__device__ __forceinline__ __nv_bfloat162 silu2_v2(float2 x) { +#ifndef USE_ROCM + return make_bfloat162(__float2bfloat16_rn(silu(x.x)), + __float2bfloat16_rn(silu(x.y))); +#else + return __float22bfloat162_rn(make_float2(silu(x.x), silu(x.y))); +#endif +} + +#ifndef USE_ROCM +__device__ __forceinline__ float warp_max(float v) { + static constexpr unsigned FULL_MASK = 0xffffffffu; + for (int offset = 1; offset < WARP_SIZE; offset *= 2) { + v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, offset)); + } + return v; +} + +__device__ __forceinline__ __nv_bfloat16 warp_max(__nv_bfloat16 v) { + static constexpr unsigned FULL_MASK = 0xffffffffu; + for (int offset = 1; offset < WARP_SIZE; offset *= 2) { + v = __hmax(v, __shfl_xor_sync(FULL_MASK, v, offset)); + } + return v; +} +#endif + +template +__device__ __forceinline__ void cp_async4(T* _smem_ptr, const U* _glob_ptr) { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + auto smem_ptr = reinterpret_cast(_smem_ptr); + auto glob_ptr = reinterpret_cast(_glob_ptr); + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " cp.async.cg.shared.global [%0], [%1], %2;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); +#else + _smem_ptr[0] = _glob_ptr[0]; +#endif +} + +__device__ __forceinline__ void cp_async_fence() { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.commit_group;\n" ::); +#else +#endif +} + +template +__device__ __forceinline__ void cp_async_wait() { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); +#else +#endif +} + +template <> +__device__ __forceinline__ void cp_async_wait<0>() { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_all;\n" ::); +#else +#endif +} + +__device__ __forceinline__ float clip(float v, float mmin, float mmax) { +#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800 + return fminf(mmax, fmaxf(v, mmin)); +#else +#endif +} + +__device__ __forceinline__ __nv_bfloat16 clip(__nv_bfloat16 v, + __nv_bfloat16 mmin, + __nv_bfloat16 mmax) { + return __hmin(mmax, __hmax(v, mmin)); +} + +__device__ __forceinline__ __nv_bfloat162 clip(__nv_bfloat162 v, + __nv_bfloat162 mmin, + __nv_bfloat162 mmax) { + return __hmin2(mmax, __hmax2(v, mmin)); +} + +// We use the 
following values for fp8 min/max: +// __nv_fp8_e4m3 = (-448, +448) +// __nv_fp8_e4m3uz = (-240.0, +240.0) +// It is currently assumed that only +template +constexpr __nv_bfloat16 get_fp8_max() { + static_assert(std::is_same_v || + std::is_same_v); + if constexpr (std::is_same_v) { + return __nv_bfloat16(__nv_bfloat16_raw{.x = 17376}); + } else { + return __nv_bfloat16(__nv_bfloat16_raw{.x = 17264}); + } +} + +template +constexpr __nv_bfloat16 get_fp8_min() { + static_assert(std::is_same_v || + std::is_same_v); + if constexpr (std::is_same_v) { + return __nv_bfloat16(__nv_bfloat16_raw{.x = 50144}); + } else { + return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032}); + } +} + +template +__device__ __forceinline__ int warp_expert_search( + int idx, int n, const Idx_t* __restrict__ input, Idx_t val) { + const Idx_t* input_ptr = input + idx; + int base_offset = 0; + + for (;;) { + bool move_on = (idx < n && *input_ptr <= val); + + unsigned mask = __ballot_sync(0xffffffff, move_on); + + if (mask != 0xffffffffu) { + int last_lane = 31 - __clz(mask); + return base_offset + last_lane; + } + + input_ptr += 32; + base_offset += 32; + idx += 32; + } +} + +template +__device__ __forceinline__ void token_bounds(int32_t n_tokens, + int32_t worker_id, + int32_t& n_tokens_lower, + int32_t& n_tokens_upper) { + if (n_tokens < num_parallel_tokens && worker_id < n_tokens) { + if (worker_id >= num_parallel_tokens) return; + n_tokens_lower = worker_id; + n_tokens_upper = worker_id + 1; + } else { + int32_t chunk_size = n_tokens / num_parallel_tokens; + int32_t residual = n_tokens - chunk_size * num_parallel_tokens; + auto calc_id = [&](int32_t id) { + if (id < residual) + return min(n_tokens, id * (chunk_size + 1)); + else + return min(n_tokens, id * chunk_size + residual); + }; + n_tokens_lower = calc_id(worker_id); + n_tokens_upper = calc_id(worker_id + 1); + } +} + +template +__global__ void silu_mul_fp8_quant_deep_gemm_kernel( + const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q, + float* __restrict__ _y_s, const int32_t* __restrict__ tokens_per_expert, + // sizes + Idx_t E, Idx_t T, Idx_t H, + // strides (in elements) + Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e, + Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t, + Idx_t stride_ys_g, Idx_t stride_counts_e) { +#ifndef USE_ROCM + static constexpr int NUM_WARPS = THREADS / WARP_SIZE; + + static constexpr int LOAD_STAGE_SIZE = 2 * GROUP_SIZE / 8; + static constexpr int LOAD_STAGE_MOD = NUM_STAGES * LOAD_STAGE_SIZE; + + static constexpr int COMPUTE_STAGE_SIZE = 2 * GROUP_SIZE / 4; + static constexpr int COMPUTE_STAGE_MOD = COMPUTE_STAGE_SIZE * NUM_STAGES; + + extern __shared__ __align__(16) __int128_t smem_128[]; + + int* s_expert_offsets = + reinterpret_cast(smem_128 + (SMEM_SIZE_BYTES_Y / 16)); + + static constexpr __nv_bfloat16 fp8_min = get_fp8_min(); + static constexpr __nv_bfloat16 fp8_max = get_fp8_max(); + // We assign EPS with it's 16-bit unsigned counterpart to allow constexpr. + static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996}); + int tid = threadIdx.x; + int warp_id = tid >> 5; + int lane_id = tid & 0x1f; + + int running_sum{}; + if (!warp_id) { + for (int i = 0; i < E; i += WARP_SIZE) { + bool valid = (i + threadIdx.x) < E; + int value = + (valid ? tokens_per_expert[i + threadIdx.x * stride_counts_e] : 0) + + (!lane_id ? 
running_sum : 0); + + for (int offset = 1; offset < 32; offset *= 2) { + int n = __shfl_up_sync(0xFFFFFFFFu, value, offset); + if (lane_id >= offset) value += n; + } + + if (valid) { + s_expert_offsets[i + threadIdx.x + 1] = value; + } + + running_sum = __shfl_sync(0xFFFFFFFFu, value, WARP_SIZE - 1); + } + + if (!lane_id) { + s_expert_offsets[0] = 0; + } + } + + __syncthreads(); + + int32_t total_tokens = s_expert_offsets[E]; + + const int warp_position_yq = warp_id * (H / NUM_WARPS); + const int warp_position_scales = warp_id * (H / (GROUP_SIZE * NUM_WARPS)); + + // A single block will handle tokens_per_block tokens. + // Each block i iterates over tokens of a slice of n_tokens = + // expert_counts[i], with the size of chunk being + // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of + // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling. + + // Each warp will get space to store its hidden dim for gate and up. + __int128_t* s_hidden_load = smem_128 + warp_id * ((2 * 128 / 8) * NUM_STAGES); + __int128_t* smem_load_ptr = s_hidden_load + lane_id; + + const __nv_bfloat16 fp8_inv = __hdiv(__float2bfloat16(1.f), fp8_max); + + int32_t compute_pipeline_offset_64 = 0; + int32_t load_stage_offset{}; + const __nv_bfloat16 one_bf16 = __float2bfloat16_rn(1.f); + + __int64_t* smem_compute_ptr = reinterpret_cast<__int64_t*>(smem_128) + + warp_id * (2 * (GROUP_SIZE / 4) * NUM_STAGES) + + lane_id; + __int64_t* s_gate64_ptr = smem_compute_ptr; + __int64_t* s_up64_ptr = smem_compute_ptr + GROUP_SIZE / 4; + + int tokens_lower, tokens_upper; + + token_bounds(total_tokens, blockIdx.x, tokens_lower, + tokens_upper); + + Idx_t expert_id{}, expert_offset{}, next_expert_offset{}; + int token_id = tokens_lower; + int32_t t_load{}; + + if (token_id < tokens_upper) { + expert_id = warp_expert_search(lane_id, E, s_expert_offsets, token_id); + expert_offset = s_expert_offsets[expert_id]; + next_expert_offset = s_expert_offsets[expert_id + 1]; + } else { + // This thread block has no work to do. + return; + } + + int t_load_bound = H / (GROUP_SIZE * NUM_WARPS); + + Idx_t base_i = ((expert_id * stride_i_e) / 8) + + (token_id - expert_offset) * stride_i_t / 8; + const Idx_t gate_warp_offset = + warp_id * ((stride_i_h * H) / (8 * NUM_WARPS)) + (lane_id & 0b1111); + + const __int128_t* input_128_ptr = + reinterpret_cast(_input) + gate_warp_offset + + ((lane_id < 16) ? 0 : ((H * stride_i_h) / 8)); + __int128_t* load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i); + + auto token_offset = token_id - expert_offset; + + auto load_and_advance_y_pred = [&] { + if (t_load < t_load_bound) { + // Here we are simply continuing to load data + // from the current token. + auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset; + + // It is very important that LOAD_STAGE_SIZE is constexpr to avoid + // unnecessary ALU ops. + load_stage_offset += LOAD_STAGE_SIZE; + load_stage_offset %= LOAD_STAGE_MOD; + + cp_async4(smem_load_ptr_staged, load_ptr); + load_ptr += GROUP_SIZE / 8; + ++t_load; + } else if (token_id + 1 < tokens_upper) { + // We loaded everything from the current token, let's move on + // to the next one, and we checked that we have more tokens to load. + ++token_id; + t_load = 0; + if (token_id >= next_expert_offset) { + // We need to find the next expert. + do { + // This is a loop because it's possible + // that some experts are assigned 0 tokens. + // NOTE: We are guaranteed that there's at least + // one more token left so we don't have to check for + // expert_id bounds. 
+ ++expert_id; + // This skips 1 memory read. + expert_offset = next_expert_offset; + next_expert_offset = s_expert_offsets[expert_id + 1]; + } while (next_expert_offset == expert_offset); + + base_i = expert_id * (stride_i_e / 8); + token_offset = 0; + load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i); + } else { + // We remain within the same expert, so just + // move by H/4 __int128_t (2 * H/8). + base_i += stride_yq_t / 4; + token_offset++; + } + + load_ptr = const_cast<__int128_t*>(input_128_ptr + base_i); + + auto smem_load_ptr_staged = smem_load_ptr + load_stage_offset; + + // It is very important that LOAD_STAGE_SIZE is constexpr to avoid + // unnecessary ALU ops. + load_stage_offset += LOAD_STAGE_SIZE; + load_stage_offset %= LOAD_STAGE_MOD; + + cp_async4(smem_load_ptr_staged, load_ptr); + load_ptr += GROUP_SIZE / 8; + ++t_load; + } + // We fence even if there is nothing to load to simplify pipelining. + cp_async_fence(); + }; + + // We need to warm-up the pipeline. + #pragma unroll + for (int i = 0; i < NUM_STAGES - 1; i++) { + load_and_advance_y_pred(); + } + + __nv_fp8x4_e4m3* y_q_base_ptr = + reinterpret_cast<__nv_fp8x4_e4m3*>(_y_q) + lane_id; + auto y_scale_base_ptr = _y_s + warp_position_scales * stride_ys_g; + + for (auto j = tokens_lower; j < tokens_upper; j++) { + const Idx_t base_ys = expert_id * stride_ys_e; + auto y_s_ptr = y_scale_base_ptr + base_ys + token_offset * stride_ys_t; + __nv_fp8x4_e4m3* y_q_ptr = + y_q_base_ptr + (expert_id * stride_yq_e + token_offset * stride_yq_t + + warp_position_yq * stride_yq_h) / + 4; + const int COMPUTE_LIMIT = H / (GROUP_SIZE * NUM_WARPS); + + for (int i = 0; i < COMPUTE_LIMIT; i++) { + cp_async_wait(); + __syncthreads(); + load_and_advance_y_pred(); + + __int64_t* gate64_ptr = s_gate64_ptr + compute_pipeline_offset_64; + __int64_t* up64_ptr = s_up64_ptr + compute_pipeline_offset_64; + + // COMPUTE_STAGE_SIZE/MOD must also be constexpr! + compute_pipeline_offset_64 += COMPUTE_STAGE_SIZE; + compute_pipeline_offset_64 %= COMPUTE_STAGE_MOD; + + __int64_t gate64 = *gate64_ptr; + __int64_t up64 = *up64_ptr; + + // Compute + __nv_bfloat162 res[2]; + __nv_bfloat162* s_up_comp = reinterpret_cast<__nv_bfloat162*>(&up64); + __nv_bfloat162* s_gate_comp = reinterpret_cast<__nv_bfloat162*>(&gate64); + + #pragma unroll + for (int32_t k = 0; k < 2; ++k) { + __nv_bfloat162 gate = silu2_v2(__bfloat1622float2(s_gate_comp[k])); + res[k] = __hmul2(gate, s_up_comp[k]); + } + + auto _y_max2 = __hmax2(__habs2(res[0]), __habs2(res[1])); + + _y_max2.x = __hmax(__hmax(_y_max2.x, _y_max2.y), EPS); + + __nv_bfloat16 y_s = __hmul(warp_max(_y_max2.x), fp8_inv); + + if constexpr (USE_UE8M0) { + y_s = hexp2(hceil(hlog2(y_s))); + } + + __nv_bfloat16 inv_y = __hdiv(one_bf16, y_s); + + auto y_s2 = make_bfloat162(inv_y, inv_y); + + #pragma unroll + for (int32_t k = 0; k < 2; ++k) { + res[k] = clip(__hmul2(res[k], y_s2), __bfloat162bfloat162(fp8_min), + __bfloat162bfloat162(fp8_max)); + } + + *y_q_ptr = __nv_fp8x4_e4m3(res[0], res[1]); + y_q_ptr += WARP_SIZE * stride_yq_h; + + if (!lane_id) { + *y_s_ptr = y_s; + y_s_ptr += stride_ys_g; + } + } + } +#endif +} + } // namespace vllm // Launch activation, gating, and quantize kernel. 
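// ---------------------------------------------------------------------------
// Editor's note: illustrative usage sketch, not part of the patch above or
// below. It shows how the new persistent_masked_m_silu_mul_quant entry point
// added in the next hunk could be driven from host C++ code. The sizes
// (E, T, H), the int32 dtype for tokens_per_expert, and all variable names are
// assumptions for illustration only; the tensor shapes, the bf16 input /
// fp8 e4m3 output / fp32 scale dtypes, and the fixed group size of 128 are
// taken from the function's documented arguments and TORCH_CHECKs.
// ---------------------------------------------------------------------------
#include <ATen/ATen.h>
#include <torch/torch.h>

// Forward declaration of the entry point added in the hunk below.
void persistent_masked_m_silu_mul_quant(const at::Tensor& input,
                                        const at::Tensor& tokens_per_expert,
                                        at::Tensor& y_q, at::Tensor& y_s,
                                        bool use_ue8m0);

void example_silu_mul_quant_call() {
  const int64_t E = 8, T = 64, H = 2048;  // experts, tokens per expert, hidden
  auto opts = torch::TensorOptions().device(torch::kCUDA);

  // Gate and up projections packed along the last dim: (E, T, 2*H), bf16.
  at::Tensor input = torch::randn({E, T, 2 * H}, opts.dtype(torch::kBFloat16));
  // Number of valid tokens per expert; int32 assumed here.
  at::Tensor tokens_per_expert = torch::full({E}, T, opts.dtype(torch::kInt32));
  // Quantized activations and per-group scales (group size 128).
  at::Tensor y_q = torch::empty({E, T, H}, opts.dtype(torch::kFloat8_e4m3fn));
  at::Tensor y_s = torch::empty({E, T, H / 128}, opts.dtype(torch::kFloat32));

  persistent_masked_m_silu_mul_quant(input, tokens_per_expert, y_q, y_s,
                                     /*use_ue8m0=*/false);
}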
@@ -119,3 +567,86 @@ void silu_and_mul_quant(torch::Tensor& out, // [..., d] TORCH_CHECK(input.size(-1) % 2 == 0); LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); } + +void persistent_masked_m_silu_mul_quant( + const at::Tensor& input, // (E, T, 2*H) + const at::Tensor& tokens_per_expert, // (E) + at::Tensor& y_q, // (E, T, H) [OUT] + at::Tensor& y_s, // (E, T, H//group_size) [OUT] + bool use_ue8m0) { +#ifndef USE_ROCM + + // This kernel currently only supports H % 128 == 0 and assumes a + // fixed GROUP_SIZE of 128. + TORCH_CHECK(input.dtype() == torch::kBFloat16); + TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn || + y_q.dtype() == torch::kFloat8_e4m3fnuz); + TORCH_CHECK(y_s.dtype() == torch::kFloat32); + TORCH_CHECK(input.size(-1) % 256 == 0); + + using Idx_t = int64_t; + + Idx_t E = input.size(0); + Idx_t T = input.size(1); + Idx_t H = input.size(2) / 2; + Idx_t stride_i_e = input.stride(0); + Idx_t stride_i_t = input.stride(1); + Idx_t stride_i_h = input.stride(2); + Idx_t stride_yq_e = y_q.stride(0); + Idx_t stride_yq_t = y_q.stride(1); + Idx_t stride_yq_h = y_q.stride(2); + Idx_t stride_ys_e = y_s.stride(0); + Idx_t stride_ys_t = y_s.stride(1); + Idx_t stride_ys_g = y_s.stride(2); + + Idx_t stride_counts_e = tokens_per_expert.stride(0); + + static constexpr int GROUP_SIZE = 128; + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + #define KERNEL(BLOCK_COUNT, USE_UE8M0, THREAD_COUNT, STAGES) \ + static constexpr int NUM_WARPS = THREAD_COUNT / WARP_SIZE; \ + int sms = SILU_V2_BLOCK_COUNT; \ + static constexpr int max_shared_mem_bytes = \ + GROUP_SIZE * 2 * STAGES * NUM_WARPS * 2; \ + dim3 grid(sms), block(THREAD_COUNT); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + VLLM_DISPATCH_FP8_TYPES( \ + y_q.scalar_type(), "silu_mul_fp8_quant_deep_gemm_kernel", [&] { \ + vllm::silu_mul_fp8_quant_deep_gemm_kernel< \ + BLOCK_COUNT, max_shared_mem_bytes, fp8_t, THREAD_COUNT, Idx_t, \ + USE_UE8M0, GROUP_SIZE, STAGES> \ + <<>>( \ + reinterpret_cast<__nv_bfloat16*>(input.data_ptr()), \ + (fp8_t*)y_q.data_ptr(), y_s.data_ptr(), \ + reinterpret_cast(tokens_per_expert.data_ptr()), E, \ + T, H, stride_i_e, stride_i_t, stride_i_h, stride_yq_e, \ + stride_yq_t, stride_yq_h, stride_ys_e, stride_ys_t, \ + stride_ys_g, stride_counts_e); \ + }); + + static constexpr int SILU_V2_BLOCK_COUNT = 132 * 32; + + if (!use_ue8m0) { + if (H >= 4096) { + static constexpr int NUM_STAGES = 4; + static constexpr int THREAD_COUNT = 256; + KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, NUM_STAGES); + } else { + static constexpr int THREAD_COUNT = 32; + KERNEL(SILU_V2_BLOCK_COUNT, false, THREAD_COUNT, 2); + } + } else { + if (H >= 4096) { + static constexpr int NUM_STAGES = 4; + static constexpr int THREAD_COUNT = 256; + KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, NUM_STAGES); + } else { + static constexpr int THREAD_COUNT = 32; + KERNEL(SILU_V2_BLOCK_COUNT, true, THREAD_COUNT, 2); + } + } + +#endif +} diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu deleted file mode 100644 index 79cd2c610b3c..000000000000 --- a/csrc/quantization/aqlm/gemm_kernels.cu +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Modified by Neural Magic - * Adapted from https://github.com/Vahe1994/AQLM - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace vllm { -namespace aqlm { - -__global__ void Code1x16MatVec( - const int4* __restrict__ A, const int4* __restrict__ B, - int4* __restrict__ C, const int4* __restrict__ codebook, const int prob_m, - const int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - int b_gl_rd = 0; - int c_gl_wr = a_gl_rd; - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - - __shared__ int4 sh_b[32 * 9]; - float res = 0; - - int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); - while (iters--) { - // We pad shared memory to avoid bank conflicts during reads - __syncthreads(); - for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; - } - __syncthreads(); - b_gl_rd += 32 * 8; - - int b_sh_rd = 9 * (threadIdx.x % 32); - if (pred && a_gl_rd < a_gl_end) { - const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - uint32_t dec[4]; - // We bypass the L1 cache to avoid massive amounts of memory streaming - // that doesn't actually help us; this brings > 2x speedup. - asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*)&codebook[enc[i]])); - half2* a = reinterpret_cast(&dec); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); - half2 res2 = {}; -#pragma unroll - for (int j = 0; j < 4; j++) res2 = __hfma2(a[j], b[j], res2); - res += __half2float(res2.x) + __half2float(res2.y); - b_sh_rd++; - } - a_gl_rd += 32; - } - } - - if (pred) { -#pragma unroll - for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); - if (threadIdx.x % 32 == 0) - reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); - } -} - -__global__ void Code2x8MatVec( - const int4* __restrict__ A, const int4* __restrict__ B, - int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m, - int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. - -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. 
- auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - int b_gl_rd = 0; - int c_gl_wr = a_gl_rd; - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - int lane = threadIdx.x % 8; - - extern __shared__ int4 sh[]; - int4* sh_b = sh; - int4* sh_code = sh_b + 32 * 9; - int4* sh_code0 = sh_code; - int4* sh_code1 = sh_code + 256 * 8; - - for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { - int4 dec = codebook[i]; -#pragma unroll - for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; - } - __syncthreads(); - - float res = 0; - - int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32); - while (iters--) { - // We pad shared memory to avoid bank conflicts during reads - __syncthreads(); - for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) { - if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i]; - } - __syncthreads(); - b_gl_rd += 32 * 8; - - int b_sh_rd = 9 * (threadIdx.x % 32); - if (pred && a_gl_rd < a_gl_end) { - const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - half2* a0 = - reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = - reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); - half2* b = reinterpret_cast(&sh_b[b_sh_rd]); - half2 res2 = {}; -#pragma unroll - for (int j = 0; j < 4; j++) - res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2); - res += __half2float(res2.x) + __half2float(res2.y); - b_sh_rd++; - } - a_gl_rd += 32; - } - } - - if (pred) { -#pragma unroll - for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i); - if (threadIdx.x % 32 == 0) - reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res); - } -} - -__global__ void Code1x16Dequant( - const int4* __restrict__ A, int4* __restrict__ C, - const int4* __restrict__ codebook, int prob_m, int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long, sums to m. - const int codebook_stride // as int4 -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - - int c_gl_stride = prob_k / 8; - int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; - - int iters = (prob_k / 8 - 1) / (8 * 32) + 1; - while (iters--) { - if (pred && a_gl_rd < a_gl_end) { - const uint16_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - int4 chunk; - auto dec = reinterpret_cast(&chunk); - // We bypass the L1 cache to avoid massive amounts of memory streaming - // that doesn't actually help us; this brings > 2x speedup. 
- asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];" - : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3]) - : "l"((void*)&codebook[enc[i]])); - - C[a_gl_rd * 8 + i] = chunk; - } - } - a_gl_rd += 32; - } -} - -__global__ void Code2x8Dequant( - const int4* __restrict__ A, int4* __restrict__ C, - const int4* __restrict__ codebook, int prob_m, int prob_k, - const int4 - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long, corresponds to cols. - const int codebook_stride // as int4 -) { - int a_gl_stride = prob_k / 8 / 8; - int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - bool pred = a_gl_rd < prob_m; - - if (pred) { - // advance to the correct codebook, this easy because we only multiply one - // column of the codebook. - auto codebook_size = &codebook_a_sizes.x; - while (a_gl_rd >= *codebook_size) { - codebook += codebook_stride; - ++codebook_size; - } - } - - a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32; - int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32; - int lane = threadIdx.x % 8; - - int c_gl_stride = prob_k / 8; - int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32); - c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8; - - extern __shared__ int4 sh[]; - int4* sh_code = sh; - int4* sh_code0 = sh_code; - int4* sh_code1 = sh_code + 256 * 8; - - for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) { - int4 dec = codebook[i]; -#pragma unroll - for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec; - } - __syncthreads(); - - int iters = (prob_k / 8 - 1) / (8 * 32) + 1; - while (iters--) { - if (pred && a_gl_rd < a_gl_end) { - const uint8_t* enc = reinterpret_cast(&A[a_gl_rd]); -#pragma unroll - for (int i = 0; i < 8; i++) { - int4 chunk; - half2* a0 = - reinterpret_cast(&sh_code0[8 * enc[2 * i + 0] + lane]); - half2* a1 = - reinterpret_cast(&sh_code1[8 * enc[2 * i + 1] + lane]); -#pragma unroll - for (int j = 0; j < 4; j++) - reinterpret_cast(&chunk)[j] = __hadd2(a0[j], a1[j]); - C[a_gl_rd * 8 + i] = chunk; - } - } - a_gl_rd += 32; - } -} - -inline int ceildiv(int a, int b) { return (a + b - 1) / b; } - -const int THREAD_M = 16; - -void code1x16_matvec_cuda(const void* __restrict__ A, - const void* __restrict__ B, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, - int prob_k, const int4 codebook_a_sizes, - const int codebook_stride) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16MatVec<<>>( - (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, - prob_k, codebook_a_sizes, codebook_stride); -} - -void code2x8_matvec_cuda(const void* __restrict__ A, const void* __restrict__ B, - void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, - int prob_k, const int4 codebook_a_sizes, - const int codebook_stride) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaFuncSetAttribute(Code2x8MatVec, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared); 
- cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code2x8MatVec<<>>( - (const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m, - prob_k, codebook_a_sizes, codebook_stride); -} - -void code1x16_dequant_cuda( - const void* __restrict__ A, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, int prob_k, - const int4 codebook_a_sizes, // cumulative sizes of A spanning each - // codebook, at most 3 long. - const int codebook_stride // as int4. -) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - Code1x16Dequant<<>>( - (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long. - codebook_stride // as int4. - ); -} - -// Dequantizes the code and codebook into weights. -void code2x8_dequant_cuda( - const void* __restrict__ A, void* __restrict__ C, - const void* __restrict__ codebook, int prob_m, int prob_k, - const int4 - codebook_a_sizes, // cumulative sizes of A spanning each codebook, at - // most 3 long, corresponds to cols. - const int codebook_stride // as int4 -) { - int sms; - cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0); - int waves = 0; - int thread_m; - do { - waves++; - thread_m = ceildiv(prob_m, waves * sms); - } while (thread_m > THREAD_M); - - int blocks = ceildiv(prob_m, thread_m); - int threads = 32 * thread_m; - int shared = 16 * (2 * 256 * 8 + 32 * 9); - cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); - - cudaFuncSetAttribute(Code2x8Dequant, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared); - Code2x8Dequant<<>>( - (const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k, - codebook_a_sizes, codebook_stride); -} - -int codebook_stride(const torch::Tensor& codebooks) { - return codebooks.stride(0) * codebooks.element_size() / sizeof(int4); -} - -void code1x16_matvec( - const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C, - const torch::Tensor& codebook, - const int4 codebook_a_sizes // cumulative sizes of A spanning each - // codebook, at most 3 long. 
-) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - - code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), - codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, - codebook_stride(codebook)); -} - -torch::Tensor code1x16_matmat(const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty( - {flat_input.size(0), out_features}, - torch::TensorOptions().dtype(input.dtype()).device(input.device())); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code1x16_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, - codebook_a_sizes); - } - flat_output *= scales.flatten().unsqueeze(0); - - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -void code2x8_matvec(const torch::Tensor& A, const torch::Tensor& B, - torch::Tensor& C, const torch::Tensor& codebook, - const int4 codebook_a_sizes) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); - int prob_m = C.size(0); - int prob_k = B.size(0); - code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(), - codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes, - 2 * codebook_stride(codebook)); -} - -torch::Tensor code2x8_matmat(const torch::Tensor& input, - const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const int4 codebook_a_sizes, - const std::optional& bias) { - auto input_sizes = input.sizes(); - auto out_features = codes.size(0) * codebooks.size(2); - auto flat_input = input.reshape({-1, input.size(-1)}); - auto flat_output = torch::empty( - {flat_input.size(0), out_features}, - torch::TensorOptions().dtype(input.dtype()).device(input.device())); - - for (int i = 0; i < flat_input.size(0); ++i) { - auto input_vec = flat_input.index({i}); - auto output_vec = flat_output.index({i}); - code2x8_matvec(codes.squeeze(2), input_vec, output_vec, codebooks, - codebook_a_sizes); - } - flat_output *= scales.flatten().unsqueeze(0); - if (bias.has_value()) { - flat_output += bias->unsqueeze(0); - } - - auto output_sizes = input_sizes.vec(); - output_sizes.pop_back(); - output_sizes.push_back(-1); - auto output = flat_output.reshape(output_sizes); - return output; -} - -// Accumulate the partition sizes. -int4 accumulate_sizes(const std::vector& codebook_partition_sizes) { - int4 cumulative_sizes; - auto cumulative_size = &cumulative_sizes.x; - size_t i = 0; - int last = 0; - assert(codebook_partition_sizes.size() <= 4); - for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) { - *cumulative_size = codebook_partition_sizes[i] + last; - last = *cumulative_size; - } - // fill in the rest with unreachable. 
- for (; i < 4; ++i, ++cumulative_size) { - *cumulative_size = last * 10; - } - return cumulative_sizes; -} - -} // namespace aqlm -} // namespace vllm - -torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, - const torch::Tensor& codebooks, - const torch::Tensor& scales, - const std::vector& codebook_partition_sizes, - const std::optional& bias) { - int4 cumulative_sizes = - vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); - int const entries = codebooks.size(1); - - if (nbooks == 1 && entries == (1 << 16)) { - return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, - cumulative_sizes, bias); - } - if (nbooks == 2 && entries == (1 << 8)) { - return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, - cumulative_sizes, bias); - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, - " entries is not currently supported.") - return {}; -} - -torch::Tensor aqlm_dequant( - const torch::Tensor& codes, const torch::Tensor& codebooks, - const std::vector& codebook_partition_sizes) { - int4 cumulative_sizes = - vllm::aqlm::accumulate_sizes(codebook_partition_sizes); - - int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(); - int const entries = codebooks.size(1); - - const at::cuda::OptionalCUDAGuard device_guard(device_of(codes)); - int rows = codes.size(1); - int cols = codes.size(0); - - auto in_features = codes.size(1) * 8; - auto out_features = codes.size(0); - - assert(out_features == std::accumulate(codebook_partition_sizes.begin(), - codebook_partition_sizes.end(), 0)); - - auto weights = torch::empty({out_features, in_features}, - torch::TensorOptions() - .dtype(codebooks.dtype()) - .device(codebooks.device())); - - if (nbooks == 1 && entries == (1 << 16)) { - vllm::aqlm::code1x16_dequant_cuda(codes.data_ptr(), weights.data_ptr(), - codebooks.data_ptr(), out_features, - in_features, cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower - // and not consistent with gemv implementation.) 
weights *= - // scales.index({"...", 0, 0}); - - return weights; - } - - if (nbooks == 2 && entries == (1 << 8)) { - vllm::aqlm::code2x8_dequant_cuda(codes.data_ptr(), weights.data_ptr(), - codebooks.data_ptr(), out_features, - in_features, cumulative_sizes, - vllm::aqlm::codebook_stride(codebooks)); - - // if you wanted to flip to scaling the weights, (though it's 30%-ish slower - // and not consistent with gemv implementation) weights *= - // scales.index({"...", 0, 0}); - - return weights; - } - - TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, - " entries is not currently supported.") - return {}; -} diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu deleted file mode 100644 index 5cd2ac179768..000000000000 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ /dev/null @@ -1,338 +0,0 @@ -#include -#include - -#include - -#include "../../dispatch_utils.h" -#include "../vectorization_utils.cuh" - -#ifndef USE_ROCM - #include - #include -#else - #include - #include -#endif - -static inline __device__ int8_t float_to_int8_rn(float x) { -#ifdef USE_ROCM - static constexpr auto i8_min = - static_cast(std::numeric_limits::min()); - static constexpr auto i8_max = - static_cast(std::numeric_limits::max()); - - // To match the rounding mode of CUDA, we use nearbyint. - // It uses the current rounding mode, which is always FE_TONEAREST on HIP. - // If that changes in the future, we may need to set the rounding mode - // explicitly, either at runtime or compile time. - float dst = std::nearbyint(x); - - // saturate - - // See https://github.com/pytorch/pytorch/issues/127666 - // See https://github.com/llvm/llvm-project/issues/95183 - // hip-clang std::clamp __glibcxx_assert_fail host function when building on - // Arch/gcc14. The following replaces std::clamp usage with similar logic - // dst = std::clamp(dst, i8_min, i8_max); - dst = (dst < i8_min) ? i8_min : (dst > i8_max) ? i8_max : dst; - return static_cast(dst); -#else - // CUDA path - uint32_t dst; - asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x)); - return reinterpret_cast(dst); -#endif -} - -static inline __device__ int32_t float_to_int32_rn(float x) { -#ifdef USE_ROCM - // int32_max is not exactly representable as float. - // Therefore, we need to be careful and manually return int32_max on overflow. - // For symmetry, we also do the same for int32_min, even though it is exactly - // representable as float and the conversion should be exact. - static constexpr auto i32_min = std::numeric_limits::min(); - static constexpr auto i32_min_f = static_cast(i32_min); - static constexpr auto i32_max = std::numeric_limits::max(); - static constexpr auto i32_max_f = static_cast(i32_max); - - // To match the rounding mode of CUDA, we use nearbyint. - // It uses the current rounding mode, which is always FE_TONEAREST on HIP. - // If that changes in the future, we may need to set the rounding mode - // explicitly, either at runtime or compile time. - float dst = std::nearbyint(x); - - // saturate on the higher end. - if (dst >= i32_max_f) { - return i32_max; - } - // saturate on the lower end. 
- if (dst <= i32_min_f) { - return i32_min; - } - - return static_cast(dst); -#else - // CUDA path - uint32_t dst; - asm volatile("cvt.rni.sat.s32.f32 %0, %1;" : "=r"(dst) : "f"(x)); - return reinterpret_cast(dst); -#endif -} - -static inline __device__ int8_t int32_to_int8(int32_t x) { -#ifdef USE_ROCM - static constexpr auto i8_min = - static_cast(std::numeric_limits::min()); - static constexpr auto i8_max = - static_cast(std::numeric_limits::max()); - - // saturate - - // See https://github.com/pytorch/pytorch/issues/127666 - // See https://github.com/llvm/llvm-project/issues/95183 - // hip-clang std::clamp __glibcxx_assert_fail host function when building on - // Arch/gcc14. The following replaces std::clamp usage with similar logic - // int32_t dst = std::clamp(x, i8_min, i8_max); - int32_t dst = (x < i8_min) ? i8_min : (x > i8_max) ? i8_max : x; - return static_cast(dst); -#else - // CUDA path - uint32_t dst; - asm volatile("cvt.sat.s8.s32 %0, %1;" : "=r"(dst) : "r"(x)); - return reinterpret_cast(dst); -#endif -} - -namespace vllm { - -template -__global__ void static_scaled_int8_quant_kernel( - const scalar_t* __restrict__ input, int8_t* __restrict__ output, - const scale_t* scale_ptr, const int hidden_size) { - const int tid = threadIdx.x; - const int stride = blockDim.x; - const int64_t token_idx = blockIdx.x; - const float scale = *scale_ptr; - - // Must be performed using 64-bit math to avoid integer overflow. - const scalar_t* row_in = input + token_idx * hidden_size; - int8_t* row_out = output + token_idx * hidden_size; - - vectorize_with_alignment<16>( - row_in, row_out, hidden_size, tid, stride, - [=] __device__(int8_t& dst, const scalar_t& src) { - dst = float_to_int8_rn(static_cast(src) / scale); - }); -} - -template -__global__ void static_scaled_int8_azp_quant_kernel( - const scalar_t* __restrict__ input, int8_t* __restrict__ output, - const scale_t* scale_ptr, const azp_t* azp_ptr, const int hidden_size) { - const int tid = threadIdx.x; - const int stride = blockDim.x; - const int64_t token_idx = blockIdx.x; - const float scale = *scale_ptr; - const azp_t azp = *azp_ptr; - const float inv_s = 1.0f / scale; - - // Must be performed using 64-bit math to avoid integer overflow. - const scalar_t* row_in = input + token_idx * hidden_size; - int8_t* row_out = output + token_idx * hidden_size; - - vectorize_with_alignment<16>( - row_in, row_out, hidden_size, tid, stride, - [=] __device__(int8_t& dst, const scalar_t& src) { - const auto v = static_cast(src) * inv_s; - dst = int32_to_int8(float_to_int32_rn(v) + azp); - }); -} - -template -__global__ void dynamic_scaled_int8_quant_kernel( - const scalar_t* __restrict__ input, int8_t* __restrict__ output, - scale_t* scale_out, const int hidden_size) { - const int tid = threadIdx.x; - const int stride = blockDim.x; - const int64_t token_idx = blockIdx.x; - - // Must be performed using 64-bit math to avoid integer overflow. 
- const scalar_t* row_in = input + token_idx * hidden_size; - int8_t* row_out = output + token_idx * hidden_size; - - // calculate for absmax - float thread_max = 0.f; - vectorize_read_with_alignment<16>( - row_in, hidden_size, tid, stride, [&] __device__(const scalar_t& src) { - const float v = fabsf(static_cast(src)); - thread_max = fmaxf(thread_max, v); - }); - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage tmp; - float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x); - __shared__ float absmax; - if (tid == 0) { - absmax = block_max; - scale_out[blockIdx.x] = absmax / 127.f; - } - __syncthreads(); - - float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax; - - // 2. quantize - vectorize_with_alignment<16>( - row_in, row_out, hidden_size, tid, stride, - [=] __device__(int8_t& dst, const scalar_t& src) { - dst = float_to_int8_rn(static_cast(src) * inv_s); - }); -} - -// MinMax structure to hold min and max values in one go -struct MinMax { - float min, max; - - __host__ __device__ MinMax() - : min(std::numeric_limits::max()), - max(std::numeric_limits::lowest()) {} - - __host__ __device__ explicit MinMax(float v) : min(v), max(v) {} - - // add a value to the MinMax - __host__ __device__ MinMax& operator+=(float v) { - min = fminf(min, v); - max = fmaxf(max, v); - return *this; - } - - // merge two MinMax objects - __host__ __device__ MinMax& operator&=(const MinMax& other) { - min = fminf(min, other.min); - max = fmaxf(max, other.max); - return *this; - } -}; - -__host__ __device__ inline MinMax operator+(MinMax a, float v) { - return a += v; -} -__host__ __device__ inline MinMax operator&(MinMax a, const MinMax& b) { - return a &= b; -} - -template -__global__ void dynamic_scaled_int8_azp_quant_kernel( - const scalar_t* __restrict__ input, int8_t* __restrict__ output, - scale_t* scale_out, azp_t* azp_out, const int hidden_size) { - const int tid = threadIdx.x; - const int stride = blockDim.x; - const int64_t token_idx = blockIdx.x; - - // Must be performed using 64-bit math to avoid integer overflow. - const scalar_t* row_in = input + token_idx * hidden_size; - int8_t* row_out = output + token_idx * hidden_size; - - // 1. calculate min & max - MinMax thread_mm; - vectorize_read_with_alignment<16>(row_in, hidden_size, tid, stride, - [&] __device__(const scalar_t& src) { - thread_mm += static_cast(src); - }); - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage tmp; - - MinMax mm = BlockReduce(tmp).Reduce( - thread_mm, - [] __device__(MinMax a, const MinMax& b) { - a &= b; - return a; - }, - blockDim.x); - - __shared__ float scale_sh; - __shared__ azp_t azp_sh; - if (tid == 0) { - float s = (mm.max - mm.min) / 255.f; - float zp = nearbyintf(-128.f - mm.min / s); // round-to-even - scale_sh = s; - azp_sh = azp_t(zp); - scale_out[blockIdx.x] = s; - azp_out[blockIdx.x] = azp_sh; - } - __syncthreads(); - - const float inv_s = 1.f / scale_sh; - const azp_t azp = azp_sh; - - // 2. 
quantize - vectorize_with_alignment<16>( - row_in, row_out, hidden_size, tid, stride, - [=] __device__(int8_t& dst, const scalar_t& src) { - const auto v = static_cast(src) * inv_s; - dst = int32_to_int8(float_to_int32_rn(v) + azp); - }); -} - -} // namespace vllm - -void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] - torch::Tensor const& input, // [..., hidden_size] - torch::Tensor const& scale, - std::optional const& azp) { - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(scale.numel() == 1); - TORCH_CHECK(!azp || azp->numel() == 1); - - int const hidden_size = input.size(-1); - int const num_tokens = input.numel() / hidden_size; - dim3 const grid(num_tokens); - dim3 const block(std::min(hidden_size, 256)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "static_scaled_int8_quant_kernel", [&] { - if (!azp) { - vllm::static_scaled_int8_quant_kernel - <<>>( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), hidden_size); - } else { - vllm::static_scaled_int8_azp_quant_kernel - <<>>( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), azp->data_ptr(), - hidden_size); - } - }); -} - -void dynamic_scaled_int8_quant( - torch::Tensor& out, // [..., hidden_size] - torch::Tensor const& input, // [..., hidden_size] - torch::Tensor& scales, std::optional const& azp) { - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(scales.is_contiguous()); - TORCH_CHECK(!azp || azp->is_contiguous()); - - int const hidden_size = input.size(-1); - int const num_tokens = input.numel() / hidden_size; - dim3 const grid(num_tokens); - dim3 const block(std::min(hidden_size, 256)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] { - if (!azp) { - vllm::dynamic_scaled_int8_quant_kernel - <<>>( - input.data_ptr(), out.data_ptr(), - scales.data_ptr(), hidden_size); - } else { - vllm::dynamic_scaled_int8_azp_quant_kernel - <<>>( - input.data_ptr(), out.data_ptr(), - scales.data_ptr(), azp->data_ptr(), - hidden_size); - } - }); -} diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu new file mode 100644 index 000000000000..2d1568b08651 --- /dev/null +++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu @@ -0,0 +1,494 @@ +// +// Based off of: +// https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu +// + +#include +#include +#include +#include "cutlass_extensions/torch_utils.hpp" + +#include "core/registration.h" + +#include "cutlass/cutlass.h" +#include + +#include "cute/tensor.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/mixed_dtype_utils.hpp" + +#include "cutlass_extensions/common.hpp" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" + +#include + +namespace vllm::cutlass_w4a8 { + +using namespace cute; + +// ------------------------------------------------------------------------------------- +// Static configuration shared across all instantiations +// ------------------------------------------------------------------------------------- +using MmaType = 
cutlass::float_e4m3_t; // A/scale element type +using QuantType = cutlass::int4b_t; // B element type (packed int4) + +static int constexpr TileShapeK = 128 * 8 / sizeof_bits::value; +static int constexpr ScalePackSize = 8; // pack 8 scale elements together +static int constexpr PackFactor = 8; // 8 4-bit packed into int32 + +// A matrix configuration +using ElementA = MmaType; // Element type for A matrix operand +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +using LayoutA_Transpose = + typename cutlass::layout::LayoutTranspose::type; +constexpr int AlignmentA = + 128 / cutlass::sizeof_bits< + ElementA>::value; // Memory access granularity/alignment of A + // matrix in units of elements (up to 16 bytes) +using StrideA = cutlass::detail::TagToStrideA_t; + +// B matrix configuration +using ElementB = QuantType; // Element type for B matrix operand +using LayoutB = + cutlass::layout::ColumnMajor; // Layout type for B matrix operand +using LayoutB_Transpose = + typename cutlass::layout::LayoutTranspose::type; +constexpr int AlignmentB = + 128 / cutlass::sizeof_bits< + ElementB>::value; // Memory access granularity/alignment of B + // matrix in units of elements (up to 16 bytes) +using StrideB = cutlass::detail::TagToStrideB_t; + +// Define the CuTe layout for reordered quantized tensor B +// LayoutAtomQuant places values that will be read by the same thread in +// contiguous locations in global memory. It specifies the reordering within a +// single warp's fragment +using LayoutAtomQuant = + decltype(cutlass::compute_memory_reordering_atom()); +using LayoutB_Reordered = decltype(cute::tile_to_shape( + LayoutAtomQuant{}, Layout, StrideB>{})); + +// Group-wise scales +using ElementScale = MmaType; +using LayoutScale = cutlass::layout::RowMajor; + +// Per-tok, per-chan scales +using ElementSChannel = float; + +// C/D matrix configuration +using ElementC = + cutlass::bfloat16_t; // Element type for C and D matrix operands +using LayoutC = + cutlass::layout::RowMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = + 128 / cutlass::sizeof_bits< + ElementC>::value; // Memory access granularity/alignment of C + // matrix in units of elements (up to 16 bytes) + +using ElementD = ElementC; +using LayoutD = LayoutC; +constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ElementCompute = float; // Element type for epilogue computation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that + // supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using KernelSchedule = + cutlass::gemm::KernelTmaWarpSpecializedCooperative; // Kernel to launch + // based on the default + // setting in the + // Collective Builder +using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; +using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; + +// ---------------------------------------------------------------------------- +// Kernel template — Tile/Cluster shapes +// ---------------------------------------------------------------------------- +template +struct W4A8GemmKernel { + using TileShape = + decltype(cute::append(TileShape_MN{}, cute::Int{})); + using ClusterShape = ClusterShape_MNK; + + // Epilogue per-tok, per-chan scales + using ChTokScalesEpilogue = + typename vllm::c3x::ScaledEpilogue; + using EVTCompute = typename 
ChTokScalesEpilogue::EVTCompute; + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, + ElementAccumulator, ElementSChannel, + // Transpose layout of D here since we use explicit swap + transpose + // the void type for C tells the builder to allocate 0 smem for the C + // matrix. We can enable this if beta == 0 by changing ElementC to + // void below. + ElementC, typename cutlass::layout::LayoutTranspose::type, + AlignmentC, ElementD, + typename cutlass::layout::LayoutTranspose::type, AlignmentD, + EpilogueSchedule, // This is the only epi supporting the required + // swap + transpose. + EVTCompute>::CollectiveOp; + + // The Scale information must get paired with the operand that will be scaled. + // In this example, B is scaled so we make a tuple of B's information and the + // scale information. + using CollectiveMainloopShuffled = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple>, + LayoutB_Reordered, AlignmentB, ElementA, LayoutA_Transpose, + AlignmentA, ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal< + Shape, // Indicates ProblemShape + CollectiveMainloopShuffled, CollectiveEpilogue>; + using GemmShuffled = + cutlass::gemm::device::GemmUniversalAdapter; + + using StrideC = typename GemmKernelShuffled::StrideC; + using StrideD = typename GemmKernelShuffled::StrideD; + using StrideS = typename CollectiveMainloopShuffled::StrideScale; + + static torch::Tensor mm(torch::Tensor const& A, + torch::Tensor const& B, // already packed + torch::Tensor const& group_scales, // already packed + int64_t group_size, + torch::Tensor const& channel_scales, + torch::Tensor const& token_scales, + std::optional const& maybe_out_type) { + // TODO: param validation + int m = A.size(0); + int k = A.size(1); + int n = B.size(1); + + // safely cast group_size to int + TORCH_CHECK(group_size > 0 && group_size <= std::numeric_limits::max(), + "group_size out of supported range for int: ", group_size); + int const group_size_int = static_cast(group_size); + + // Allocate output + const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); + auto device = A.device(); + auto stream = at::cuda::getCurrentCUDAStream(device.index()); + torch::Tensor D = + torch::empty({m, n}, torch::TensorOptions() + .dtype(equivalent_scalar_type_v) + .device(device)); + // prepare arg pointers + auto A_ptr = static_cast(A.const_data_ptr()); + auto B_ptr = static_cast(B.const_data_ptr()); + auto D_ptr = static_cast(D.data_ptr()); + // can we avoid hardcode the 8 here + auto S_ptr = + static_cast const*>( + group_scales.const_data_ptr()); + + // runtime layout for B + auto shape_B = cute::make_shape(n, k, 1); + LayoutB_Reordered layout_B_reordered = + cute::tile_to_shape(LayoutAtomQuant{}, shape_B); + + // strides + int const scale_k = cutlass::ceil_div(k, group_size_int); + StrideA stride_A = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1)); + // Reverse stride here due to swap and transpose + StrideD stride_D = + cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(n, m, 1)); + StrideS stride_S = cutlass::make_cute_packed_stride( + StrideS{}, cute::make_shape(n, scale_k, 1)); + + // Create a structure of gemm kernel arguments suitable for 
invoking an + // instance of Gemm auto arguments = + // args_from_options(options); + /// Populates a Gemm::Arguments structure from the given arguments + /// Swap the A and B tensors, as well as problem shapes here. + using Args = typename GemmShuffled::Arguments; + using MainloopArguments = typename GemmKernelShuffled::MainloopArguments; + using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments; + + MainloopArguments mainloop_arguments{ + B_ptr, layout_B_reordered, A_ptr, stride_A, + S_ptr, stride_S, group_size_int}; + + EpilogueArguments epilogue_arguments{ + ChTokScalesEpilogue::prepare_args(channel_scales, token_scales), + nullptr, + {}, // no C + D_ptr, + stride_D}; + + Args arguments{cutlass::gemm::GemmUniversalMode::kGemm, + {n, m, k, 1}, // shape + mainloop_arguments, + epilogue_arguments}; + + // Workspace + size_t workspace_size = GemmShuffled::get_workspace_size(arguments); + torch::Tensor workspace = + torch::empty(workspace_size, + torch::TensorOptions().dtype(torch::kU8).device(device)); + + // Run GEMM + GemmShuffled gemm; + CUTLASS_CHECK(gemm.can_implement(arguments)); + CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream)); + CUTLASS_CHECK(gemm.run(stream)); + + return D; + } +}; + +// ---------------------------------------------------------------------------- +// Kernel instantiations and dispatch logic +// ---------------------------------------------------------------------------- +using Kernel_256x128_1x1x1 = + W4A8GemmKernel, Shape<_1, _1, _1>>; +using Kernel_256x64_1x1x1 = W4A8GemmKernel, Shape<_1, _1, _1>>; +using Kernel_256x32_1x1x1 = W4A8GemmKernel, Shape<_1, _1, _1>>; +using Kernel_256x16_1x1x1 = W4A8GemmKernel, Shape<_1, _1, _1>>; +using Kernel_128x256_2x1x1 = + W4A8GemmKernel, Shape<_2, _1, _1>>; +using Kernel_128x256_1x1x1 = + W4A8GemmKernel, Shape<_1, _1, _1>>; +using Kernel_128x128_1x1x1 = + W4A8GemmKernel, Shape<_1, _1, _1>>; +using Kernel_128x64_1x1x1 = W4A8GemmKernel, Shape<_1, _1, _1>>; +using Kernel_128x32_1x1x1 = W4A8GemmKernel, Shape<_1, _1, _1>>; +using Kernel_128x16_1x1x1 = W4A8GemmKernel, Shape<_1, _1, _1>>; + +torch::Tensor mm_dispatch(torch::Tensor const& A, + torch::Tensor const& B, // already packed + torch::Tensor const& group_scales, // already packed + int64_t group_size, + torch::Tensor const& channel_scales, + torch::Tensor const& token_scales, + std::optional const& maybe_out_type, + const std::string& schedule) { + if (schedule == "256x128_1x1x1") { + return Kernel_256x128_1x1x1::mm(A, B, group_scales, group_size, + channel_scales, token_scales, + maybe_out_type); + } else if (schedule == "256x64_1x1x1") { + return Kernel_256x64_1x1x1::mm(A, B, group_scales, group_size, + channel_scales, token_scales, + maybe_out_type); + } else if (schedule == "256x32_1x1x1") { + return Kernel_256x32_1x1x1::mm(A, B, group_scales, group_size, + channel_scales, token_scales, + maybe_out_type); + } else if (schedule == "256x16_1x1x1") { + return Kernel_256x16_1x1x1::mm(A, B, group_scales, group_size, + channel_scales, token_scales, + maybe_out_type); + } else if (schedule == "128x256_2x1x1") { + return Kernel_128x256_2x1x1::mm(A, B, group_scales, group_size, + channel_scales, token_scales, + maybe_out_type); + } else if (schedule == "128x256_1x1x1") { + return Kernel_128x256_1x1x1::mm(A, B, group_scales, group_size, + channel_scales, token_scales, + maybe_out_type); + } else if (schedule == "128x128_1x1x1") { + return Kernel_128x128_1x1x1::mm(A, B, group_scales, group_size, + channel_scales, token_scales, + 
maybe_out_type); + } else if (schedule == "128x64_1x1x1") { + return Kernel_128x64_1x1x1::mm(A, B, group_scales, group_size, + channel_scales, token_scales, + maybe_out_type); + } else if (schedule == "128x32_1x1x1") { + return Kernel_128x32_1x1x1::mm(A, B, group_scales, group_size, + channel_scales, token_scales, + maybe_out_type); + } else if (schedule == "128x16_1x1x1") { + return Kernel_128x16_1x1x1::mm(A, B, group_scales, group_size, + channel_scales, token_scales, + maybe_out_type); + } + TORCH_CHECK(false, "Unknown W4A8 schedule: ", schedule); + return {}; +} + +torch::Tensor mm(torch::Tensor const& A, + torch::Tensor const& B, // already packed + torch::Tensor const& group_scales, // already packed + int64_t group_size, torch::Tensor const& channel_scales, + torch::Tensor const& token_scales, + std::optional const& maybe_out_type, + std::optional maybe_schedule) { + // requested a specific schedule + if (maybe_schedule) { + return mm_dispatch(A, B, group_scales, group_size, channel_scales, + token_scales, maybe_out_type, *maybe_schedule); + } + std::string schedule; + int M = A.size(0); + int K = A.size(1); + int N = B.size(1); + // heuristic + if (M <= 16) { + schedule = (K == 16384 && N == 18432) ? "256x16_1x1x1" : "128x16_1x1x1"; + } else if (M <= 32) { + schedule = (K == 16384 && N == 18432) ? "256x32_1x1x1" : "128x32_1x1x1"; + } else if (M <= 64) { + if (K == 16384 && N == 18432) + schedule = "256x64_1x1x1"; + else if (N <= 8192 && K <= 8192) + schedule = "128x32_1x1x1"; + else + schedule = "128x64_1x1x1"; + } else if (M <= 128) { + if (K == 16384 && N == 18432) + schedule = "256x128_1x1x1"; + else if (N <= 8192) + schedule = "128x64_1x1x1"; + else + schedule = "128x128_1x1x1"; + } else if (M <= 256) { + if (N <= 4096) + schedule = "128x64_1x1x1"; + else if (N <= 8192) + schedule = "128x128_1x1x1"; + else + schedule = "128x256_1x1x1"; + } else if (M <= 512 && N <= 4096) { + schedule = "128x128_1x1x1"; + } else if (M <= 1024) { + schedule = "128x256_1x1x1"; + } else { + schedule = "128x256_2x1x1"; + } + return mm_dispatch(A, B, group_scales, group_size, channel_scales, + token_scales, maybe_out_type, schedule); +} + +// ---------------------------------------------------------------------------- +// Pre-processing utils +// ---------------------------------------------------------------------------- +torch::Tensor pack_scale_fp8(torch::Tensor const& scales) { + TORCH_CHECK(scales.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(scales.is_contiguous()); + TORCH_CHECK(scales.is_cuda()); + + auto packed_scales = torch::empty( + {scales.numel() * ScalePackSize}, + torch::TensorOptions().dtype(scales.dtype()).device(scales.device())); + auto scales_ptr = static_cast(scales.const_data_ptr()); + auto packed_scales_ptr = + static_cast*>( + packed_scales.data_ptr()); + + cutlass::pack_scale_fp8(scales_ptr, packed_scales_ptr, scales.numel()); + + return packed_scales; +} + +/* + GPU-accelerated implementation of cutlass::unified_encode_int4b. + Constructs a lookup table in constant memory to map 8 bits + (two 4-bit values) at a time. Assumes memory is contiguous + and pointers are 16-byte aligned. 
+*/ +__constant__ uint8_t kNibbleLUT[256]; + +__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out, + size_t nbytes) { + constexpr size_t V = sizeof(uint4); // 16 bytes + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const size_t nthreads = size_t(gridDim.x) * blockDim.x; + const size_t nvec = nbytes / V; + + // 1-D grid-stride loop over 16-byte chunks + for (size_t vec = tid; vec < nvec; vec += nthreads) { + uint4 v = reinterpret_cast(in)[vec]; + uint8_t* b = reinterpret_cast(&v); +#pragma unroll + for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]]; + reinterpret_cast(out)[vec] = v; + } +} + +static bool upload_lut() { + std::array lut{}; + auto map_nib = [](uint8_t v) -> uint8_t { + // 1..7 -> (8 - v); keep 0 and 8..15 + return (v == 0 || (v & 0x8)) ? v : uint8_t(8 - v); + }; + for (int b = 0; b < 256; ++b) { + uint8_t lo = b & 0xF; + uint8_t hi = (b >> 4) & 0xF; + lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo)); + } + cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(), + /*offset=*/0, cudaMemcpyHostToDevice); + + return (e == cudaSuccess); +} + +static bool unified_encode_int4b(cutlass::int4b_t const* in, + cutlass::int4b_t* out, size_t num_int4_elems) { + // Build/upload LUT + if (!upload_lut()) return false; + + static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1, + "int4 storage must be 1 byte"); + const size_t nbytes = num_int4_elems >> 1; + + auto* in_bytes = reinterpret_cast(in); + auto* out_bytes = reinterpret_cast(out); + + // kernel launch params + constexpr int block = 256; + const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors + int grid = int((nvec + block - 1) / block); + if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel + + unified_encode_int4b_device<<>>(in_bytes, out_bytes, nbytes); + cudaError_t err = cudaGetLastError(); + return (err == cudaSuccess); +} + +torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) { + TORCH_CHECK(B.dtype() == torch::kInt32); + TORCH_CHECK(B.dim() == 2); + + torch::Tensor B_packed = torch::empty_like(B); + + int k = B.size(0) * PackFactor; // logical k + int n = B.size(1); + TORCH_CHECK((n * k) % 32 == 0, "need multiples of 32 int4s for 16B chunks"); + + auto B_ptr = static_cast(B.const_data_ptr()); + auto B_packed_ptr = static_cast(B_packed.data_ptr()); + auto shape_B = cute::make_shape(n, k, 1); + auto layout_B = make_layout(shape_B, LayoutRight{}); // row major + LayoutB_Reordered layout_B_reordered = + cute::tile_to_shape(LayoutAtomQuant{}, shape_B); + + bool ok = + vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k); + TORCH_CHECK(ok, "unified_encode_int4b failed"); + cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered); + + return B_packed; +} + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("cutlass_w4a8_mm", &mm); + m.impl("cutlass_pack_scale_fp8", &pack_scale_fp8); + m.impl("cutlass_encode_and_reorder_int4b", &encode_and_reorder_int4b); +} + +} // namespace vllm::cutlass_w4a8 \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh deleted file mode 100644 index e089c3d4be2c..000000000000 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh +++ /dev/null @@ -1,194 +0,0 @@ -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/numeric_types.h" - -#include "cute/tensor.hpp" -#include 
"cutlass/tensor_ref.h" -#include "cutlass/gemm/dispatch_policy.hpp" -#include "cutlass/gemm/collective/collective_builder.hpp" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/kernel/gemm_universal.hpp" -#include "cutlass/gemm/kernel/tile_scheduler_params.h" -#include "cutlass/epilogue/dispatch_policy.hpp" -#include "cutlass/epilogue/collective/collective_builder.hpp" - -#include "cutlass_extensions/gemm/dispatch_policy.hpp" -#include "cutlass_extensions/gemm/collective/collective_builder.hpp" - -#include "cutlass_gemm_caller.cuh" - -namespace vllm { - -using namespace cute; - -template > -struct cutlass_3x_gemm_fp8_blockwise { - using GroupSizeM = Int; - using GroupSizeN = Int; - using GroupSizeK = Int; - using TileSizeM = Int; - - static_assert(TileSizeM_ % GroupSizeM_ == 0, - "TileSizeM must be a multiple of GroupSizeM"); - - using ElementAB = cutlass::float_e4m3_t; - - using ElementA = ElementAB; - using LayoutA = cutlass::layout::RowMajor; - static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; - - using ElementB = ElementAB; - using LayoutB = cutlass::layout::ColumnMajor; - static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; - - using ElementD = OutType; - using StrideD = Stride, Int<0>>; - static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; - - using ElementC = void; - using StrideC = StrideD; - static constexpr int AlignmentC = AlignmentD; - - using ElementAccumulator = float; - using ElementBlockScale = float; - using ElementCompute = float; - using ArchTag = cutlass::arch::Sm90; - using OperatorClass = cutlass::arch::OpClassTensorOp; - using TileShape = Shape; - - using KernelSchedule = cutlass::gemm:: - KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum< - GroupSizeM_>; - using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative; - using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; - - using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT< - cutlass::epilogue::fusion::Sm90AccFetch>; - - using CollectiveEpilogue = - typename cutlass::epilogue::collective::CollectiveBuilder< - ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, - ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC, - ElementD, StrideD, AlignmentD, EpilogueSchedule, - StoreEpilogueCompute>::CollectiveOp; - - using CollectiveMainloop = - typename cutlass::gemm::collective::CollectiveBuilder< - ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB, - LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape, - cutlass::gemm::collective::StageCountAutoCarveout( - sizeof(typename CollectiveEpilogue::SharedStorage))>, - KernelSchedule>::CollectiveOp; - - using KernelType = enable_sm90_or_later, CollectiveMainloop, CollectiveEpilogue, - SchedulerType>>; - - struct GemmKernel : public KernelType {}; - - using StrideA = typename GemmKernel::StrideA; - using StrideB = typename GemmKernel::StrideB; -}; - -template -void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - using GemmKernel = typename Gemm::GemmKernel; - - using ElementAB = typename Gemm::ElementAB; - using ElementD = typename Gemm::ElementD; - - auto prob_shape = c3x::get_problem_shape(a, b); - int32_t m = get<0>(prob_shape), n = get<1>(prob_shape), - k = get<2>(prob_shape); - - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = 
out.stride(0); - - using StrideA = Stride, int64_t>; - using StrideB = Stride, int64_t>; - using StrideC = typename Gemm::StrideC; - - StrideA a_stride{lda, Int<1>{}, 0}; - StrideB b_stride{ldb, Int<1>{}, 0}; - StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; - - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - auto a_scales_ptr = static_cast(a_scales.data_ptr()); - auto b_scales_ptr = static_cast(b_scales.data_ptr()); - - // Check is the t is contiguous and is 1D or 2D with one of the dimensions - // being 1 (i.e. a row or column vector) - auto is_contiguous_vector = [](const torch::Tensor& t) { - auto t_sizes = t.sizes(); - return t.is_contiguous() && - (t.dim() == 1 || - (t.dim() == 2 && - *std::min_element(t_sizes.begin(), t_sizes.end()) == 1)); - }; - - // TODO(lucas): lets clean-up the kernel so that we pass in Strides so - // we don't have to deal with enforcing implicit layouts - TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value); - TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value); - TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales), - "a_scales must be M major"); - TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value); - TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value); - TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales), - "b_scales must be K major"); - typename GemmKernel::MainloopArguments mainloop_args{ - a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr}; - - auto c_ptr = static_cast(out.data_ptr()); - typename GemmKernel::EpilogueArguments epilogue_args{ - {}, c_ptr, c_stride, c_ptr, c_stride}; - - typename GemmKernel::TileSchedulerArguments scheduler; - - static constexpr bool UsesStreamKScheduler = - cute::is_same_v; - - if constexpr (UsesStreamKScheduler) { - using DecompositionMode = typename cutlass::gemm::kernel::detail:: - PersistentTileSchedulerSm90StreamKParams::DecompositionMode; - using ReductionMode = typename cutlass::gemm::kernel::detail:: - PersistentTileSchedulerSm90StreamKParams::ReductionMode; - - scheduler.decomposition_mode = DecompositionMode::StreamK; - scheduler.reduction_mode = ReductionMode::Nondeterministic; - } - - c3x::cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, - epilogue_args, scheduler); -} - -template -void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out, - torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - auto k = a.size(1); - auto n = b.size(1); - - if (k > 3 * n) { - cutlass_gemm_caller_blockwise>( - out, a, b, a_scales, b_scales); - } else { - cutlass_gemm_caller_blockwise>( - out, a, b, a_scales, b_scales); - } -} - -} // namespace vllm \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp deleted file mode 100644 index 2ee6a19407f9..000000000000 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include "cuda_utils.h" -#include "cutlass_extensions/common.hpp" - -template -void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a, - torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - std::optional const& bias, - Fp8Func fp8_func, Int8Func int8_func, - BlockwiseFunc blockwise_func) { - TORCH_CHECK(a_scales.dtype() == torch::kFloat32); - TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - - int M = a.size(0), N = b.size(1), K = 
a.size(1); - - if ((a_scales.numel() == 1 || a_scales.numel() == a.size(0)) && - (b_scales.numel() == 1 || b_scales.numel() == b.size(1))) { - // Standard per-tensor/per-token/per-channel scaling - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - if (a.dtype() == torch::kFloat8_e4m3fn) { - fp8_func(c, a, b, a_scales, b_scales, bias); - } else { - TORCH_CHECK(a.dtype() == torch::kInt8); - if constexpr (!std::is_same_v) { - int8_func(c, a, b, a_scales, b_scales, bias); - } else { - TORCH_CHECK(false, "Int8 not supported for this architecture"); - } - } - } else { - TORCH_CHECK(a_scales.dim() == 2, "a scale must be 2d tensor."); - TORCH_CHECK(b_scales.dim() == 2, "b scale must be 2d tensor."); - int32_t version_num = get_sm_version_num(); - if (version_num >= 100) { - TORCH_CHECK( - a.size(0) == a_scales.size(0) && - cuda_utils::ceil_div(a.size(1), int64_t(128)) == a_scales.size(1), - "a_scale_group_shape must be [1, 128]."); - TORCH_CHECK( - cuda_utils::ceil_div(b.size(0), int64_t(128)) == b_scales.size(0) && - cuda_utils::ceil_div(b.size(1), int64_t(128)) == b_scales.size(1), - "b_scale_group_shape must be [128, 128]."); - } else { - // TODO: Remove this after using cutlass sm90 blockwise scaling gemm - // kernel, or introducing ceil_div to the load_init() of mainloop. - using GroupShape = std::array; - auto make_group_shape = [](torch::Tensor const& x, - torch::Tensor const& s) -> GroupShape { - TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D"); - return {cuda_utils::ceil_div(x.size(0), s.size(0)), - cuda_utils::ceil_div(x.size(1), s.size(1))}; - }; - - GroupShape a_scale_group_shape = make_group_shape(a, a_scales); - GroupShape b_scale_group_shape = make_group_shape(b, b_scales); - - // 1x128 per-token group scales for activations - // 128x128 blockwise scales for weights - TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} && - b_scale_group_shape == GroupShape{128, 128} && - a.dtype() == torch::kFloat8_e4m3fn && - b.dtype() == torch::kFloat8_e4m3fn), - "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n" - "a_scale_group_shape must be [1, 128]. Got: [", - a_scale_group_shape[0], ", ", a_scale_group_shape[1], - "]\n" - "b_scale_group_shape must be [128, 128]. 
Got: [", - b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]"); - } - - TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm"); - blockwise_func(c, a, b, a_scales, b_scales); - } -} diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu deleted file mode 100644 index e092c61abc24..000000000000 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu +++ /dev/null @@ -1,24 +0,0 @@ -#include "scaled_mm_kernels.hpp" -#include "scaled_mm_sm90_fp8_dispatch.cuh" -#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" - -namespace vllm { - -void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - std::optional const& bias) { - TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); - if (bias) { - TORCH_CHECK(bias->dtype() == out.dtype(), - "currently bias dtype must match output dtype ", out.dtype()); - return cutlass_scaled_mm_sm90_fp8_epilogue( - out, a, b, a_scales, b_scales, *bias); - } else { - return cutlass_scaled_mm_sm90_fp8_epilogue( - out, a, b, a_scales, b_scales); - } -} - -} // namespace vllm diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh deleted file mode 100644 index 32ea5db3321b..000000000000 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh +++ /dev/null @@ -1,120 +0,0 @@ -#pragma once - -#include "scaled_mm.cuh" -#include "cutlass_gemm_caller.cuh" - -/** - * This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm - * shape. - */ - -namespace vllm { - -using c3x::cutlass_gemm_caller; - -template typename Epilogue> -struct sm90_fp8_config_default { - // M in (128, inf) - static_assert(std::is_same()); - using KernelSchedule = - cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_128, _128, _128>; - using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -template typename Epilogue> -struct sm90_fp8_config_M128 { - // M in (64, 128] - static_assert(std::is_same()); - using KernelSchedule = - cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_64, _128, _128>; - using ClusterShape = Shape<_2, _1, _1>; - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -template typename Epilogue> -struct sm90_fp8_config_M64 { - // M in [1, 64] - static_assert(std::is_same()); - using KernelSchedule = - cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum; - using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized; - using TileShape = Shape<_64, _64, _128>; - using ClusterShape = Shape<_1, _8, _1>; - - using Cutlass3xGemm = - cutlass_3x_gemm; -}; - -template typename Epilogue, - typename... EpilogueArgs> -inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, - torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... 
args) { - static_assert(std::is_same()); - TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); - TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); - - using Cutlass3xGemmDefault = - typename sm90_fp8_config_default::Cutlass3xGemm; - using Cutlass3xGemmM64 = - typename sm90_fp8_config_M64::Cutlass3xGemm; - using Cutlass3xGemmM128 = - typename sm90_fp8_config_M128::Cutlass3xGemm; - - uint32_t const m = a.size(0); - uint32_t const mp2 = - std::max(static_cast(64), next_pow_2(m)); // next power of 2 - - if (mp2 <= 64) { - // m in [1, 64] - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 128) { - // m in (64, 128] - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else { - // m in (128, inf) - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } -} - -template