From b5eac5245d766d9a480503d50efb03a97c482e67 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 19 Oct 2025 08:53:55 +0000 Subject: [PATCH 01/48] [Benchmark] Add plot utility for parameter sweep Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 25 +- vllm/benchmarks/sweep/__init__.py | 0 vllm/benchmarks/sweep/param_sweep.py | 83 +++ vllm/benchmarks/sweep/plot.py | 180 ++++++ vllm/benchmarks/sweep/serve.py | 824 +++++++++++++++++++++++++++ vllm/benchmarks/sweep/server.py | 114 ++++ vllm/benchmarks/sweep/sla_sweep.py | 129 +++++ 7 files changed, 1349 insertions(+), 6 deletions(-) create mode 100644 vllm/benchmarks/sweep/__init__.py create mode 100644 vllm/benchmarks/sweep/param_sweep.py create mode 100644 vllm/benchmarks/sweep/plot.py create mode 100644 vllm/benchmarks/sweep/serve.py create mode 100644 vllm/benchmarks/sweep/server.py create mode 100644 vllm/benchmarks/sweep/sla_sweep.py diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 52a16d7bdbff..157241d331bb 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -7,7 +7,7 @@ toc_depth: 4 vLLM provides comprehensive benchmarking tools for performance testing and evaluation: - **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing -- **[Batch Scripts](#batch-scripts)**: Run `vllm bench` against multiple configurations conveniently +- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations - **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development - **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives @@ -925,11 +925,11 @@ throughput numbers correctly is also adjusted. -## Batch Scripts +## Parameter Sweeps -### Batch Serving Script +### Online Benchmark -[`vllm/benchmarks/serve_multi.py`](../../vllm/benchmarks/serve_multi.py) automatically starts `vllm serve` and runs `vllm bench serve` over multiple configurations. +[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` over multiple configurations. #### Batch Mode @@ -996,7 +996,7 @@ The basic purpose of this script is to evaluate vLLM under different settings. 
Example command:

```bash
-python vllm/benchmarks/serve_multi.py \
+python vllm/benchmarks/sweep/serve.py \
 --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
 --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
 --serve-params benchmarks/serve_hparams.json \
 --bench-params benchmarks/bench_hparams.json \
 -o benchmarks/results
```

@@ -1044,7 +1044,7 @@ For example, to ensure E2E latency within different target values for 99% of req
 Example command:
 
 ```bash
-python vllm/benchmarks/serve_multi.py \
+python vllm/benchmarks/sweep/serve.py \
 --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
 --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
 --serve-params benchmarks/serve_hparams.json \
 --bench-params benchmarks/bench_hparams.json \
 --sla-params benchmarks/sla_hparams.json \
 -o benchmarks/results
```

@@ -1066,6 +1066,19 @@ The algorithm for adjusting the SLA variable is as follows:
 For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
 
+### Visualizer
+
+[`vllm/benchmarks/sweep/plot.py`](../../vllm/benchmarks/sweep/plot.py) can be used to plot performance curves from parameter sweep results.
+
+Example command:
+
+```bash
+python vllm/benchmarks/sweep/plot.py benchmarks/results/ --curve-by api_server_count,max_num_batched_tokens --fig-by random_input_len,random_output_len
+```
+
+!!! tip
+    You can use `--dry-run` to preview the figures to be created without actually plotting them.
+
 ## Performance Benchmarks
 
 The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
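The `--serve-params` and `--bench-params` files referenced in the docs above are plain JSON lists of flag-override dictionaries; the script benchmarks every combination from their Cartesian product. As a minimal sketch of what `benchmarks/serve_hparams.json` might contain (the key names below are illustrative, not prescriptive):

```json
[
    {"max_num_seqs": 32},
    {"max_num_seqs": 64, "enable_prefix_caching": false}
]
```

Keys may be spelled with either `_` or `-`, since the sweep script tries both spellings when matching CLI flags, and boolean values toggle between the `--flag` and `--no-flag` forms. An SLA file has the same shape, with each entry mapping a metric to a constraint string, e.g. `[{"p99_e2el_ms": "<=500"}]`.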
diff --git a/vllm/benchmarks/sweep/__init__.py b/vllm/benchmarks/sweep/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py
new file mode 100644
index 000000000000..bddf0b98ae29
--- /dev/null
+++ b/vllm/benchmarks/sweep/param_sweep.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import os
+from typing import Any
+
+
+class ParameterSweep(list["ParameterSweepItem"]):
+    @classmethod
+    def read_json(cls, filepath: os.PathLike):
+        with open(filepath, "rb") as f:
+            records = json.load(f)
+
+        return cls.from_records(records)
+
+    @classmethod
+    def from_records(cls, records: list[dict[str, object]]):
+        if not isinstance(records, list):
+            raise TypeError(
+                f"The parameter sweep should be a list of dictionaries, "
+                f"but found type: {type(records)}"
+            )
+
+        return cls(ParameterSweepItem.from_record(record) for record in records)
+
+
+class ParameterSweepItem(dict[str, object]):
+    @classmethod
+    def from_record(cls, record: dict[str, object]):
+        if not isinstance(record, dict):
+            raise TypeError(
+                f"Each item in the parameter sweep should be a dictionary, "
+                f"but found type: {type(record)}"
+            )
+
+        return cls(record)
+
+    def __or__(self, other: dict[str, Any]):
+        return type(self)(super().__or__(other))
+
+    # In JSON, we prefer "_"
+    def _iter_param_key_candidates(self, param_key: str):
+        yield param_key
+        yield param_key.replace("-", "_")
+        yield param_key.replace("_", "-")
+
+    # In CLI, we prefer "-"
+    def _iter_cmd_key_candidates(self, param_key: str):
+        for k in reversed(tuple(self._iter_param_key_candidates(param_key))):
+            yield "--" + k
+
+    def _normalize_cmd_key(self, param_key: str):
+        return next(self._iter_cmd_key_candidates(param_key))
+
+    def has_param(self, param_key: str) -> bool:
+        return any(k in self for k in self._iter_param_key_candidates(param_key))
+
+    def apply_to_cmd(self, cmd: list[str]) -> list[str]:
+        cmd = list(cmd)
+
+        for k, v in self.items():
+            for k_candidate in self._iter_cmd_key_candidates(k):
+                try:
+                    k_idx = cmd.index(k_candidate)
+
+                    if isinstance(v, bool):
+                        cmd[k_idx] = self._normalize_cmd_key(k if v else "no-" + k)
+                    else:
+                        cmd[k_idx + 1] = str(v)
+
+                    break
+                except ValueError:
+                    continue
+            else:
+                if isinstance(v, bool):
+                    cmd.append(self._normalize_cmd_key(k if v else "no-" + k))
+                else:
+                    cmd.extend([self._normalize_cmd_key(k), str(v)])
+
+        return cmd
+
+    def as_text(self, sep: str = ", ") -> str:
+        return sep.join(f"{k}={v}" for k, v in self.items())
diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
new file mode 100644
index 000000000000..6c9e2c9d6179
--- /dev/null
+++ b/vllm/benchmarks/sweep/plot.py
@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import json
+from pathlib import Path
+
+import pandas as pd
+import seaborn as sns
+
+from vllm.utils.collections import full_groupby
+
+
+def _json_load_bytes(path: Path) -> list[dict[str, object]]:
+    with path.open("rb") as f:
+        return json.load(f)
+
+
+def _plot_fig(
+    fig_path: Path,
+    fig_data: list[dict[str, object]],
+    curve_by: list[str],
+    *,
+    var_x: str,
+    var_y: str,
+    bin_y: float,
+    dry_run: bool,
+):
+    print("[BEGIN FIGURE]")
+    print(f"Output file: {fig_path}")
+
+    if dry_run:
+        print("[END FIGURE]")
+        return
+
+    df = pd.DataFrame.from_records(fig_data)
+    df[var_y] = df[var_y] // bin_y * bin_y
+
+    if len(curve_by) <= 3:
+        hue, style, size, *_ = (*curve_by, None, None)
+        ax = sns.lineplot(
+            df,
+            x=var_x,
+            y=var_y,
+            hue=hue,
+            style=style,
+            size=size,
+            markers=True,
+        )
+    else:
+        df["params"] = df[list(curve_by)].agg("-".join, axis=1)
+        ax = sns.lineplot(
+            df,
+            x=var_x,
+            y=var_y,
+            hue="params",
+            markers=True,
+        )
+
+    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
+
+    fig = ax.get_figure()
+    assert fig is not None
+
+    fig.tight_layout()
+    fig.savefig(fig_path)
+
+    print("[END FIGURE]")
+
+
+def plot(
+    output_dir: Path,
+    fig_by: list[str],
+    curve_by: list[str],
+    *,
+    var_x: str,
+    var_y: str,
+    bin_y: float,
+    dry_run: bool,
+):
+    all_data = [
+        run_data
+        for path in output_dir.rglob("**/summary.json")
+        for run_data in _json_load_bytes(path)
+    ]
+
+    for fig_group, fig_data in full_groupby(
+        all_data,
+        key=lambda item: tuple((k, str(item[k])) for k in fig_by),
+    ):
+        fig_path = output_dir / (
+            "-".join(
+                (
+                    "FIGURE",
+                    *(f"{k}={v}" for k, v in fig_group),
+                )
+            )
+            .replace("/", "_")
+            .replace("..", "__")  # Sanitize
+            + ".png"
+        )
+
+        _plot_fig(
+            fig_path,
+            fig_data,
+            curve_by,
+            var_x=var_x,
+            var_y=var_y,
+            bin_y=bin_y,
+            dry_run=dry_run,
+        )
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Plot performance curves from parameter sweep results."
+    )
+    parser.add_argument(
+        "OUTPUT_DIR",
+        type=str,
+        default="results",
+        help="The directory containing the results to plot. "
+        "Figures will be saved to the same directory.",
+    )
+    parser.add_argument(
+        "--curve-by",
+        type=str,
+        required=True,
+        help="A comma-separated list of variables, such that a separate curve "
+        "is created for each combination of these variables.",
+    )
+    parser.add_argument(
+        "--fig-by",
+        type=str,
+        default="",
+        help="A comma-separated list of variables, such that a separate figure "
+        "is created for each combination of these variables.",
+    )
+    parser.add_argument(
+        "--var-x",
+        type=str,
+        default="request_throughput",
+        help="The variable for the x-axis.",
+    )
+    parser.add_argument(
+        "--var-y",
+        type=str,
+        default="p99_e2el_ms",
+        help="The variable for the y-axis",
+    )
+    parser.add_argument(
+        "--bin-y",
+        type=float,
+        default=1,
+        help="Points with y-axis values in the same bin are grouped togther "
+        "to reduce noise.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="If set, prints the location of the figures without drawing them.",
+    )
+
+    args = parser.parse_args()
+
+    curve_by = [] if not args.curve_by else args.curve_by.split(",")
+    fig_by = [] if not args.fig_by else args.fig_by.split(",")
+
+    plot(
+        output_dir=Path(args.OUTPUT_DIR),
+        fig_by=fig_by,
+        curve_by=curve_by,
+        var_x=args.var_x,
+        var_y=args.var_y,
+        bin_y=args.bin_y,
+        dry_run=args.dry_run,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
new file mode 100644
index 000000000000..960c229c4999
--- /dev/null
+++ b/vllm/benchmarks/sweep/serve.py
@@ -0,0 +1,824 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import contextlib
+import json
+import math
+import shlex
+from datetime import datetime
+from pathlib import Path
+from typing import Literal, get_args
+
+import pandas as pd
+from typing_extensions import assert_never
+
+from .param_sweep import ParameterSweep, ParameterSweepItem
+from .server import ServerProcess
+from .sla_sweep import SLASweep, SLASweepItem
+
+
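As an editorial aside (not part of the patch): a small usage sketch of the `ParameterSweepItem` class defined above, assuming the package layout introduced in this series is importable:

```python
from vllm.benchmarks.sweep.param_sweep import ParameterSweepItem

item = ParameterSweepItem.from_record(
    {"max_num_seqs": 64, "enable_prefix_caching": False}
)

# Missing flags are appended; a boolean False becomes a --no- prefixed flag
cmd = item.apply_to_cmd(["vllm", "serve", "meta-llama/Llama-2-7b-chat-hf"])
print(cmd)
# ['vllm', 'serve', 'meta-llama/Llama-2-7b-chat-hf',
#  '--max-num-seqs', '64', '--no-enable-prefix-caching']

# Flags already present in the command are overridden in place
cmd = item.apply_to_cmd(
    ["vllm", "serve", "meta-llama/Llama-2-7b-chat-hf", "--max-num-seqs", "16"]
)
print(cmd)
# ['vllm', 'serve', 'meta-llama/Llama-2-7b-chat-hf',
#  '--max-num-seqs', '64', '--no-enable-prefix-caching']
```

This is the mechanism `serve.py` below uses to derive the concrete `vllm serve` and `vllm bench serve` commands for each parameter combination.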
+@contextlib.contextmanager +def _run_server( + serve_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_overrides: ParameterSweepItem, + dry_run: bool, +): + server_cmd = serve_overrides.apply_to_cmd(serve_cmd) + + print("[BEGIN SERVER]") + print(f"Server overrides: {serve_overrides}") + print(f"Server command: {server_cmd}") + + if dry_run: + yield None + print("[END SERVER]") + return + + with ServerProcess(server_cmd, after_bench_cmd, show_stdout=show_stdout) as server: + yield server + + print("[END SERVER]") + + +def _run_benchmark( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_overrides: ParameterSweepItem, + bench_overrides: ParameterSweepItem, + run_number: int, + output_path: Path, + dry_run: bool, +): + benchmark_cmd = [ + *bench_overrides.apply_to_cmd(bench_cmd), + "--save-result", + "--result-dir", + str(output_path.parent), + "--result-filename", + output_path.name, + ] + + print("[BEGIN BENCHMARK]") + print(f"Benchmark overrides: {bench_overrides}") + print(f"Run Number: {run_number}") + print(f"Benchmark command: {benchmark_cmd}") + print(f"Output file: {output_path}") + + run_data: dict[str, object] + + if output_path.exists(): + print("Found existing results. Skipping.") + + with output_path.open("rb") as f: + run_data = json.load(f) + return run_data + + if server is None: + assert dry_run + print("[END BENCHMARK]") + return None + + output_path.parent.mkdir(parents=True, exist_ok=True) + + server.run_subcommand(benchmark_cmd) + server.after_bench() + + with output_path.open("rb") as f: + run_data = json.load(f) + + run_data["run_number"] = run_number + run_data.update(serve_overrides) + + with output_path.open("w") as f: + json.dump(run_data, f, indent=4) + + print("[END BENCHMARK]") + + return run_data + + +def _get_comb_base_path( + output_dir: Path, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, +): + return output_dir / "-".join( + ( + "SERVE", + serve_comb.as_text(sep="-"), + "BENCH", + bench_comb.as_text(sep="-"), + ) + ).replace("/", "_").replace("..", "__") # Sanitize + + +def _get_comb_run_path(base_path: Path, run_number: int | None): + if run_number is None: + return base_path / "summary.json" + + return base_path / f"run={run_number}.json" + + +def _comb_needs_server( + serve_comb: ParameterSweepItem, + bench_combs: ParameterSweep, + output_dir: Path, +): + for bench_comb in bench_combs: + base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) + if not _get_comb_run_path(base_path, run_number=None).exists(): + return True + + return False + + +def _run_comb( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, +): + comb_data = list[dict[str, object]]() + + for run_number in range(num_runs): + run_data = _run_benchmark( + server, + bench_cmd, + serve_overrides=serve_comb, + bench_overrides=bench_comb, + run_number=run_number, + output_path=_get_comb_run_path(base_path, run_number), + dry_run=dry_run, + ) + + if run_data is not None: + comb_data.append(run_data) + + if dry_run: + return None + + with _get_comb_run_path(base_path, run_number=None).open("w") as f: + json.dump(comb_data, f, indent=4) + + return comb_data + + +def run_combs( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + output_dir: Path, + num_runs: int, + 
dry_run: bool, +): + all_data = list[dict[str, object]]() + for serve_comb in serve_params: + with ( + _run_server( + serve_cmd, + after_bench_cmd, + show_stdout=show_stdout, + serve_overrides=serve_comb, + dry_run=dry_run, + ) + if _comb_needs_server(serve_comb, bench_params, output_dir) + else contextlib.nullcontext() + ) as server: + for bench_comb in bench_params: + base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) + + comb_data = _run_comb( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + ) + + if comb_data is not None: + all_data.extend(comb_data) + + if dry_run: + return None + + combined_df = pd.DataFrame.from_records(all_data) + combined_df.to_csv(output_dir / "summary.csv") + + return combined_df + + +def _get_sla_base_path( + output_dir: Path, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, +): + return output_dir / "-".join( + ( + "SERVE", + serve_comb.as_text(sep="-"), + "BENCH", + bench_comb.as_text(sep="-"), + ) + ).replace("/", "_").replace("..", "__") # Sanitize + + +def _get_sla_iter_path( + base_path: Path, + sla_comb: SLASweepItem, + sla_variable: str, + sla_value: int | None, +): + if sla_value is None: + prefix = sla_comb.as_text(sep="-") + return base_path / f"SLA-{prefix}.json" + + return base_path / f"{sla_variable}={sla_value}" + + +def _get_sla_run_path(iter_path: Path, run_number: int | None): + if run_number is None: + return iter_path / "summary.json" + + return iter_path / f"run={run_number}.json" + + +def _sla_needs_server( + serve_comb: ParameterSweepItem, + bench_combs: ParameterSweep, + sla_combs: SLASweep, + sla_variable: str, + output_dir: Path, +): + for bench_comb in bench_combs: + base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) + for sla_comb in sla_combs: + if not _get_sla_iter_path( + base_path, + sla_comb, + sla_variable, + sla_value=None, + ).exists(): + return True + + return False + + +def _run_sla( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + iter_path: Path, + num_runs: int, + dry_run: bool, +): + iter_data = list[dict[str, object]]() + + for run_number in range(num_runs): + run_data = _run_benchmark( + server, + bench_cmd, + serve_overrides=serve_comb, + bench_overrides=bench_comb, + run_number=run_number, + output_path=_get_sla_run_path(iter_path, run_number), + dry_run=dry_run, + ) + + if run_data is not None: + iter_data.append(run_data) + + if dry_run: + return None + + with _get_sla_run_path(iter_path, run_number=None).open("w") as f: + json.dump(iter_data, f, indent=4) + + return iter_data + + +SLAVariable = Literal["request_rate", "max_concurrency"] + + +def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable): + request_throughput = float(run_data["request_throughput"]) # type: ignore + if sla_variable == "request_rate": + return request_throughput + if sla_variable == "max_concurrency": + mean_latency_ms = float(run_data["mean_e2el_ms"]) # type: ignore + return request_throughput * mean_latency_ms / 1000 + + assert_never(sla_variable) + + +def _estimate_sla_bounds( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, + sla_variable: SLAVariable, + init_value: int, + max_value: int, +): + sla_data = list[dict[str, object]]() + + 
max_passing: int = 0 + min_failing: int = 0 + + val: int = init_value + assert val > 0 + + while True: + print(f"Testing {sla_variable}: {val} req/s") + + iter_data = _run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: val}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), + num_runs=num_runs, + dry_run=dry_run, + ) + + assert iter_data is not None + sla_data.extend(iter_data) + + iter_data_mean = { + k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore + for k in sla_comb + } + + sla_results = [ + criterion.print_and_validate(iter_data_mean, k) + for k, criterion in sla_comb.items() + ] + + if all(sla_results): + print("SLA criteria are met.") + max_passing = val + val *= 2 + else: + print("SLA criteria are not met.") + min_failing = val + break + + if val >= max_value: + break + + return sla_data, (max_passing, min_failing) + + +def _find_sla_value( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, + sla_variable: SLAVariable, + min_value: int, + max_value: int, +): + sla_data = list[dict[str, object]]() + + left: int = min_value + right: int = max_value + + while True: + val = (left + right) // 2 + print(f"Testing {sla_variable}: {val} req/s") + + iter_data = _run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: val}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), + num_runs=num_runs, + dry_run=dry_run, + ) + + assert iter_data is not None + sla_data.extend(iter_data) + + iter_data_mean = { + k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore + for k in sla_comb + } + + sla_results = [ + criterion.print_and_validate(iter_data_mean, k) + for k, criterion in sla_comb.items() + ] + + if all(sla_results): + print("SLA criteria are met.") + left = val + else: + print("SLA criteria are not met.") + right = val + + if right - left <= 1: + break + + return sla_data, left + + +def _search_sla( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + sla_variable: SLAVariable, + sla_inf_value: int = 65536, # The value that represents infinite QPS + base_path: Path, + num_runs: int, + dry_run: bool, +): + print("[SLA START]") + print(f"SLA criteria: {sla_comb.as_text()}") + + sla_data_0 = _run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: sla_inf_value}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value), + num_runs=num_runs, + dry_run=dry_run, + ) + if sla_data_0 is None: + assert dry_run + print("Omitting SLA search.") + print("[SLA END]") + return None + + sla_init_value = math.ceil( + sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0) + / len(sla_data_0) + ) + print(f"Initial {sla_variable} to search: {sla_init_value} req/s.") + + sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + sla_variable=sla_variable, + init_value=sla_init_value, + max_value=sla_inf_value, + ) + print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.") + + sla_data_2, sla_value = _find_sla_value( 
+ server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + sla_variable=sla_variable, + min_value=sla_min, + max_value=sla_max, + ) + + sla_data = sla_data_0 + sla_data_1 + sla_data_2 + print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.") + + with _get_sla_iter_path( + base_path, + sla_comb, + sla_variable, + sla_value=None, + ).open("w") as f: + json.dump(sla_data, f, indent=4) + + print("[SLA END]") + + return sla_data + + +def run_slas( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + sla_params: SLASweep, + sla_variable: SLAVariable, + output_dir: Path, + num_runs: int, + dry_run: bool, +): + if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params): + raise ValueError( + f"You should not override `{sla_variable}` in `bench_params` in SLA mode, " + "since it is supposed to be determined automatically." + ) + + all_data = list[dict[str, object]]() + for serve_comb in serve_params: + with ( + _run_server( + serve_cmd, + after_bench_cmd, + show_stdout=show_stdout, + serve_overrides=serve_comb, + dry_run=dry_run, + ) + if _sla_needs_server( + serve_comb, + bench_params, + sla_params, + sla_variable, + output_dir, + ) + else contextlib.nullcontext() + ) as server: + for bench_comb in bench_params: + for sla_comb in sla_params: + base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) + + comb_data = _search_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + sla_variable=sla_variable, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + ) + + if comb_data is not None: + all_data.extend(comb_data) + + if dry_run: + return None + + combined_df = pd.DataFrame.from_records(all_data) + combined_df.to_csv(output_dir / "summary.csv") + + return combined_df + + +def _run_main( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + sla_params: SLASweep, + sla_variable: SLAVariable, + output_dir: Path, + num_runs: int, + dry_run: bool, +): + if sla_params: + return run_slas( + serve_cmd=serve_cmd, + bench_cmd=bench_cmd, + after_bench_cmd=after_bench_cmd, + show_stdout=show_stdout, + serve_params=serve_params, + bench_params=bench_params, + sla_params=sla_params, + sla_variable=sla_variable, + output_dir=output_dir, + num_runs=num_runs, + dry_run=dry_run, + ) + + return run_combs( + serve_cmd=serve_cmd, + bench_cmd=bench_cmd, + after_bench_cmd=after_bench_cmd, + show_stdout=show_stdout, + serve_params=serve_params, + bench_params=bench_params, + output_dir=output_dir, + num_runs=num_runs, + dry_run=dry_run, + ) + + +def run_main( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + sla_params: SLASweep, + sla_variable: SLAVariable, + output_dir: Path, + num_runs: int, + dry_run: bool, + resume: str | None, +): + timestamp = resume or datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = output_dir / timestamp + + if resume and not output_dir.exists(): + raise ValueError(f"Cannot resume from non-existent directory ({output_dir})") + + try: + return _run_main( + serve_cmd=serve_cmd, + bench_cmd=bench_cmd, + after_bench_cmd=after_bench_cmd, + 
+            show_stdout=show_stdout,
+            serve_params=serve_params,
+            bench_params=bench_params,
+            sla_params=sla_params,
+            sla_variable=sla_variable,
+            output_dir=output_dir,
+            num_runs=num_runs,
+            dry_run=dry_run,
+        )
+    except BaseException as exc:
+        raise RuntimeError(
+            f"The script was terminated early. Use `--resume {timestamp}` "
+            f"to continue the script from its last checkpoint."
+        ) from exc
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run vLLM server benchmark under multiple settings."
+    )
+    parser.add_argument(
+        "--serve-cmd",
+        type=str,
+        required=True,
+        help="The command used to run the server: `vllm serve ...`",
+    )
+    parser.add_argument(
+        "--bench-cmd",
+        type=str,
+        required=True,
+        help="The command used to run the benchmark: `vllm bench serve ...`",
+    )
+    parser.add_argument(
+        "--after-bench-cmd",
+        type=str,
+        default=None,
+        help="After a benchmark run is complete, invoke this command instead of the "
+        "default `ServerProcess.reset_caches()`.",
+    )
+    parser.add_argument(
+        "--show-stdout",
+        action="store_true",
+        help="If set, logs the standard output of subcommands. "
+        "Useful for debugging but can be quite spammy.",
+    )
+    parser.add_argument(
+        "--serve-params",
+        type=str,
+        default=None,
+        help="Path to JSON file containing a list of parameter combinations "
+        "for the `vllm serve` command. "
+        "If both `serve_params` and `bench_params` are given, "
+        "this script will iterate over their Cartesian product.",
+    )
+    parser.add_argument(
+        "--bench-params",
+        type=str,
+        default=None,
+        help="Path to JSON file containing a list of parameter combinations "
+        "for the `vllm bench serve` command. "
+        "If both `serve_params` and `bench_params` are given, "
+        "this script will iterate over their Cartesian product.",
+    )
+    parser.add_argument(
+        "--sla-params",
+        type=str,
+        default=None,
+        help="Path to JSON file containing a list of SLA constraints to satisfy. "
+        'Each constraint is expressed in `{"<metric_key>": "<op><value>"}` format, '
+        'e.g.: `{"p99_e2el_ms": "<=500"}` means that '
+        "the E2E latency should be less than 500ms 99%% of the time. 
" + "Setting this option runs this script in SLA mode, which searches for the " + "maximum `sla_variable` that satisfies the constraints for each combination " + "of `serve_params`, `bench_params`, and `sla_params`.", + ) + parser.add_argument( + "--sla-variable", + type=str, + choices=get_args(SLAVariable), + default="request_rate", + help="Whether to tune request rate or maximum concurrency to satisfy " + "the SLA constraints.", + ) + parser.add_argument( + "-o", + "--output-dir", + type=str, + default="results", + help="The directory to which results are written.", + ) + parser.add_argument( + "--num-runs", + type=int, + default=3, + help="Number of runs per parameter combination.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If set, prints the commands to run then exits without running them.", + ) + parser.add_argument( + "--resume", + type=str, + default=None, + help="Set this to the name of a directory under `output_dir` (which is a " + "timestamp) to resume a previous execution of this script, i.e., only run " + "parameter combinations for which there are still no output files.", + ) + + args = parser.parse_args() + + serve_cmd = shlex.split(args.serve_cmd) + bench_cmd = shlex.split(args.bench_cmd) + after_bench_cmd = ( + [] if args.after_bench_cmd is None else shlex.split(args.after_bench_cmd) + ) + + if args.serve_params: + serve_params = ParameterSweep.read_json(args.serve_params) + else: + # i.e.: run serve_cmd without any modification + serve_params = ParameterSweep.from_records([{}]) + + if args.bench_params: + bench_params = ParameterSweep.read_json(args.bench_params) + else: + # i.e.: run bench_cmd without any modification + bench_params = ParameterSweep.from_records([{}]) + + if args.sla_params: + sla_params = SLASweep.read_json(args.sla_params) + else: + sla_params = SLASweep.from_records([]) + + num_runs = args.num_runs + if num_runs < 1: + raise ValueError("`num_runs` should be at least 1.") + + run_main( + serve_cmd=serve_cmd, + bench_cmd=bench_cmd, + after_bench_cmd=after_bench_cmd, + show_stdout=args.show_stdout, + serve_params=serve_params, + bench_params=bench_params, + sla_params=sla_params, + sla_variable=args.sla_variable, + output_dir=Path(args.output_dir), + num_runs=num_runs, + dry_run=args.dry_run, + resume=args.resume, + ) + + +if __name__ == "__main__": + main() diff --git a/vllm/benchmarks/sweep/server.py b/vllm/benchmarks/sweep/server.py new file mode 100644 index 000000000000..f17578726415 --- /dev/null +++ b/vllm/benchmarks/sweep/server.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib +import os +import signal +import subprocess +from types import TracebackType + +import requests +from typing_extensions import Self + + +class ServerProcess: + def __init__( + self, + server_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + ) -> None: + super().__init__() + + self.server_cmd = server_cmd + self.after_bench_cmd = after_bench_cmd + self.show_stdout = show_stdout + + def __enter__(self) -> Self: + self.start() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + self.stop() + + def start(self): + # Create new process for clean termination + self._server_process = subprocess.Popen( + self.server_cmd, + start_new_session=True, + stdout=None if self.show_stdout else subprocess.DEVNULL, + # Need 
`VLLM_SERVER_DEV_MODE=1` for `_reset_caches` + env=os.environ | {"VLLM_SERVER_DEV_MODE": "1"}, + ) + + def stop(self): + server_process = self._server_process + + if server_process.poll() is None: + # In case only some processes have been terminated + with contextlib.suppress(ProcessLookupError): + # We need to kill both API Server and Engine processes + os.killpg(os.getpgid(server_process.pid), signal.SIGKILL) + + def run_subcommand(self, cmd: list[str]): + return subprocess.run( + cmd, + stdout=None if self.show_stdout else subprocess.DEVNULL, + check=True, + ) + + def after_bench(self) -> None: + if not self.after_bench_cmd: + self.reset_caches() + return + + self.run_subcommand(self.after_bench_cmd) + + def _get_vllm_server_address(self) -> str: + server_cmd = self.server_cmd + + for host_key in ("--host",): + if host_key in server_cmd: + host = server_cmd[server_cmd.index(host_key) + 1] + break + else: + host = "localhost" + + for port_key in ("-p", "--port"): + if port_key in server_cmd: + port = int(server_cmd[server_cmd.index(port_key) + 1]) + break + else: + port = 8000 # The default value in vllm serve + + return f"http://{host}:{port}" + + def reset_caches(self) -> None: + server_cmd = self.server_cmd + + # Use `.endswith()` to match `/bin/...` + if server_cmd[0].endswith("vllm"): + server_address = self._get_vllm_server_address() + print(f"Resetting caches at {server_address}") + + res = requests.post(f"{server_address}/reset_prefix_cache") + res.raise_for_status() + + res = requests.post(f"{server_address}/reset_mm_cache") + res.raise_for_status() + elif server_cmd[0].endswith("infinity_emb"): + if "--vector-disk-cache" in server_cmd: + raise NotImplementedError( + "Infinity server uses caching but does not expose a method " + "to reset the cache" + ) + else: + raise NotImplementedError( + f"No implementation of `reset_caches` for `{server_cmd[0]}` server. " + "Please specify a custom command via `--after-bench-cmd`." 
+ ) diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py new file mode 100644 index 000000000000..6a58b7149a02 --- /dev/null +++ b/vllm/benchmarks/sweep/sla_sweep.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json +import os +from abc import ABC, abstractmethod + +from typing_extensions import override + + +class SLACriterionBase(ABC): + def __init__(self, target: float) -> None: + super().__init__() + + self.target = target + + @abstractmethod + def validate(self, actual: float) -> bool: + """Return `True` if this criterion is met; otherwise `False`.""" + raise NotImplementedError + + @abstractmethod + def format_cond(self, lhs: str) -> str: + raise NotImplementedError + + def print_and_validate( + self, + metrics: dict[str, float], + metrics_key: str, + ) -> bool: + metric = metrics[metrics_key] + result = self.validate(metric) + + cond = self.format_cond(f"{metrics_key} = {metric:.2f}") + print(f"Validating SLA: {cond} | " + ("PASSED" if result else "FAILED")) + + return result + + +class SLALessThan(SLACriterionBase): + @override + def validate(self, actual: float) -> bool: + return actual < self.target + + @override + def format_cond(self, lhs: str) -> str: + return f"{lhs}<{self.target:.2f}" + + +class SLALessThanOrEqual(SLACriterionBase): + @override + def validate(self, actual: float) -> bool: + return actual <= self.target + + @override + def format_cond(self, lhs: str) -> str: + return f"{lhs}<={self.target:.2f}" + + +class SLAGreaterThan(SLACriterionBase): + @override + def validate(self, actual: float) -> bool: + return actual > self.target + + @override + def format_cond(self, lhs: str) -> str: + return f"{lhs}>{self.target:.2f}" + + +class SLAGreaterThanOrEqual(SLACriterionBase): + @override + def validate(self, actual: float) -> bool: + return actual >= self.target + + @override + def format_cond(self, lhs: str) -> str: + return f"{lhs}>={self.target:.2f}" + + +# NOTE: The ordering is important! Match longer op_keys first +SLA_CRITERIA: dict[str, type[SLACriterionBase]] = { + "<=": SLALessThanOrEqual, + ">=": SLAGreaterThanOrEqual, + "<": SLALessThan, + ">": SLAGreaterThan, +} + + +class SLASweep(list["SLASweepItem"]): + @classmethod + def read_json(cls, filepath: os.PathLike): + with open(filepath, "rb") as f: + records = json.load(f) + + return cls.from_records(records) + + @classmethod + def from_records(cls, records: list[dict[str, str]]): + if not isinstance(records, list): + raise TypeError( + f"The SLA sweep should be a list of dictionaries, " + f"but found type: {type(records)}" + ) + + return cls(SLASweepItem.from_record(record) for record in records) + + +class SLASweepItem(dict[str, SLACriterionBase]): + @classmethod + def from_record(cls, record: dict[str, str]): + sla_criteria: dict[str, SLACriterionBase] = {} + + for metric_key, metric_value in record.items(): + for op_key in SLA_CRITERIA: + if metric_value.startswith(op_key): + sla_criteria[metric_key] = SLA_CRITERIA[op_key]( + float(metric_value.removeprefix(op_key)) + ) + break + else: + raise ValueError( + f"Invalid operator for " + f"SLA constraint '{metric_key}={metric_value}'. 
" + f"Valid operators are: {set(SLA_CRITERIA)}", + ) + + return cls(sla_criteria) + + def as_text(self, sep: str = ", ") -> str: + return sep.join(v.format_cond(k) for k, v in self.items()) From 7f93c36d6bea4c04ab9f78133f31fd3f1202d295 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 03:45:03 +0000 Subject: [PATCH 02/48] Update Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 6c9e2c9d6179..2cc8369bfef6 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -22,7 +22,8 @@ def _plot_fig( *, var_x: str, var_y: str, - bin_y: float, + max_x: float | None, + bin_x: float | None, dry_run: bool, ): print("[BEGIN FIGURE]") @@ -33,7 +34,12 @@ def _plot_fig( return df = pd.DataFrame.from_records(fig_data) - df[var_y] = df[var_y] // bin_y * bin_y + + if max_x is not None: + df = df[df[var_x] <= max_x] + + if bin_x is not None: + df[var_x] = df[var_x] // bin_x * bin_x if len(curve_by) <= 3: hue, style, size, *_ = (*curve_by, None, None) @@ -74,7 +80,8 @@ def plot( *, var_x: str, var_y: str, - bin_y: float, + max_x: float | None, + bin_x: float | None, dry_run: bool, ): all_data = [ @@ -105,7 +112,8 @@ def plot( curve_by, var_x=var_x, var_y=var_y, - bin_y=bin_y, + max_x=max_x, + bin_x=bin_x, dry_run=dry_run, ) @@ -148,10 +156,16 @@ def main(): help="The variable for the y-axis", ) parser.add_argument( - "--bin-y", + "--max-x", + type=float, + default=None, + help="The maximum value to plot for the x-axis.", + ) + parser.add_argument( + "--bin-x", type=float, - default=1, - help="Points with y-axis values in the same bin are grouped togther " + default=None, + help="Group together points with x-axis values in the same bin " "to reduce noise.", ) parser.add_argument( @@ -171,7 +185,8 @@ def main(): curve_by=curve_by, var_x=args.var_x, var_y=args.var_y, - bin_y=args.bin_y, + max_x=args.max_x, + bin_x=args.bin_x, dry_run=args.dry_run, ) From d52e9b9dce8ec58d5662b494f9d6af21cdd5f371 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 03:59:47 +0000 Subject: [PATCH 03/48] Add log plot Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 2cc8369bfef6..684ee38c00be 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -24,6 +24,7 @@ def _plot_fig( var_y: str, max_x: float | None, bin_x: float | None, + log_y: bool, dry_run: bool, ): print("[BEGIN FIGURE]") @@ -62,6 +63,9 @@ def _plot_fig( markers=True, ) + if log_y: + ax.set_yscale("log") + sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) fig = ax.get_figure() @@ -82,6 +86,7 @@ def plot( var_y: str, max_x: float | None, bin_x: float | None, + log_y: bool, dry_run: bool, ): all_data = [ @@ -114,6 +119,7 @@ def plot( var_y=var_y, max_x=max_x, bin_x=bin_x, + log_y=log_y, dry_run=dry_run, ) @@ -168,6 +174,11 @@ def main(): help="Group together points with x-axis values in the same bin " "to reduce noise.", ) + parser.add_argument( + "--log-y", + action="store_true", + help="Use logarithmic scaling for the y-axis.", + ) parser.add_argument( "--dry-run", action="store_true", @@ -187,6 +198,7 @@ def main(): var_y=args.var_y, max_x=args.max_x, bin_x=args.bin_x, + log_y=args.log_y, dry_run=args.dry_run, ) From 2f96852ccb97a545dd6e101e5a7278d174786100 Mon 
Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 04:20:51 +0000
Subject: [PATCH 04/48] Fix multifigure

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/sweep/plot.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 684ee38c00be..5905807baa61 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -4,6 +4,7 @@
 import json
 from pathlib import Path
 
+import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 
@@ -73,6 +74,7 @@ def _plot_fig(
 
     fig.tight_layout()
     fig.savefig(fig_path)
+    plt.close(fig)
 
     print("[END FIGURE]")
 

From fcf156b3cea9460805c3ab5c75dd2c10a24153aa Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 04:27:36 +0000
Subject: [PATCH 05/48] Update command

Signed-off-by: DarkLight1337
---
 docs/contributing/benchmarks.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md
index 157241d331bb..ae8582113f26 100644
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -1073,7 +1073,11 @@
 Example command:
 
 ```bash
-python vllm/benchmarks/sweep/plot.py benchmarks/results/ --curve-by api_server_count,max_num_batched_tokens --fig-by random_input_len,random_output_len
+python vllm/benchmarks/sweep/plot.py benchmarks/results/ \
+    --var-x max_concurrency \
+    --max-x 1024 \
+    --curve-by api_server_count,max_num_batched_tokens \
+    --fig-by random_input_len,random_output_len
 ```
 
 !!! tip

From ad14a53c4d3a11da55aca756c37ab3921815812e Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 04:32:22 +0000
Subject: [PATCH 06/48] Add title

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/sweep/plot.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 5905807baa61..fc4a94d26ee1 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -18,6 +18,7 @@ def _json_load_bytes(path: Path) -> list[dict[str, object]]:
 
 def _plot_fig(
     fig_path: Path,
+    fig_title: str,
     fig_data: list[dict[str, object]],
     curve_by: list[str],
     *,
@@ -64,6 +65,8 @@ def _plot_fig(
             markers=True,
         )
 
+    ax.set_title(fig_title)
+
     if log_y:
         ax.set_yscale("log")
 
@@ -101,6 +104,8 @@ def plot(
         all_data,
         key=lambda item: tuple((k, str(item[k])) for k in fig_by),
     ):
+        fig_group = tuple(fig_group)
+
         fig_path = output_dir / (
             "-".join(
                 (
@@ -112,9 +117,13 @@ def plot(
             .replace("..", "__")  # Sanitize
             + ".png"
         )
+        fig_title = (
+            ", ".join(f"{k}={v}" for k, v in fig_group) if fig_group else "(All data)"
+        )
 
         _plot_fig(
             fig_path,
+            fig_title,
             fig_data,
             curve_by,
             var_x=var_x,

From 08fab86ad8bd4809f0d90faab428619e54a92c80 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 04:41:39 +0000
Subject: [PATCH 07/48] Support file prefix

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/sweep/plot.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index fc4a94d26ee1..bbdc534fcfaf 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -92,6 +92,7 @@ def plot(
     max_x: float | None,
     bin_x: float | None,
     log_y: bool,
+    file_prefix: str,
     dry_run: bool,
 ):
     all_data = [
@@ -107,7 +108,8 @@ def plot(
         fig_group = tuple(fig_group)
 
         fig_path = output_dir / (
-            "-".join(
+            file_prefix
+            + "-".join(
                 (
                     "FIGURE",
                     *(f"{k}={v}" for 
k, v in fig_group), @@ -190,6 +192,13 @@ def main(): action="store_true", help="Use logarithmic scaling for the y-axis.", ) + parser.add_argument( + "--file-prefix", + type=str, + default="", + help="If set, prepends this to the filename of the saved figures to " + "distinguish them from other runs of this script.", + ) parser.add_argument( "--dry-run", action="store_true", @@ -210,6 +219,7 @@ def main(): max_x=args.max_x, bin_x=args.bin_x, log_y=args.log_y, + file_prefix=args.file_prefix, dry_run=args.dry_run, ) From bc04f307b0d78accb92f7d583171fd13647a1889 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 04:42:22 +0000 Subject: [PATCH 08/48] Separate Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index bbdc534fcfaf..51a8305ac830 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -109,6 +109,7 @@ def plot( fig_path = output_dir / ( file_prefix + + ("--" if file_prefix else "") + "-".join( ( "FIGURE", From 1e13493a83eb324db92b3ef9502477c7675d4cc7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 04:44:13 +0000 Subject: [PATCH 09/48] Improve separation Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 +- vllm/benchmarks/sweep/serve.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 51a8305ac830..0291be80de38 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -112,7 +112,7 @@ def plot( + ("--" if file_prefix else "") + "-".join( ( - "FIGURE", + "FIGURE-", *(f"{k}={v}" for k, v in fig_group), ) ) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index 960c229c4999..5751e326578b 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -108,9 +108,9 @@ def _get_comb_base_path( ): return output_dir / "-".join( ( - "SERVE", + "SERVE-", serve_comb.as_text(sep="-"), - "BENCH", + "BENCH-", bench_comb.as_text(sep="-"), ) ).replace("/", "_").replace("..", "__") # Sanitize @@ -228,9 +228,9 @@ def _get_sla_base_path( ): return output_dir / "-".join( ( - "SERVE", + "SERVE-", serve_comb.as_text(sep="-"), - "BENCH", + "BENCH-", bench_comb.as_text(sep="-"), ) ).replace("/", "_").replace("..", "__") # Sanitize @@ -244,7 +244,7 @@ def _get_sla_iter_path( ): if sla_value is None: prefix = sla_comb.as_text(sep="-") - return base_path / f"SLA-{prefix}.json" + return base_path / f"SLA--{prefix}.json" return base_path / f"{sla_variable}={sla_value}" From c848b10945018f5a0d074afb59825f1e2f4c4351 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 08:58:05 +0000 Subject: [PATCH 10/48] Set by directory, not prefix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 0291be80de38..ff72802431cc 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -84,6 +84,7 @@ def _plot_fig( def plot( output_dir: Path, + fig_dir: Path, fig_by: list[str], curve_by: list[str], *, @@ -92,7 +93,6 @@ def plot( max_x: float | None, bin_x: float | None, log_y: bool, - file_prefix: str, dry_run: bool, ): all_data = [ @@ -107,10 +107,8 @@ def plot( ): fig_group = tuple(fig_group) - fig_path = output_dir / ( - file_prefix - + ("--" if 
file_prefix else "") - + "-".join( + fig_path = fig_dir / ( + "-".join( ( "FIGURE-", *(f"{k}={v}" for k, v in fig_group), @@ -146,8 +144,15 @@ def main(): "OUTPUT_DIR", type=str, default="results", - help="The directory containing the results to plot. " - "Figures will be saved to the same directory.", + help="The directory containing the results to plot, " + "i.e., the `--output-dir` argument to the parameter sweep script.", + ) + parser.add_argument( + "--fig-dir", + type=str, + default=None, + help="The directory to save the figures. " + "By default, this is set to `OUTPUT_DIR`.", ) parser.add_argument( "--curve-by", @@ -193,13 +198,6 @@ def main(): action="store_true", help="Use logarithmic scaling for the y-axis.", ) - parser.add_argument( - "--file-prefix", - type=str, - default="", - help="If set, prepends this to the filename of the saved figures to " - "distinguish them from other runs of this script.", - ) parser.add_argument( "--dry-run", action="store_true", @@ -213,6 +211,7 @@ def main(): plot( output_dir=Path(args.OUTPUT_DIR), + fig_dir=Path(args.fig_dir or args.OUTPUT_DIR), fig_by=fig_by, curve_by=curve_by, var_x=args.var_x, @@ -220,7 +219,6 @@ def main(): max_x=args.max_x, bin_x=args.bin_x, log_y=args.log_y, - file_prefix=args.file_prefix, dry_run=args.dry_run, ) From f7f36f2767d1b04aa4aec11ad7c0f77940e2a37d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 09:25:04 +0000 Subject: [PATCH 11/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index ff72802431cc..4b38ab49e3ca 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -101,6 +101,8 @@ def plot( for run_data in _json_load_bytes(path) ] + fig_dir.mkdir(parents=True, exist_ok=True) + for fig_group, fig_data in full_groupby( all_data, key=lambda item: tuple((k, str(item[k])) for k in fig_by), From c6cb78a2b05ba401f06d7ac4b92c0cf2bb3512e5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 09:32:18 +0000 Subject: [PATCH 12/48] Plot in parallel Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 71 ++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 4b38ab49e3ca..759705bbc2cb 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json +from concurrent.futures import Future, ProcessPoolExecutor from pathlib import Path import matplotlib.pyplot as plt @@ -103,39 +104,49 @@ def plot( fig_dir.mkdir(parents=True, exist_ok=True) - for fig_group, fig_data in full_groupby( - all_data, - key=lambda item: tuple((k, str(item[k])) for k in fig_by), - ): - fig_group = tuple(fig_group) - - fig_path = fig_dir / ( - "-".join( - ( - "FIGURE-", - *(f"{k}={v}" for k, v in fig_group), + with ProcessPoolExecutor() as pool: + tasks = list[Future[None]]() + + for fig_group, fig_data in full_groupby( + all_data, + key=lambda item: tuple((k, str(item[k])) for k in fig_by), + ): + fig_group = tuple(fig_group) + + fig_path = fig_dir / ( + "-".join( + ( + "FIGURE-", + *(f"{k}={v}" for k, v in fig_group), + ) ) + .replace("/", "_") + .replace("..", "__") # Sanitize + + ".png" + ) + fig_title = ( + ", ".join(f"{k}={v}" for k, v in fig_group) + if fig_group + else "(All data)" ) - .replace("/", "_") - 
.replace("..", "__") # Sanitize - + ".png" - ) - fig_title = ( - ", ".join(f"{k}={v}" for k, v in fig_group) if fig_group else "(All data)" - ) - _plot_fig( - fig_path, - fig_title, - fig_data, - curve_by, - var_x=var_x, - var_y=var_y, - max_x=max_x, - bin_x=bin_x, - log_y=log_y, - dry_run=dry_run, - ) + task = pool.submit( + _plot_fig, + fig_path, + fig_title, + fig_data, + curve_by, + var_x=var_x, + var_y=var_y, + max_x=max_x, + bin_x=bin_x, + log_y=log_y, + dry_run=dry_run, + ) + tasks.append(task) + + for f in tasks: + f.result() def main(): From 4cc5e90a91556426862847c377c05854e9cb1c9f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 09:39:30 +0000 Subject: [PATCH 13/48] Clean up Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 96 +++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 37 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 759705bbc2cb..d268d58b6d87 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -2,7 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json -from concurrent.futures import Future, ProcessPoolExecutor +from collections.abc import Iterable +from concurrent.futures import ProcessPoolExecutor +from functools import partial from pathlib import Path import matplotlib.pyplot as plt @@ -83,6 +85,51 @@ def _plot_fig( print("[END FIGURE]") +def _plot_fig_by_group( + fig_dir: Path, + fig_group_data: tuple[Iterable[tuple[str, str]], list[dict[str, object]]], + curve_by: list[str], + *, + var_x: str, + var_y: str, + max_x: float | None, + bin_x: float | None, + log_y: bool, + dry_run: bool, +): + fig_group, fig_data = fig_group_data + + fig_group = tuple(fig_group) + + fig_path = fig_dir / ( + "-".join( + ( + "FIGURE-", + *(f"{k}={v}" for k, v in fig_group), + ) + ) + .replace("/", "_") + .replace("..", "__") # Sanitize + + ".png" + ) + fig_title = ( + ", ".join(f"{k}={v}" for k, v in fig_group) if fig_group else "(All data)" + ) + + return _plot_fig( + fig_path, + fig_title, + fig_data, + curve_by, + var_x=var_x, + var_y=var_y, + max_x=max_x, + bin_x=bin_x, + log_y=log_y, + dry_run=dry_run, + ) + + def plot( output_dir: Path, fig_dir: Path, @@ -105,48 +152,23 @@ def plot( fig_dir.mkdir(parents=True, exist_ok=True) with ProcessPoolExecutor() as pool: - tasks = list[Future[None]]() - - for fig_group, fig_data in full_groupby( - all_data, - key=lambda item: tuple((k, str(item[k])) for k in fig_by), - ): - fig_group = tuple(fig_group) - - fig_path = fig_dir / ( - "-".join( - ( - "FIGURE-", - *(f"{k}={v}" for k, v in fig_group), - ) - ) - .replace("/", "_") - .replace("..", "__") # Sanitize - + ".png" - ) - fig_title = ( - ", ".join(f"{k}={v}" for k, v in fig_group) - if fig_group - else "(All data)" - ) - - task = pool.submit( - _plot_fig, - fig_path, - fig_title, - fig_data, - curve_by, + pool.map( + partial( + _plot_fig_by_group, + fig_dir, + curve_by=curve_by, var_x=var_x, var_y=var_y, max_x=max_x, bin_x=bin_x, log_y=log_y, dry_run=dry_run, - ) - tasks.append(task) - - for f in tasks: - f.result() + ), + full_groupby( + all_data, + key=lambda item: tuple((k, str(item[k])) for k in fig_by), + ), + ) def main(): From 4af1e1af74d50a61ab4cf32c6d7565b0a071134e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 09:45:20 +0000 Subject: [PATCH 14/48] Don't silently fail Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 23 ++++++++++++++++++----- 1 file changed, 18 
insertions(+), 5 deletions(-)

diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index d268d58b6d87..bdc0bbefea76 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -35,11 +35,17 @@ def _plot_fig(
     print("[BEGIN FIGURE]")
     print(f"Output file: {fig_path}")
 
-    if dry_run:
-        print("[END FIGURE]")
-        return
-
     df = pd.DataFrame.from_records(fig_data)
+    if var_x not in df.columns:
+        raise ValueError(
+            f"Cannot find {var_x=!r} in parameter sweep results. "
+            f"Available variables: {df.columns}"
+        )
+    if var_y not in df.columns:
+        raise ValueError(
+            f"Cannot find {var_y=!r} in parameter sweep results. "
+            f"Available variables: {df.columns}"
+        )
 
     if max_x is not None:
         df = df[df[var_x] <= max_x]
@@ -47,6 +53,10 @@ def _plot_fig(
     if bin_x is not None:
         df[var_x] = df[var_x] // bin_x * bin_x
 
+    if dry_run:
+        print("[END FIGURE]")
+        return
+
     if len(curve_by) <= 3:
         hue, style, size, *_ = (*curve_by, None, None)
         ax = sns.lineplot(
@@ -152,7 +162,7 @@ def plot(
     fig_dir.mkdir(parents=True, exist_ok=True)
 
     with ProcessPoolExecutor() as pool:
-        pool.map(
+        out = pool.map(
             partial(
                 _plot_fig_by_group,
                 fig_dir,
@@ -170,6 +180,9 @@ def plot(
             ),
         )
 
+        # Collect the results
+        all(out)
+
 
 def main():

From 7d82607ea758009392a82452f8652b9ab2e1427b Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 09:45:53 +0000
Subject: [PATCH 15/48] Pretty

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/sweep/plot.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index bdc0bbefea76..b98dbf801214 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -39,12 +39,12 @@ def _plot_fig(
     if var_x not in df.columns:
         raise ValueError(
             f"Cannot find {var_x=!r} in parameter sweep results. "
-            f"Available variables: {df.columns}"
+            f"Available variables: {df.columns.tolist()}"
         )
     if var_y not in df.columns:
         raise ValueError(
             f"Cannot find {var_y=!r} in parameter sweep results. "
-            f"Available variables: {df.columns}"
+            f"Available variables: {df.columns.tolist()}"
         )

From 8150f441b075d0166958ce297189e61992ebf192 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 12:22:39 +0000
Subject: [PATCH 16/48] Fix nested

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/sweep/param_sweep.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py
index bddf0b98ae29..90006e02ba6a 100644
--- a/vllm/benchmarks/sweep/param_sweep.py
+++ b/vllm/benchmarks/sweep/param_sweep.py
@@ -40,6 +40,14 @@ def __or__(self, other: dict[str, Any]):
         return type(self)(super().__or__(other))
 
     # In JSON, we prefer "_"
     def _iter_param_key_candidates(self, param_key: str):
+        # Inner config arguments are not converted by the CLI
+        if "." in param_key:
+            prefix, rest = param_key.split(".", 1)
+            for prefix_candidate in self._iter_param_key_candidates(prefix):
+                yield prefix_candidate + "." 
+ rest + + return + yield param_key yield param_key.replace("-", "_") yield param_key.replace("_", "-") From 6ace5b2bae67f06d0893ace1164fab4889f46e68 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 12:55:38 +0000 Subject: [PATCH 17/48] Raise error if no data found Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index b98dbf801214..34284b0ec8bd 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -159,6 +159,9 @@ def plot( for run_data in _json_load_bytes(path) ] + if not all_data: + raise ValueError(f"Did not find any parameter sweep results under {output_dir}") + fig_dir.mkdir(parents=True, exist_ok=True) with ProcessPoolExecutor() as pool: From b3eb7cdb74f44b9995536de06287efeb06e58872 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 13:06:49 +0000 Subject: [PATCH 18/48] Show the problematic data item Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 34284b0ec8bd..66093cb32caf 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -19,6 +19,13 @@ def _json_load_bytes(path: Path) -> list[dict[str, object]]: return json.load(f) +def _get_metric(run_data: dict[str, object], metric_key: str): + try: + return run_data[metric_key] + except KeyError as exc: + raise ValueError(f"Cannot find metric {metric_key!r} in {run_data=}") from exc + + def _plot_fig( fig_path: Path, fig_title: str, @@ -179,7 +186,7 @@ def plot( ), full_groupby( all_data, - key=lambda item: tuple((k, str(item[k])) for k in fig_by), + key=lambda item: tuple((k, str(_get_metric(item, k))) for k in fig_by), ), ) From 8154e084b50277c5fa453031037e0a92418ac2d3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 13:19:06 +0000 Subject: [PATCH 19/48] Convert to string first Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 66093cb32caf..4783a34e570d 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -76,7 +76,7 @@ def _plot_fig( markers=True, ) else: - df["params"] = df[list(curve_by)].agg("-".join, axis=1) + df["params"] = df[list(curve_by)].astype(str).agg("-".join, axis=1) ax = sns.lineplot( df, x=var_x, From d9fcb097e7ed5a0bb5fc70c41b490a291daba710 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 13:30:55 +0000 Subject: [PATCH 20/48] Be more clear Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 4783a34e570d..b722d2b23ffb 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -76,7 +76,11 @@ def _plot_fig( markers=True, ) else: - df["params"] = df[list(curve_by)].astype(str).agg("-".join, axis=1) + df["params"] = pd.concat( + [k + "=" + df[k].astype(str) for k in curve_by], + axis=1, + ).agg("-".join, axis=1) + ax = sns.lineplot( df, x=var_x, From 9c0e9faf3359c39281bf37a05dc3963ac646fe07 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:09:12 +0000 Subject: [PATCH 21/48] Use seaborn grid Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py 
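A minimal sketch (toy data, hypothetical column names) of the curve-label idiom used in the two patches above; the `astype(str)` conversion is what keeps the row-wise join from failing on numeric columns:

```python
import pandas as pd

# Toy frame standing in for parameter sweep results; the column names are
# illustrative, not taken from an actual run.
df = pd.DataFrame(
    {
        "api_server_count": [1, 2],  # numeric, hence astype(str) below
        "max_num_batched_tokens": [2048, 8192],
    }
)
curve_by = ["api_server_count", "max_num_batched_tokens"]

# Each key becomes a "key=value" string column; the columns are then joined
# row-wise into one readable label per data point.
labels = pd.concat(
    [k + "=" + df[k].astype(str) for k in curve_by],
    axis=1,
).agg("-".join, axis=1)

print(labels.tolist())
# ['api_server_count=1-max_num_batched_tokens=2048',
#  'api_server_count=2-max_num_batched_tokens=8192']
```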
| 244 +++++++++++++++++++++------------- 1 file changed, 151 insertions(+), 93 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index b722d2b23ffb..f2dfb6ee1c73 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -2,14 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json -from collections.abc import Iterable from concurrent.futures import ProcessPoolExecutor from functools import partial from pathlib import Path +from types import TracebackType import matplotlib.pyplot as plt import pandas as pd import seaborn as sns +from typing_extensions import Self from vllm.utils.collections import full_groupby @@ -26,10 +27,51 @@ def _get_metric(run_data: dict[str, object], metric_key: str): raise ValueError(f"Cannot find metric {metric_key!r} in {run_data=}") from exc +def _get_group(run_data: dict[str, object], group_keys: list[str]): + return tuple((k, str(_get_metric(run_data, k))) for k in group_keys) + + +def _get_fig_path( + fig_dir: Path, + group: tuple[tuple[str, str], ...], +): + return fig_dir / ( + "-".join( + ( + "FIGURE-", + *(f"{k}={v}" for k, v in group), + ) + ) + .replace("/", "_") + .replace("..", "__") # Sanitize + + ".png" + ) + + +def _get_fig_title(group: tuple[tuple[str, str], ...]): + return ", ".join(f"{k}={v}" for k, v in group) if group else "(All)" + + +class DummyExecutor: + map = map + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + return None + + def _plot_fig( - fig_path: Path, - fig_title: str, - fig_data: list[dict[str, object]], + fig_dir: Path, + fig_group_data: tuple[tuple[tuple[str, str], ...], list[dict[str, object]]], + row_by: list[str], + col_by: list[str], curve_by: list[str], *, var_x: str, @@ -39,10 +81,31 @@ def _plot_fig( log_y: bool, dry_run: bool, ): + fig_group, fig_data = fig_group_data + + row_groups = full_groupby( + fig_data, + key=lambda item: _get_group(item, row_by), + ) + num_rows = len(row_groups) + num_cols = max( + len(full_groupby(row_data, key=lambda item: _get_group(item, col_by))) + for _, row_data in row_groups + ) + + fig_path = _get_fig_path(fig_dir, fig_group) + print("[BEGIN FIGURE]") + print(f"Group: {dict(fig_group)}") + print(f"Grid: {num_rows} rows x {num_cols} cols") print(f"Output file: {fig_path}") + if dry_run: + print("[END FIGURE]") + return + df = pd.DataFrame.from_records(fig_data) + if var_x not in df.columns: raise ValueError( f"Cannot find {var_x=!r} in parameter sweep results. 
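A short illustration of why the `DummyExecutor` above can stand in for a `ProcessPoolExecutor`: the builtin `map` is a type rather than a function, so the class attribute is not bound as an instance method and `executor.map(f, xs)` is exactly `map(f, xs)`. Since the builtin `map` is lazy, the results still have to be drained, hence the `all(...)` later in the file. A sketch, assuming the module from this series is importable:

```python
from vllm.benchmarks.sweep.plot import DummyExecutor

with DummyExecutor() as executor:
    # No worker processes are spawned; map() here is the lazy builtin,
    # so nothing runs until the iterator is consumed.
    results = executor.map(lambda x: x * 2, [1, 2, 3])
    print(list(results))  # [2, 4, 6]
```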
" @@ -60,14 +123,35 @@ def _plot_fig( if bin_x is not None: df[var_x] = df[var_x] // bin_x * bin_x - if dry_run: - print("[END FIGURE]") - return + df["row_group"] = ( + pd.concat( + [k + "=" + df[k].astype(str) for k in row_by], + axis=1, + ).agg("-".join, axis=1) + if row_by + else "(All)" + ) + + df["col_group"] = ( + pd.concat( + [k + "=" + df[k].astype(str) for k in col_by], + axis=1, + ).agg("-".join, axis=1) + if col_by + else "(All)" + ) + + g = sns.FacetGrid(df, row="row_group", col="col_group") + + g.set_titles("{row_name},{col_name}") + + if log_y: + g.set(yscale="log") if len(curve_by) <= 3: hue, style, size, *_ = (*curve_by, None, None) - ax = sns.lineplot( - df, + g.map_dataframe( + sns.lineplot, x=var_x, y=var_y, hue=hue, @@ -76,85 +160,37 @@ def _plot_fig( markers=True, ) else: - df["params"] = pd.concat( - [k + "=" + df[k].astype(str) for k in curve_by], - axis=1, - ).agg("-".join, axis=1) + df["curve_group"] = ( + pd.concat( + [k + "=" + df[k].astype(str) for k in curve_by], + axis=1, + ).agg("-".join, axis=1) + if curve_by + else "(All)" + ) - ax = sns.lineplot( - df, + g.map_dataframe( + sns.lineplot, x=var_x, y=var_y, - hue="params", + hue="curve_group", markers=True, ) - ax.set_title(fig_title) - - if log_y: - ax.set_yscale("log") - - sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) - - fig = ax.get_figure() - assert fig is not None + g.add_legend() - fig.tight_layout() - fig.savefig(fig_path) - plt.close(fig) + g.savefig(fig_path) + plt.close(g.figure) print("[END FIGURE]") -def _plot_fig_by_group( - fig_dir: Path, - fig_group_data: tuple[Iterable[tuple[str, str]], list[dict[str, object]]], - curve_by: list[str], - *, - var_x: str, - var_y: str, - max_x: float | None, - bin_x: float | None, - log_y: bool, - dry_run: bool, -): - fig_group, fig_data = fig_group_data - - fig_group = tuple(fig_group) - - fig_path = fig_dir / ( - "-".join( - ( - "FIGURE-", - *(f"{k}={v}" for k, v in fig_group), - ) - ) - .replace("/", "_") - .replace("..", "__") # Sanitize - + ".png" - ) - fig_title = ( - ", ".join(f"{k}={v}" for k, v in fig_group) if fig_group else "(All data)" - ) - - return _plot_fig( - fig_path, - fig_title, - fig_data, - curve_by, - var_x=var_x, - var_y=var_y, - max_x=max_x, - bin_x=bin_x, - log_y=log_y, - dry_run=dry_run, - ) - - def plot( output_dir: Path, fig_dir: Path, fig_by: list[str], + row_by: list[str], + col_by: list[str], curve_by: list[str], *, var_x: str, @@ -175,27 +211,31 @@ def plot( fig_dir.mkdir(parents=True, exist_ok=True) - with ProcessPoolExecutor() as pool: - out = pool.map( - partial( - _plot_fig_by_group, - fig_dir, - curve_by=curve_by, - var_x=var_x, - var_y=var_y, - max_x=max_x, - bin_x=bin_x, - log_y=log_y, - dry_run=dry_run, - ), - full_groupby( - all_data, - key=lambda item: tuple((k, str(_get_metric(item, k))) for k in fig_by), - ), - ) + fig_groups = full_groupby( + all_data, + key=lambda item: _get_group(item, fig_by), + ) - # Collect the results - all(out) + with DummyExecutor() if len(fig_groups) <= 1 else ProcessPoolExecutor() as executor: + # Resolve the iterable to ensure that the workers are run + all( + executor.map( + partial( + _plot_fig, + fig_dir, + row_by=row_by, + col_by=col_by, + curve_by=curve_by, + var_x=var_x, + var_y=var_y, + max_x=max_x, + bin_x=bin_x, + log_y=log_y, + dry_run=dry_run, + ), + fig_groups, + ) + ) def main(): @@ -223,6 +263,20 @@ def main(): help="A comma-separated list of variables, such that a separate curve " "is created for each combination of these variables.", ) + parser.add_argument( + 
"--col-by", + type=str, + default="", + help="A comma-separated list of variables, such that a separate column " + "is created for each combination of these variables.", + ) + parser.add_argument( + "--row-by", + type=str, + default="", + help="A comma-separated list of variables, such that a separate row " + "is created for each combination of these variables.", + ) parser.add_argument( "--fig-by", type=str, @@ -269,12 +323,16 @@ def main(): args = parser.parse_args() curve_by = [] if not args.curve_by else args.curve_by.split(",") + row_by = [] if not args.row_by else args.row_by.split(",") + col_by = [] if not args.col_by else args.col_by.split(",") fig_by = [] if not args.fig_by else args.fig_by.split(",") plot( output_dir=Path(args.OUTPUT_DIR), fig_dir=Path(args.fig_dir or args.OUTPUT_DIR), fig_by=fig_by, + row_by=row_by, + col_by=col_by, curve_by=curve_by, var_x=args.var_x, var_y=args.var_y, From f1810ccc5232c933aae938c121db530e0dc991bb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:15:54 +0000 Subject: [PATCH 22/48] Clean up Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index f2dfb6ee1c73..f722c19d95a6 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -143,13 +143,20 @@ def _plot_fig( g = sns.FacetGrid(df, row="row_group", col="col_group") - g.set_titles("{row_name},{col_name}") + if row_by and col_by: + g.set_titles("{row_name},{col_name}") + elif row_by: + g.set_titles("{row_name}") + elif col_by: + g.set_titles("{col_name}") + else: + g.set_titles("") if log_y: g.set(yscale="log") if len(curve_by) <= 3: - hue, style, size, *_ = (*curve_by, None, None) + hue, style, size, *_ = (*curve_by, None, None, None) g.map_dataframe( sns.lineplot, x=var_x, @@ -259,7 +266,7 @@ def main(): parser.add_argument( "--curve-by", type=str, - required=True, + default=None, help="A comma-separated list of variables, such that a separate curve " "is created for each combination of these variables.", ) From dac464b6665fbc054baf928513d9c612b3e65162 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:17:37 +0000 Subject: [PATCH 23/48] Clean Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index f722c19d95a6..192709abe015 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -127,7 +127,7 @@ def _plot_fig( pd.concat( [k + "=" + df[k].astype(str) for k in row_by], axis=1, - ).agg("-".join, axis=1) + ).agg("\n".join, axis=1) if row_by else "(All)" ) @@ -136,7 +136,7 @@ def _plot_fig( pd.concat( [k + "=" + df[k].astype(str) for k in col_by], axis=1, - ).agg("-".join, axis=1) + ).agg("\n".join, axis=1) if col_by else "(All)" ) @@ -144,7 +144,7 @@ def _plot_fig( g = sns.FacetGrid(df, row="row_group", col="col_group") if row_by and col_by: - g.set_titles("{row_name},{col_name}") + g.set_titles("{row_name}\n{col_name}") elif row_by: g.set_titles("{row_name}") elif col_by: @@ -157,6 +157,7 @@ def _plot_fig( if len(curve_by) <= 3: hue, style, size, *_ = (*curve_by, None, None, None) + g.map_dataframe( sns.lineplot, x=var_x, @@ -171,7 +172,7 @@ def _plot_fig( pd.concat( [k + "=" + df[k].astype(str) for k in curve_by], axis=1, - ).agg("-".join, axis=1) + ).agg("\n".join, axis=1) if curve_by else "(All)" ) From 
73c911b28be6b803a65c4c463c478a5f433666b7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:19:59 +0000 Subject: [PATCH 24/48] TODO Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 192709abe015..9ed3e4ca09e5 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -31,10 +31,7 @@ def _get_group(run_data: dict[str, object], group_keys: list[str]): return tuple((k, str(_get_metric(run_data, k))) for k in group_keys) -def _get_fig_path( - fig_dir: Path, - group: tuple[tuple[str, str], ...], -): +def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...]): return fig_dir / ( "-".join( ( @@ -48,10 +45,6 @@ def _get_fig_path( ) -def _get_fig_title(group: tuple[tuple[str, str], ...]): - return ", ".join(f"{k}={v}" for k, v in group) if group else "(All)" - - class DummyExecutor: map = map @@ -117,6 +110,8 @@ def _plot_fig( f"Available variables: {df.columns.tolist()}" ) + # TODO: Support syntax + # e.g. request_rate<=1024%2 means max of 1024 and bin size of 2 if max_x is not None: df = df[df[var_x] <= max_x] From eef9c40907382f5e94268a1e6a79c24b14f717d3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:24:20 +0000 Subject: [PATCH 25/48] Clean Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 +- vllm/benchmarks/sweep/serve.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 9ed3e4ca09e5..ffc15af46021 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -35,7 +35,7 @@ def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...]): return fig_dir / ( "-".join( ( - "FIGURE-", + "FIGURE" + ("-" if group else ""), *(f"{k}={v}" for k, v in group), ) ) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index 5751e326578b..5599a5dbc78b 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -108,9 +108,9 @@ def _get_comb_base_path( ): return output_dir / "-".join( ( - "SERVE-", + "SERVE" + ("-" if serve_comb else ""), serve_comb.as_text(sep="-"), - "BENCH-", + "BENCH" + ("-" if bench_comb else ""), bench_comb.as_text(sep="-"), ) ).replace("/", "_").replace("..", "__") # Sanitize @@ -228,9 +228,9 @@ def _get_sla_base_path( ): return output_dir / "-".join( ( - "SERVE-", + "SERVE" + ("-" if serve_comb else ""), serve_comb.as_text(sep="-"), - "BENCH-", + "BENCH" + ("-" if bench_comb else ""), bench_comb.as_text(sep="-"), ) ).replace("/", "_").replace("..", "__") # Sanitize From aa9615149480f07cb64d69d549b11bfe858ad3fb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:47:35 +0000 Subject: [PATCH 26/48] Generalized filter and binning Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 5 +- vllm/benchmarks/sweep/plot.py | 201 +++++++++++++++++++++++++++----- 2 files changed, 172 insertions(+), 34 deletions(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index ae8582113f26..6ff475422e9f 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -1075,9 +1075,10 @@ Example command: ```bash python vllm/benchmarks/sweep/plot.py benchmarks/results/ \ --var-x max_concurrency \ - --max-x 1024 \ --curve-by api_server_count,max_num_batched_tokens \ - --file-by random_input_len,random_output_len + 
--row-by random_input_len \ + --col-by random_output_len \ + --filter-by 'max_concurrency<=1024' ``` !!! tip diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index ffc15af46021..53887c82ee49 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json +from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor from functools import partial from pathlib import Path @@ -10,11 +11,130 @@ import matplotlib.pyplot as plt import pandas as pd import seaborn as sns -from typing_extensions import Self +from typing_extensions import Self, override from vllm.utils.collections import full_groupby +class PlotFilterBase(ABC): + @classmethod + def parse_str(cls, s: str): + for op_key in PLOT_FILTERS: + if op_key in s: + key, value = s.split(op_key) + return PLOT_FILTERS[op_key](key, float(value.removeprefix(op_key))) + else: + raise ValueError( + f"Invalid operator for plot filter '{s}'. " + f"Valid operators are: {set(PLOT_FILTERS)}", + ) + + def __init__(self, var: str, target: float) -> None: + super().__init__() + + self.var = var + self.target = target + + @abstractmethod + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + """Applies this filter to a DataFrame.""" + raise NotImplementedError + + +class PlotLessThan(PlotFilterBase): + @override + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + return df[df[self.var] < self.target] + + +class PlotLessThanOrEqual(PlotFilterBase): + @override + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + return df[df[self.var] <= self.target] + + +class PlotGreaterThan(PlotFilterBase): + @override + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + return df[df[self.var] > self.target] + + +class PlotGreaterThanOrEqual(PlotFilterBase): + @override + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + return df[df[self.var] >= self.target] + + +# NOTE: The ordering is important! Match longer op_keys first +PLOT_FILTERS: dict[str, type[PlotFilterBase]] = { + "<=": PlotLessThanOrEqual, + ">=": PlotGreaterThanOrEqual, + "<": PlotLessThan, + ">": PlotGreaterThan, +} + + +class PlotFilters(list[PlotFilterBase]): + @classmethod + def parse_str(cls, s: str): + if not s: + return cls() + + return cls(PlotFilterBase.parse_str(e) for e in s.split(",")) + + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + for item in self: + df = item.apply(df) + + return df + + +class PlotBinner: + @classmethod + def parse_str(cls, s: str): + for op_key in PLOT_BINNERS: + if op_key in s: + key, value = s.split(op_key) + return PLOT_BINNERS[op_key](key, float(value.removeprefix(op_key))) + else: + raise ValueError( + f"Invalid operator for plot binner '{s}'. 
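A quick sanity check of the comparison parser defined above, runnable once this module is importable. Because `PLOT_FILTERS` matches `<=` before `<`, `'max_concurrency<=1024'` parses as a single less-than-or-equal filter rather than `<` with the unparseable target `'=1024'`:

```python
import pandas as pd

from vllm.benchmarks.sweep.plot import PlotFilters

df = pd.DataFrame({"max_concurrency": [256, 1024, 4096]})

filters = PlotFilters.parse_str("max_concurrency<=1024")
print(filters.apply(df)["max_concurrency"].tolist())  # [256, 1024]
```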
" + f"Valid operators are: {set(PLOT_BINNERS)}", + ) + + def __init__(self, var: str, bin_size: float) -> None: + super().__init__() + + self.var = var + self.bin_size = bin_size + + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + """Applies this binner to a DataFrame.""" + df = df.copy() + df[self.var] = df[self.var] // self.bin_size * self.bin_size + return df + + +PLOT_BINNERS: dict[str, type[PlotBinner]] = { + "@": PlotBinner, +} + + +class PlotBinners(list[PlotBinner]): + @classmethod + def parse_str(cls, s: str): + if not s: + return cls() + + return cls(PlotBinner.parse_str(e) for e in s.split(",")) + + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + for item in self: + df = item.apply(df) + + return df + + def _json_load_bytes(path: Path) -> list[dict[str, object]]: with path.open("rb") as f: return json.load(f) @@ -69,9 +189,10 @@ def _plot_fig( *, var_x: str, var_y: str, - max_x: float | None, - bin_x: float | None, - log_y: bool, + filter_by: PlotFilters, + bin_by: PlotBinners, + scale_x: str | None, + scale_y: str | None, dry_run: bool, ): fig_group, fig_data = fig_group_data @@ -110,13 +231,8 @@ def _plot_fig( f"Available variables: {df.columns.tolist()}" ) - # TODO: Support syntax - # e.g. request_rate<=1024%2 means max of 1024 and bin size of 2 - if max_x is not None: - df = df[df[var_x] <= max_x] - - if bin_x is not None: - df[var_x] = df[var_x] // bin_x * bin_x + df = filter_by.apply(df) + df = bin_by.apply(df) df["row_group"] = ( pd.concat( @@ -147,8 +263,10 @@ def _plot_fig( else: g.set_titles("") - if log_y: - g.set(yscale="log") + if scale_x: + g.set(xscale=scale_x) + if scale_y: + g.set(yscale=scale_y) if len(curve_by) <= 3: hue, style, size, *_ = (*curve_by, None, None, None) @@ -198,9 +316,10 @@ def plot( *, var_x: str, var_y: str, - max_x: float | None, - bin_x: float | None, - log_y: bool, + filter_by: PlotFilters, + bin_by: PlotBinners, + scale_x: str | None, + scale_y: str | None, dry_run: bool, ): all_data = [ @@ -231,9 +350,10 @@ def plot( curve_by=curve_by, var_x=var_x, var_y=var_y, - max_x=max_x, - bin_x=bin_x, - log_y=log_y, + filter_by=filter_by, + bin_by=bin_by, + scale_x=scale_x, + scale_y=scale_y, dry_run=dry_run, ), fig_groups, @@ -300,22 +420,38 @@ def main(): help="The variable for the y-axis", ) parser.add_argument( - "--max-x", - type=float, - default=None, - help="The maximum value to plot for the x-axis.", + "--filter-by", + type=str, + default="", + help="A comma-separated list of statements indicating values to filter by. " + "This is useful to remove outliers. " + "Example: `max_concurrency<1000,max_num_batched_tokens<=4096` means " + "plot only the points where `max_concurrency` is less than 1000 and " + "`max_num_batched_tokens` is no greater than 4096.", ) parser.add_argument( - "--bin-x", - type=float, + "--bin-by", + type=str, + default="", + help="A comma-separated list of statements indicating values to bin by. " + "This is useful to avoid plotting points that are too close together. " + "Example: `request_throughput%1` means " + "use a bin size of 1 for the `request_throughput` variable.", + ) + parser.add_argument( + "--scale-x", + type=str, default=None, - help="Group together points with x-axis values in the same bin " - "to reduce noise.", + help="The scale to use for the x-axis. " + "Currently only accepts string values such as 'log' and 'sqrt'. 
" + "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", ) parser.add_argument( - "--log-y", + "--scale-y", action="store_true", - help="Use logarithmic scaling for the y-axis.", + help="The scale to use for the y-axis. " + "Currently only accepts string values such as 'log' and 'sqrt'. " + "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", ) parser.add_argument( "--dry-run", @@ -339,9 +475,10 @@ def main(): curve_by=curve_by, var_x=args.var_x, var_y=args.var_y, - max_x=args.max_x, - bin_x=args.bin_x, - log_y=args.log_y, + filter_by=PlotFilters.parse_str(args.filter_by), + bin_by=PlotBinners.parse_str(args.bin_by), + scale_x=args.scale_x, + scale_y=args.scale_y, dry_run=args.dry_run, ) From 0b984965a17d36c46ab480e75c699ca01d2e51fe Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:49:08 +0000 Subject: [PATCH 27/48] Remove old script Signed-off-by: DarkLight1337 --- vllm/benchmarks/serve_multi.py | 1157 -------------------------------- 1 file changed, 1157 deletions(-) delete mode 100644 vllm/benchmarks/serve_multi.py diff --git a/vllm/benchmarks/serve_multi.py b/vllm/benchmarks/serve_multi.py deleted file mode 100644 index e8524473aedd..000000000000 --- a/vllm/benchmarks/serve_multi.py +++ /dev/null @@ -1,1157 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import contextlib -import json -import math -import os -import shlex -import signal -import subprocess -from abc import ABC, abstractmethod -from datetime import datetime -from pathlib import Path -from typing import Literal, get_args - -import pandas as pd -import requests -import seaborn as sns -from typing_extensions import assert_never, override - -_BAD_PARAMS_TYPE_MSG = ( - "The parameters to vary should be expressed as a JSON list of dictionaries." 
-) - - -def _parse_params(params: list[dict[str, object]]): - if not isinstance(params, list): - raise TypeError(f"{_BAD_PARAMS_TYPE_MSG} Found JSON type {type(params)}") - - for comb in params: - if not isinstance(comb, dict): - raise TypeError(f"{_BAD_PARAMS_TYPE_MSG} Found item type {type(comb)}") - - return params - - -class SLACriterionBase(ABC): - def __init__(self, target: float) -> None: - super().__init__() - - self.target = target - - @abstractmethod - def validate(self, actual: float) -> bool: - """Return `True` if this criterion is met; otherwise `False`.""" - raise NotImplementedError - - @abstractmethod - def format_cond(self, lhs: str) -> str: - raise NotImplementedError - - def print_and_validate( - self, - metrics: dict[str, float], - metrics_key: str, - ) -> bool: - metric = metrics[metrics_key] - result = self.validate(metric) - - cond = self.format_cond(f"{metrics_key} = {metric:.2f}") - print(f"Validating SLA: {cond} | " + ("PASSED" if result else "FAILED")) - - return result - - -class SLALessThan(SLACriterionBase): - @override - def validate(self, actual: float) -> bool: - return actual < self.target - - @override - def format_cond(self, lhs: str) -> str: - return f"{lhs}<{self.target:.2f}" - - -class SLALessThanOrEqual(SLACriterionBase): - @override - def validate(self, actual: float) -> bool: - return actual <= self.target - - @override - def format_cond(self, lhs: str) -> str: - return f"{lhs}<={self.target:.2f}" - - -class SLAGreaterThan(SLACriterionBase): - @override - def validate(self, actual: float) -> bool: - return actual > self.target - - @override - def format_cond(self, lhs: str) -> str: - return f"{lhs}>{self.target:.2f}" - - -class SLAGreaterThanOrEqual(SLACriterionBase): - @override - def validate(self, actual: float) -> bool: - return actual >= self.target - - @override - def format_cond(self, lhs: str) -> str: - return f"{lhs}>={self.target:.2f}" - - -# NOTE: The ordering is important! Match longer op_keys first -SLA_CRITERIA: dict[str, type[SLACriterionBase]] = { - "<=": SLALessThanOrEqual, - ">=": SLAGreaterThanOrEqual, - "<": SLALessThan, - ">": SLAGreaterThan, -} - - -def _parse_sla_item(sla_item: dict[str, str]): - sla_criteria: dict[str, SLACriterionBase] = {} - - for metric_key, metric_value in sla_item.items(): - for op_key in SLA_CRITERIA: - if metric_value.startswith(op_key): - sla_criteria[metric_key] = SLA_CRITERIA[op_key]( - float(metric_value.removeprefix(op_key)) - ) - break - else: - raise ValueError( - f"Invalid operator for SLA constraint '{metric_key}={metric_value}'. 
" - f"Valid operators are: {set(SLA_CRITERIA)}", - ) - - return sla_criteria - - -def _parse_sla(sla: list[dict[str, str]]): - return [_parse_sla_item(item) for item in sla] - - -# In JSON, we prefer "_" -def _iter_param_key_candidates(param_key: str): - yield param_key - yield param_key.replace("-", "_") - yield param_key.replace("_", "-") - - -# In CLI, we prefer "-" -def _iter_cmd_key_candidates(param_key: str): - for k in reversed(tuple(_iter_param_key_candidates(param_key))): - yield "--" + k - - -def _normalize_cmd_key(param_key: str): - return next(_iter_cmd_key_candidates(param_key)) - - -def _override_args(cmd: list[str], params: dict[str, object]): - cmd = list(cmd) - - for k, v in params.items(): - for k_candidate in _iter_cmd_key_candidates(k): - try: - k_idx = cmd.index(k_candidate) - - if isinstance(v, bool): - cmd[k_idx] = _normalize_cmd_key(k if v else "no-" + k) - else: - cmd[k_idx + 1] = str(v) - - break - except ValueError: - continue - else: - if isinstance(v, bool): - cmd.append(_normalize_cmd_key(k if v else "no-" + k)) - else: - cmd.extend([_normalize_cmd_key(k), str(v)]) - - return cmd - - -class ServerWrapper: - def __init__( - self, - server_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - ) -> None: - super().__init__() - - self.server_cmd = server_cmd - self.after_bench_cmd = after_bench_cmd - self.show_stdout = show_stdout - - def run_subcommand(self, cmd: list[str]): - return subprocess.run( - cmd, - stdout=None if self.show_stdout else subprocess.DEVNULL, - check=True, - ) - - def after_bench(self) -> None: - if not self.after_bench_cmd: - self.reset_caches() - return - - self.run_subcommand(self.after_bench_cmd) - - def _get_vllm_server_address(self) -> str: - server_cmd = self.server_cmd - - for host_key in ("--host",): - if host_key in server_cmd: - host = server_cmd[server_cmd.index(host_key) + 1] - break - else: - host = "localhost" - - for port_key in ("-p", "--port"): - if port_key in server_cmd: - port = int(server_cmd[server_cmd.index(port_key) + 1]) - break - else: - port = 8000 # The default value in vllm serve - - return f"http://{host}:{port}" - - def reset_caches(self) -> None: - server_cmd = self.server_cmd - - # Use `.endswith()` to match `/bin/...` - if server_cmd[0].endswith("vllm"): - server_address = self._get_vllm_server_address() - print(f"Resetting caches at {server_address}") - - res = requests.post(f"{server_address}/reset_prefix_cache") - res.raise_for_status() - - res = requests.post(f"{server_address}/reset_mm_cache") - res.raise_for_status() - elif server_cmd[0].endswith("infinity_emb"): - if "--vector-disk-cache" in server_cmd: - raise NotImplementedError( - "Infinity server uses caching but does not expose a method " - "to reset the cache" - ) - else: - raise NotImplementedError( - f"No implementation of `reset_caches` for `{server_cmd[0]}` server. " - "Please specify a custom command via `--after-bench-cmd`." 
- ) - - -@contextlib.contextmanager -def _run_server( - serve_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_overrides: dict[str, object], - dry_run: bool, -): - server_cmd = _override_args(serve_cmd, serve_overrides) - - print("[BEGIN SERVER]") - print(f"Server overrides: {serve_overrides}") - print(f"Server command: {server_cmd}") - - if dry_run: - yield None - print("[END SERVER]") - return - - # Create new process group for clean termination - server_process = subprocess.Popen( - server_cmd, - start_new_session=True, - stdout=None if show_stdout else subprocess.DEVNULL, - # Need VLLM_SERVER_DEV_MODE=1 for `_reset_caches` - env={**os.environ, "VLLM_SERVER_DEV_MODE": "1"}, - ) - - try: - yield ServerWrapper( - server_cmd, - after_bench_cmd, - show_stdout=show_stdout, - ) - finally: - if server_process.poll() is None: - # In case only some processes have been terminated - with contextlib.suppress(ProcessLookupError): - # We need to kill both API Server and Engine processes - os.killpg(os.getpgid(server_process.pid), signal.SIGKILL) - - print("[END SERVER]") - - -def _run_benchmark( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_overrides: dict[str, object], - bench_overrides: dict[str, object], - run_number: int, - output_path: Path, - dry_run: bool, -): - benchmark_cmd = [ - *_override_args(bench_cmd, bench_overrides), - "--save-result", - "--result-dir", - str(output_path.parent), - "--result-filename", - output_path.name, - ] - - print("[BEGIN BENCHMARK]") - print(f"Benchmark overrides: {bench_overrides}") - print(f"Run Number: {run_number}") - print(f"Benchmark command: {benchmark_cmd}") - print(f"Output file: {output_path}") - - run_data: dict[str, object] - - if output_path.exists(): - print("Found existing results. 
Skipping.") - - with output_path.open("rb") as f: - run_data = json.load(f) - return run_data - - if server is None: - assert dry_run - print("[END BENCHMARK]") - return None - - output_path.parent.mkdir(parents=True, exist_ok=True) - - server.run_subcommand(benchmark_cmd) - server.after_bench() - - with output_path.open("rb") as f: - run_data = json.load(f) - - run_data["run_number"] = run_number - run_data.update(serve_overrides) - - with output_path.open("w") as f: - json.dump(run_data, f, indent=4) - - print("[END BENCHMARK]") - - return run_data - - -def _get_comb_base_path( - output_dir: Path, - serve_comb: dict[str, object], - bench_comb: dict[str, object], -): - return output_dir / "-".join( - ( - "SERVE", - *(f"{k}={v}" for k, v in serve_comb.items()), - "BENCH", - *(f"{k}={v}" for k, v in bench_comb.items()), - ) - ).replace("/", "_").replace("..", "__") # Sanitize - - -def _get_comb_run_path(base_path: Path, run_number: int | None): - if run_number is None: - return base_path / "summary.json" - - return base_path / f"run={run_number}.json" - - -def _comb_needs_server( - serve_comb: dict[str, object], - bench_combs: list[dict[str, object]], - output_dir: Path, -): - for bench_comb in bench_combs: - base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) - if not _get_comb_run_path(base_path, run_number=None).exists(): - return True - - return False - - -def _run_comb( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_comb: dict[str, object], - bench_comb: dict[str, object], - base_path: Path, - num_runs: int, - dry_run: bool, -): - comb_data = list[dict[str, object]]() - - for run_number in range(num_runs): - run_data = _run_benchmark( - server, - bench_cmd, - serve_overrides=serve_comb, - bench_overrides=bench_comb, - run_number=run_number, - output_path=_get_comb_run_path(base_path, run_number), - dry_run=dry_run, - ) - - if run_data is not None: - comb_data.append(run_data) - - if dry_run: - return None - - with _get_comb_run_path(base_path, run_number=None).open("w") as f: - json.dump(comb_data, f, indent=4) - - return comb_data - - -def run_combs( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: list[dict[str, object]], - bench_params: list[dict[str, object]], - output_dir: Path, - num_runs: int, - dry_run: bool, -): - all_data = list[dict[str, object]]() - for serve_comb in serve_params: - with ( - _run_server( - serve_cmd, - after_bench_cmd, - show_stdout=show_stdout, - serve_overrides=serve_comb, - dry_run=dry_run, - ) - if _comb_needs_server(serve_comb, bench_params, output_dir) - else contextlib.nullcontext() - ) as server: - for bench_comb in bench_params: - base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) - - comb_data = _run_comb( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - ) - - if comb_data is not None: - all_data.extend(comb_data) - - if dry_run: - return None - - combined_df = pd.DataFrame.from_records(all_data) - combined_df.to_csv(output_dir / "summary.csv") - - return combined_df - - -def _get_sla_base_path( - output_dir: Path, - serve_comb: dict[str, object], - bench_comb: dict[str, object], -): - return output_dir / "-".join( - ( - "SERVE", - *(f"{k}={v}" for k, v in serve_comb.items()), - "BENCH", - *(f"{k}={v}" for k, v in bench_comb.items()), - ) - ).replace("/", "_").replace("..", "__") # Sanitize - - -def _get_sla_iter_path( - base_path: 
Path, - sla_comb: dict[str, SLACriterionBase], - sla_variable: str, - sla_value: int | None, -): - if sla_value is None: - prefix = "-".join(v.format_cond(k) for k, v in sla_comb.items()) - return base_path / f"SLA-{prefix}.json" - - return base_path / f"{sla_variable}={sla_value}" - - -def _get_sla_run_path(iter_path: Path, run_number: int | None): - if run_number is None: - return iter_path / "summary.json" - - return iter_path / f"run={run_number}.json" - - -def _sla_needs_server( - serve_comb: dict[str, object], - bench_combs: list[dict[str, object]], - sla_combs: list[dict[str, SLACriterionBase]], - sla_variable: str, - output_dir: Path, -): - for bench_comb in bench_combs: - base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) - for sla_comb in sla_combs: - if not _get_sla_iter_path( - base_path, - sla_comb, - sla_variable, - sla_value=None, - ).exists(): - return True - - return False - - -def _run_sla( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_comb: dict[str, object], - bench_comb: dict[str, object], - iter_path: Path, - num_runs: int, - dry_run: bool, -): - iter_data = list[dict[str, object]]() - - for run_number in range(num_runs): - run_data = _run_benchmark( - server, - bench_cmd, - serve_overrides=serve_comb, - bench_overrides=bench_comb, - run_number=run_number, - output_path=_get_sla_run_path(iter_path, run_number), - dry_run=dry_run, - ) - - if run_data is not None: - iter_data.append(run_data) - - if dry_run: - return None - - with _get_sla_run_path(iter_path, run_number=None).open("w") as f: - json.dump(iter_data, f, indent=4) - - return iter_data - - -SLAVariable = Literal["request_rate", "max_concurrency"] - - -def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable): - request_throughput = float(run_data["request_throughput"]) # type: ignore - if sla_variable == "request_rate": - return request_throughput - if sla_variable == "max_concurrency": - mean_latency_ms = float(run_data["mean_e2el_ms"]) # type: ignore - return request_throughput * mean_latency_ms / 1000 - - assert_never(sla_variable) - - -def _estimate_sla_bounds( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_comb: dict[str, object], - bench_comb: dict[str, object], - sla_comb: dict[str, SLACriterionBase], - base_path: Path, - num_runs: int, - dry_run: bool, - sla_variable: SLAVariable, - init_value: int, - max_value: int, -): - sla_data = list[dict[str, object]]() - - max_passing: int = 0 - min_failing: int = 0 - - val: int = init_value - assert val > 0 - - while True: - print(f"Testing {sla_variable}: {val} req/s") - - iter_data = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb={**bench_comb, sla_variable: val}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), - num_runs=num_runs, - dry_run=dry_run, - ) - - assert iter_data is not None - sla_data.extend(iter_data) - - iter_data_mean = { - k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore - for k in sla_comb - } - - sla_results = [ - criterion.print_and_validate(iter_data_mean, k) - for k, criterion in sla_comb.items() - ] - - if all(sla_results): - print("SLA criteria are met.") - max_passing = val - val *= 2 - else: - print("SLA criteria are not met.") - min_failing = val - break - - if val >= max_value: - break - - return sla_data, (max_passing, min_failing) - - -def _find_sla_value( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_comb: dict[str, object], - bench_comb: 
dict[str, object], - sla_comb: dict[str, SLACriterionBase], - base_path: Path, - num_runs: int, - dry_run: bool, - sla_variable: SLAVariable, - min_value: int, - max_value: int, -): - sla_data = list[dict[str, object]]() - - left: int = min_value - right: int = max_value - - while True: - val = (left + right) // 2 - print(f"Testing {sla_variable}: {val} req/s") - - iter_data = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb={**bench_comb, sla_variable: val}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), - num_runs=num_runs, - dry_run=dry_run, - ) - - assert iter_data is not None - sla_data.extend(iter_data) - - iter_data_mean = { - k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore - for k in sla_comb - } - - sla_results = [ - criterion.print_and_validate(iter_data_mean, k) - for k, criterion in sla_comb.items() - ] - - if all(sla_results): - print("SLA criteria are met.") - left = val - else: - print("SLA criteria are not met.") - right = val - - if right - left <= 1: - break - - return sla_data, left - - -def _search_sla( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_comb: dict[str, object], - bench_comb: dict[str, object], - sla_comb: dict[str, SLACriterionBase], - sla_variable: SLAVariable, - sla_inf_value: int = 65536, # The value that represents infinite QPS - base_path: Path, - num_runs: int, - dry_run: bool, -): - print("[SLA START]") - print(f"SLA criteria: {', '.join(v.format_cond(k) for k, v in sla_comb.items())}") - - sla_data_0 = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb={**bench_comb, sla_variable: sla_inf_value}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value), - num_runs=num_runs, - dry_run=dry_run, - ) - if sla_data_0 is None: - assert dry_run - print("Omitting SLA search.") - print("[SLA END]") - return None - - sla_init_value = math.ceil( - sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0) - / len(sla_data_0) - ) - print(f"Initial {sla_variable} to search: {sla_init_value} req/s.") - - sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - sla_variable=sla_variable, - init_value=sla_init_value, - max_value=sla_inf_value, - ) - print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.") - - sla_data_2, sla_value = _find_sla_value( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - sla_variable=sla_variable, - min_value=sla_min, - max_value=sla_max, - ) - - sla_data = sla_data_0 + sla_data_1 + sla_data_2 - print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.") - - with _get_sla_iter_path( - base_path, - sla_comb, - sla_variable, - sla_value=None, - ).open("w") as f: - json.dump(sla_data, f, indent=4) - - print("[SLA END]") - - return sla_data - - -def _plot_throughput_latency_curve( - all_data: list[dict[str, object]], - serve_combs: list[dict[str, object]], - bench_comb: dict[str, object], - output_dir: Path, -): - fig_path = output_dir / "-".join( - ( - "BENCH", - *(f"{k}={v}" for k, v in bench_comb.items()), - ) - ).replace("/", "_").replace("..", "__") # Sanitize - - df = pd.DataFrame.from_records( - [item for item in all_data if all(item[k] == bench_comb[k] for k in bench_comb)] - ) - - # Group together 
points with similar throughput - df["request_throughput"] = df["request_throughput"].round() - - # Preserve the key order using dictionary - all_comb_keys = {k: None for comb in serve_combs for k in comb} - for k in all_comb_keys: - df[k] = df[k].astype(str) - - keys_per_comb = [comb.keys() for comb in serve_combs] - if ( - all(ks == keys_per_comb[0] for ks in keys_per_comb) - and len(keys_per_comb[0]) <= 3 - ): - hue, style, size, *_ = (*keys_per_comb[0], None, None) - ax = sns.lineplot( - df, - x="request_throughput", - y="p99_e2el_ms", - hue=hue, - style=style, - size=size, - markers=True, - ) - else: - df["category"] = df[list(all_comb_keys)].agg("-".join, axis=1) - ax = sns.lineplot( - df, - x="request_throughput", - y="p99_e2el_ms", - hue="category", - markers=True, - ) - - sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) - - fig = ax.get_figure() - assert fig is not None - - fig.tight_layout() - fig.savefig(fig_path) - - -def _plot_throughput_latency_curves( - all_data: list[dict[str, object]], - serve_combs: list[dict[str, object]], - bench_combs: list[dict[str, object]], - output_dir: Path, -): - for bench_comb in bench_combs: - _plot_throughput_latency_curve(all_data, serve_combs, bench_comb, output_dir) - - -def run_slas( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: list[dict[str, object]], - bench_params: list[dict[str, object]], - sla_params: list[dict[str, SLACriterionBase]], - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, -): - if any( - k in bench_comb - for bench_comb in bench_params - for k in _iter_param_key_candidates(sla_variable) - ): - raise ValueError( - f"You should not override `{sla_variable}` in `bench_params` in SLA mode, " - "since it is supposed to be determined automatically." 
- ) - - all_data = list[dict[str, object]]() - for serve_comb in serve_params: - with ( - _run_server( - serve_cmd, - after_bench_cmd, - show_stdout=show_stdout, - serve_overrides=serve_comb, - dry_run=dry_run, - ) - if _sla_needs_server( - serve_comb, - bench_params, - sla_params, - sla_variable, - output_dir, - ) - else contextlib.nullcontext() - ) as server: - for bench_comb in bench_params: - for sla_comb in sla_params: - base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) - - comb_data = _search_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - sla_variable=sla_variable, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - ) - - if comb_data is not None: - all_data.extend(comb_data) - - if dry_run: - return None - - combined_df = pd.DataFrame.from_records(all_data) - combined_df.to_csv(output_dir / "summary.csv") - - _plot_throughput_latency_curves(all_data, serve_params, bench_params, output_dir) - - return combined_df - - -def _run_main( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: list[dict[str, object]], - bench_params: list[dict[str, object]], - sla_params: list[dict[str, SLACriterionBase]], - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, -): - if sla_params: - return run_slas( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, - serve_params=serve_params, - bench_params=bench_params, - sla_params=sla_params, - sla_variable=sla_variable, - output_dir=output_dir, - num_runs=num_runs, - dry_run=dry_run, - ) - - return run_combs( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, - serve_params=serve_params, - bench_params=bench_params, - output_dir=output_dir, - num_runs=num_runs, - dry_run=dry_run, - ) - - -def run_main( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: list[dict[str, object]], - bench_params: list[dict[str, object]], - sla_params: list[dict[str, SLACriterionBase]], - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, - resume: str | None, -): - timestamp = resume or datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = output_dir / timestamp - - if resume and not output_dir.exists(): - raise ValueError(f"Cannot resume from non-existent directory ({output_dir})") - - try: - return _run_main( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, - serve_params=serve_params, - bench_params=bench_params, - sla_params=sla_params, - sla_variable=sla_variable, - output_dir=output_dir, - num_runs=num_runs, - dry_run=dry_run, - ) - except BaseException as exc: - raise RuntimeError( - f"The script was terminated early. Use `--resume {timestamp}` " - f"to continue the script from its last checkpoint." - ) from exc - - -def main(): - parser = argparse.ArgumentParser( - description="Run vLLM server benchmark on a parameter grid of settings." 
- ) - parser.add_argument( - "--serve-cmd", - type=str, - required=True, - help="The command used to run the server: `vllm serve ...`", - ) - parser.add_argument( - "--bench-cmd", - type=str, - required=True, - help="The command used to run the benchmark: `vllm bench serve ...`", - ) - parser.add_argument( - "--after-bench-cmd", - type=str, - default=None, - help="After a benchmark run is complete, invoke this command instead of the " - "default `ServerWrapper.clear_cache()`.", - ) - parser.add_argument( - "--show-stdout", - action="store_true", - help="If set, logs the standard output of subcommands. " - "Useful for debugging but can be quite spammy.", - ) - parser.add_argument( - "--serve-params", - type=str, - default=None, - help="Path to JSON file containing a list of parameter combinations " - "for the `vllm serve` command. " - "If both `serve_params` and `bench_params` are given, " - "this script will iterate over their Cartesian product.", - ) - parser.add_argument( - "--bench-params", - type=str, - default=None, - help="Path to JSON file containing a list of parameter combinations " - "for the `vllm bench serve` command. " - "If both `serve_params` and `bench_params` are given, " - "this script will iterate over their Cartesian product.", - ) - parser.add_argument( - "--sla-params", - type=str, - default=None, - help="Path to JSON file containing a list of SLA constraints to satisfy. " - 'Each constraint is expressed in `{"": ""}` format, ' - 'e.g.: `{"p99_e2el_ms": "<=500"}` means that ' - "the E2E latency should be less than 500ms 99% of the time. " - "Setting this option runs this script in SLA mode, which searches for the " - "maximum `sla_variable` that satisfies the constraints for each combination " - "of `serve_params`, `bench_params`, and `sla_params`.", - ) - parser.add_argument( - "--sla-variable", - type=str, - choices=get_args(SLAVariable), - default="request_rate", - help="Whether to tune request rate or maximum concurrency to satisfy " - "the SLA constraints.", - ) - parser.add_argument( - "-o", - "--output-dir", - type=str, - default="results", - help="The directory to which results are written.", - ) - parser.add_argument( - "--num-runs", - type=int, - default=3, - help="Number of runs per parameter combination.", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="If set, prints the commands to run then exits without running them.", - ) - parser.add_argument( - "--resume", - type=str, - default=None, - help="Set this to the name of a directory under `output_dir` (which is a " - "timestamp) to resume a previous execution of this script, i.e., only run " - "parameter combinations for which there are still no output files.", - ) - - args = parser.parse_args() - - serve_cmd = shlex.split(args.serve_cmd) - bench_cmd = shlex.split(args.bench_cmd) - after_bench_cmd = ( - [] if args.after_bench_cmd is None else shlex.split(args.after_bench_cmd) - ) - - serve_params: list[dict[str, object]] - if args.serve_params: - with open(args.serve_params, "rb") as f: - serve_params = _parse_params(json.load(f)) - else: - # i.e.: run serve_cmd without any modification - serve_params = [{}] - - bench_params: list[dict[str, object]] - if args.bench_params: - with open(args.bench_params, "rb") as f: - bench_params = _parse_params(json.load(f)) - else: - # i.e.: run bench_cmd without any modification - bench_params = [{}] - - sla_params: list[dict[str, SLACriterionBase]] - if args.sla_params: - with open(args.sla_params, "rb") as f: - sla_params = 
_parse_sla(json.load(f)) - else: - sla_params = [] - - num_runs = args.num_runs - if num_runs < 1: - raise ValueError("`num_runs` should be at least 1.") - - run_main( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=args.show_stdout, - serve_params=serve_params, - bench_params=bench_params, - sla_params=sla_params, - sla_variable=args.sla_variable, - output_dir=Path(args.output_dir), - num_runs=num_runs, - dry_run=args.dry_run, - resume=args.resume, - ) - - -if __name__ == "__main__": - main() From ae9d02133ff163e9b61945d738a08385a190a0cf Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:49:43 +0000 Subject: [PATCH 28/48] Update import Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 53887c82ee49..8637c9dfdec1 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -13,7 +13,7 @@ import seaborn as sns from typing_extensions import Self, override -from vllm.utils.collections import full_groupby +from vllm.utils.collection_utils import full_groupby class PlotFilterBase(ABC): From beb3854be6bc08e57ca535c8f979b7cf31ee3e9d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:57:36 +0000 Subject: [PATCH 29/48] Clean up Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 6 +++--- vllm/benchmarks/sweep/param_sweep.py | 2 +- vllm/benchmarks/sweep/plot.py | 20 ++++++++---------- vllm/benchmarks/sweep/serve.py | 31 ++++++++++++++-------------- vllm/benchmarks/sweep/utils.py | 4 ++++ 5 files changed, 32 insertions(+), 31 deletions(-) create mode 100644 vllm/benchmarks/sweep/utils.py diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 6ff475422e9f..225eb73c142f 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -996,7 +996,7 @@ The basic purpose of this script is to evaluate vLLM under different settings. 
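As a concrete illustration, the `--serve-params` and `--bench-params` files are JSON lists of dictionaries, one dictionary per parameter combination. A hypothetical way to generate one (the file name and parameter values here are illustrative only):

```python
import json

# Hypothetical parameter grid; each dict is one combination to sweep over.
serve_params = [
    {"max_num_seqs": 32, "max_num_batched_tokens": 1024},
    {"max_num_seqs": 64, "max_num_batched_tokens": 2048},
]

with open("benchmarks/serve_hparams.json", "w") as f:
    json.dump(serve_params, f, indent=4)
```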
F Example command: ```bash -python vllm/benchmarks/sweep/serve.py \ +python -m vllm.benchmarks.sweep.serve \ --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \ --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \ --serve-params benchmarks/serve_hparams.json \ @@ -1044,7 +1044,7 @@ For example, to ensure E2E latency within different target values for 99% of req Example command: ```bash -python vllm/benchmarks/sweep/serve.py \ +python -m vllm.benchmarks.sweep.serve \ --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \ --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \ --serve-params benchmarks/serve_hparams.json \ @@ -1073,7 +1073,7 @@ The algorithm for adjusting the SLA variable is as follows: Example command: ```bash -python vllm/benchmarks/sweep/plot.py benchmarks/results/ \ +python -m vllm.benchmarks.sweep.plot benchmarks/results/ \ --var-x max_concurrency \ --curve-by api_server_count,max_num_batched_tokens \ --row-by random_input_len \ diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py index 90006e02ba6a..986561ed8502 100644 --- a/vllm/benchmarks/sweep/param_sweep.py +++ b/vllm/benchmarks/sweep/param_sweep.py @@ -36,7 +36,7 @@ def from_record(cls, record: dict[str, object]): return cls(record) def __or__(self, other: dict[str, Any]): - return type(self)(self | other) + return type(self)(super().__or__(other)) # In JSON, we prefer "_" def _iter_param_key_candidates(self, param_key: str): diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 8637c9dfdec1..35e0c7e88e5a 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -15,6 +15,8 @@ from vllm.utils.collection_utils import full_groupby +from .utils import sanitize_filename + class PlotFilterBase(ABC): @classmethod @@ -152,17 +154,13 @@ def _get_group(run_data: dict[str, object], group_keys: list[str]): def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...]): - return fig_dir / ( - "-".join( - ( - "FIGURE" + ("-" if group else ""), - *(f"{k}={v}" for k, v in group), - ) - ) - .replace("/", "_") - .replace("..", "__") # Sanitize - + ".png" - ) + parts = list[str]() + if group: + parts.extend(("FIGURE-", *(f"{k}={v}" for k, v in group))) + else: + parts.append("figure") + + return fig_dir / sanitize_filename("-".join(parts) + ".png") class DummyExecutor: diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index 5599a5dbc78b..cc64b3af6230 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -15,6 +15,7 @@ from .param_sweep import ParameterSweep, ParameterSweepItem from .server import ServerProcess from .sla_sweep import SLASweep, SLASweepItem +from .utils import sanitize_filename @contextlib.contextmanager @@ -106,14 +107,13 @@ def _get_comb_base_path( serve_comb: ParameterSweepItem, bench_comb: ParameterSweepItem, ): - return output_dir / "-".join( - ( - "SERVE" + ("-" if serve_comb else ""), - serve_comb.as_text(sep="-"), - "BENCH" + ("-" if bench_comb else ""), - bench_comb.as_text(sep="-"), - ) - ).replace("/", "_").replace("..", "__") # Sanitize + parts = list[str]() + if serve_comb: + parts.extend(("SERVE-", *serve_comb.as_text(sep="-"))) + if bench_comb: + 
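The `__or__` change above avoids a subtle trap: inside `__or__`, writing `self | other` re-enters the same method and recurses forever, while `super().__or__(other)` runs `dict`'s implementation. A minimal reproduction of the fixed pattern (Python 3.9+ dict union):

```python
class Params(dict):
    def __or__(self, other):
        # `type(self)(self | other)` here would recurse infinitely;
        # delegating to dict.__or__ performs the actual merge.
        return type(self)(super().__or__(other))

merged = Params({"a": 1}) | {"b": 2}
print(type(merged).__name__, dict(merged))  # Params {'a': 1, 'b': 2}
```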
parts.extend(("BENCH-", *bench_comb.as_text(sep="-"))) + + return output_dir / sanitize_filename("-".join(parts)) def _get_comb_run_path(base_path: Path, run_number: int | None): @@ -226,14 +226,13 @@ def _get_sla_base_path( serve_comb: ParameterSweepItem, bench_comb: ParameterSweepItem, ): - return output_dir / "-".join( - ( - "SERVE" + ("-" if serve_comb else ""), - serve_comb.as_text(sep="-"), - "BENCH" + ("-" if bench_comb else ""), - bench_comb.as_text(sep="-"), - ) - ).replace("/", "_").replace("..", "__") # Sanitize + parts = list[str]() + if serve_comb: + parts.extend(("SERVE-", *serve_comb.as_text(sep="-"))) + if bench_comb: + parts.extend(("BENCH-", *bench_comb.as_text(sep="-"))) + + return output_dir / sanitize_filename("-".join(parts)) def _get_sla_iter_path( diff --git a/vllm/benchmarks/sweep/utils.py b/vllm/benchmarks/sweep/utils.py new file mode 100644 index 000000000000..5a9e7d932b59 --- /dev/null +++ b/vllm/benchmarks/sweep/utils.py @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +def sanitize_filename(filename: str) -> str: + return filename.replace("/", "_").replace("..", "__") From 5f36c62774e701bcc00046a7a4031bf5f2cc4e3c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:01:22 +0000 Subject: [PATCH 30/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/serve.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index cc64b3af6230..fb966ce41ab9 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -109,9 +109,9 @@ def _get_comb_base_path( ): parts = list[str]() if serve_comb: - parts.extend(("SERVE-", *serve_comb.as_text(sep="-"))) + parts.extend(("SERVE-", serve_comb.as_text(sep="-").split("-"))) if bench_comb: - parts.extend(("BENCH-", *bench_comb.as_text(sep="-"))) + parts.extend(("BENCH-", bench_comb.as_text(sep="-").split("-"))) return output_dir / sanitize_filename("-".join(parts)) @@ -228,9 +228,9 @@ def _get_sla_base_path( ): parts = list[str]() if serve_comb: - parts.extend(("SERVE-", *serve_comb.as_text(sep="-"))) + parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) if bench_comb: - parts.extend(("BENCH-", *bench_comb.as_text(sep="-"))) + parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) return output_dir / sanitize_filename("-".join(parts)) From 8fbfd49f95e8c8ceda4d631db116efde0bc69817 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:11:13 +0000 Subject: [PATCH 31/48] Simplify Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 35e0c7e88e5a..f8cba47bfc37 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -373,9 +373,9 @@ def main(): parser.add_argument( "--fig-dir", type=str, - default=None, - help="The directory to save the figures. " - "By default, this is set to `OUTPUT_DIR`.", + default="", + help="The directory to save the figures, relative to `OUTPUT_DIR`. 
" + "By default, the same directory is used.", ) parser.add_argument( "--curve-by", @@ -459,14 +459,18 @@ def main(): args = parser.parse_args() + output_dir = Path(args.OUTPUT_DIR) + if not output_dir.exists(): + raise ValueError(f"No parameter sweep results under {output_dir}") + curve_by = [] if not args.curve_by else args.curve_by.split(",") row_by = [] if not args.row_by else args.row_by.split(",") col_by = [] if not args.col_by else args.col_by.split(",") fig_by = [] if not args.fig_by else args.fig_by.split(",") plot( - output_dir=Path(args.OUTPUT_DIR), - fig_dir=Path(args.fig_dir or args.OUTPUT_DIR), + output_dir=output_dir, + fig_dir=output_dir / args.fig_dir, fig_by=fig_by, row_by=row_by, col_by=col_by, From daee7a84f34dc670ba4385f51d9b97e5635a484c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:17:50 +0000 Subject: [PATCH 32/48] Fix legend Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index f8cba47bfc37..f31f6d685d82 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -278,6 +278,8 @@ def _plot_fig( size=size, markers=True, ) + + g.add_legend(title=hue) else: df["curve_group"] = ( pd.concat( @@ -296,7 +298,7 @@ def _plot_fig( markers=True, ) - g.add_legend() + g.add_legend() g.savefig(fig_path) plt.close(g.figure) From b9e08ffe0057249086fdc0e5a9d38c105ee4ba80 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:22:38 +0000 Subject: [PATCH 33/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index f31f6d685d82..7c9678c21e10 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -448,7 +448,8 @@ def main(): ) parser.add_argument( "--scale-y", - action="store_true", + type=str, + default=None, help="The scale to use for the y-axis. " "Currently only accepts string values such as 'log' and 'sqrt'. 
" "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", From ad4149b12693fe03152c909f3a73aeed6e57aa64 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:26:27 +0000 Subject: [PATCH 34/48] Reword Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 3 ++- vllm/benchmarks/sweep/serve.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 7c9678c21e10..691d2123dbab 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -457,7 +457,8 @@ def main(): parser.add_argument( "--dry-run", action="store_true", - help="If set, prints the location of the figures without drawing them.", + help="If set, prints the information about each figure to plot, " + "then exits without drawing them.", ) args = parser.parse_args() diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index fb966ce41ab9..c1fad83fdfb2 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -763,7 +763,7 @@ def main(): parser.add_argument( "--dry-run", action="store_true", - help="If set, prints the commands to run then exits without running them.", + help="If set, prints the commands to run, then exits without executing them.", ) parser.add_argument( "--resume", From c5eaf789c1dce76bb46e6df82dfa7b4724f450c1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:28:07 +0000 Subject: [PATCH 35/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 691d2123dbab..82ae3b294a45 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -118,7 +118,7 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: PLOT_BINNERS: dict[str, type[PlotBinner]] = { - "@": PlotBinner, + "%": PlotBinner, } From c7426c254c6aa80cfea14028cacde887752ea0ad Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:29:57 +0000 Subject: [PATCH 36/48] Reword Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 225eb73c142f..f119a03d28dc 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -1082,7 +1082,7 @@ python -m vllm.benchmarks.sweep.plot benchmarks/results/ \ ``` !!! tip - You can use `--dry-run` to preview the commands to be run. + You can use `--dry-run` to preview the figures to be plotted. 
## Performance Benchmarks From 49deaab208425bf7504cf0bb329008812b25110e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:32:16 +0000 Subject: [PATCH 37/48] Reorder Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 2 +- vllm/benchmarks/sweep/plot.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index f119a03d28dc..99ca3c5dd234 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -1075,9 +1075,9 @@ Example command: ```bash python -m vllm.benchmarks.sweep.plot benchmarks/results/ \ --var-x max_concurrency \ - --curve-by api_server_count,max_num_batched_tokens \ --row-by random_input_len \ --col-by random_output_len \ + --curve-by api_server_count,max_num_batched_tokens \ --filter-by 'max_concurrency<=1024' ``` diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 82ae3b294a45..eb8d997ab1da 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -380,31 +380,31 @@ def main(): "By default, the same directory is used.", ) parser.add_argument( - "--curve-by", + "--fig-by", type=str, - default=None, - help="A comma-separated list of variables, such that a separate curve " + default="", + help="A comma-separated list of variables, such that a separate figure " "is created for each combination of these variables.", ) parser.add_argument( - "--col-by", + "--row-by", type=str, default="", - help="A comma-separated list of variables, such that a separate column " + help="A comma-separated list of variables, such that a separate row " "is created for each combination of these variables.", ) parser.add_argument( - "--row-by", + "--col-by", type=str, default="", - help="A comma-separated list of variables, such that a separate row " + help="A comma-separated list of variables, such that a separate column " "is created for each combination of these variables.", ) parser.add_argument( - "--fig-by", + "--curve-by", type=str, - default="", - help="A comma-separated list of variables, such that a separate figure " + default=None, + help="A comma-separated list of variables, such that a separate curve " "is created for each combination of these variables.", ) parser.add_argument( From 0a4eb3693b61abef79c0446b3bd16eb4e28b31f7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 17:37:36 +0000 Subject: [PATCH 38/48] Informative error Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/serve.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index c1fad83fdfb2..49014738911f 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -79,7 +79,9 @@ def _run_benchmark( return run_data if server is None: - assert dry_run + if not dry_run: + raise ValueError(f"Cannot find results at {output_path}") + print("[END BENCHMARK]") return None From 8afa4d3f69f7b770414bbaf1e6a3589d48d02e0a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 01:46:01 +0000 Subject: [PATCH 39/48] Separate out SLA tuner Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 12 +- vllm/benchmarks/sweep/plot.py | 14 +- vllm/benchmarks/sweep/serve.py | 712 ++++++----------------------- vllm/benchmarks/sweep/serve_sla.py | 483 +++++++++++++++++++ 4 files changed, 632 insertions(+), 589 deletions(-) create mode 100644 vllm/benchmarks/sweep/serve_sla.py diff --git 
a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 99ca3c5dd234..89524ed3bc63 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -929,11 +929,9 @@ throughput numbers correctly is also adjusted. ### Online Benchmark -[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` over multiple configurations. +[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations. -#### Batch Mode - -The basic purpose of this script is to evaluate vLLM under different settings. Follows these steps to run the script: +Follow these steps to run the script: 1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option. 2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option. @@ -1018,9 +1016,9 @@ python -m vllm.benchmarks.sweep.serve \ !!! tip You can use the `--resume` option to continue the parameter sweep if one of the runs failed. -#### SLA Mode +### SLA Auto-Tuner -By passing SLA constraints via `--sla-params`, you can run this script in SLA mode, causing it to adjust either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints. +[`vllm/benchmarks/sweep/serve_sla.py`](../../vllm/benchmarks/sweep/serve_sla.py) is a wrapper over [`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`. For example, to ensure E2E latency within different target values for 99% of requests: @@ -1044,7 +1042,7 @@ For example, to ensure E2E latency within different target values for 99% of req Example command: ```bash -python -m vllm.benchmarks.sweep.serve \ +python -m vllm.benchmarks.sweep.serve_sla \ --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \ --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \ --serve-params benchmarks/serve_hparams.json \ diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index eb8d997ab1da..8cdbe5980e8f 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -361,10 +361,7 @@ def plot( ) -def main(): - parser = argparse.ArgumentParser( - description="Plot performance curves from parameter sweep results." - ) +def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "OUTPUT_DIR", type=str, @@ -461,8 +458,8 @@ def main(): "then exits without drawing them.", ) - args = parser.parse_args() +def main(args: argparse.Namespace): output_dir = Path(args.OUTPUT_DIR) if not output_dir.exists(): raise ValueError(f"No parameter sweep results under {output_dir}") @@ -490,4 +487,9 @@ def main(): if __name__ == "__main__": - main() + parser = argparse.ArgumentParser( + description="Plot performance curves from parameter sweep results." 
+ ) + add_cli_args(parser) + + main(parser.parse_args()) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index 49014738911f..e99052247e71 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -3,23 +3,20 @@ import argparse import contextlib import json -import math import shlex +from dataclasses import dataclass from datetime import datetime from pathlib import Path -from typing import Literal, get_args import pandas as pd -from typing_extensions import assert_never from .param_sweep import ParameterSweep, ParameterSweepItem from .server import ServerProcess -from .sla_sweep import SLASweep, SLASweepItem from .utils import sanitize_filename @contextlib.contextmanager -def _run_server( +def run_server( serve_cmd: list[str], after_bench_cmd: list[str], *, @@ -44,7 +41,7 @@ def _run_server( print("[END SERVER]") -def _run_benchmark( +def run_benchmark( server: ServerProcess | None, bench_cmd: list[str], *, @@ -138,7 +135,7 @@ def _comb_needs_server( return False -def _run_comb( +def run_comb( server: ServerProcess | None, bench_cmd: list[str], *, @@ -151,7 +148,7 @@ def _run_comb( comb_data = list[dict[str, object]]() for run_number in range(num_runs): - run_data = _run_benchmark( + run_data = run_benchmark( server, bench_cmd, serve_overrides=serve_comb, @@ -188,7 +185,7 @@ def run_combs( all_data = list[dict[str, object]]() for serve_comb in serve_params: with ( - _run_server( + run_server( serve_cmd, after_bench_cmd, show_stdout=show_stdout, @@ -201,7 +198,7 @@ def run_combs( for bench_comb in bench_params: base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) - comb_data = _run_comb( + comb_data = run_comb( server, bench_cmd, serve_comb=serve_comb, @@ -223,457 +220,150 @@ def run_combs( return combined_df -def _get_sla_base_path( - output_dir: Path, - serve_comb: ParameterSweepItem, - bench_comb: ParameterSweepItem, -): - parts = list[str]() - if serve_comb: - parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) - if bench_comb: - parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) - - return output_dir / sanitize_filename("-".join(parts)) - - -def _get_sla_iter_path( - base_path: Path, - sla_comb: SLASweepItem, - sla_variable: str, - sla_value: int | None, -): - if sla_value is None: - prefix = sla_comb.as_text(sep="-") - return base_path / f"SLA--{prefix}.json" - - return base_path / f"{sla_variable}={sla_value}" - - -def _get_sla_run_path(iter_path: Path, run_number: int | None): - if run_number is None: - return iter_path / "summary.json" - - return iter_path / f"run={run_number}.json" - - -def _sla_needs_server( - serve_comb: ParameterSweepItem, - bench_combs: ParameterSweep, - sla_combs: SLASweep, - sla_variable: str, - output_dir: Path, -): - for bench_comb in bench_combs: - base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) - for sla_comb in sla_combs: - if not _get_sla_iter_path( - base_path, - sla_comb, - sla_variable, - sla_value=None, - ).exists(): - return True - - return False - - -def _run_sla( - server: ServerProcess | None, - bench_cmd: list[str], - *, - serve_comb: ParameterSweepItem, - bench_comb: ParameterSweepItem, - iter_path: Path, - num_runs: int, - dry_run: bool, -): - iter_data = list[dict[str, object]]() - - for run_number in range(num_runs): - run_data = _run_benchmark( - server, - bench_cmd, - serve_overrides=serve_comb, - bench_overrides=bench_comb, - run_number=run_number, - output_path=_get_sla_run_path(iter_path, run_number), - dry_run=dry_run, - ) - - if run_data is 
not None: - iter_data.append(run_data) - - if dry_run: - return None - - with _get_sla_run_path(iter_path, run_number=None).open("w") as f: - json.dump(iter_data, f, indent=4) - - return iter_data - - -SLAVariable = Literal["request_rate", "max_concurrency"] - - -def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable): - request_throughput = float(run_data["request_throughput"]) # type: ignore - if sla_variable == "request_rate": - return request_throughput - if sla_variable == "max_concurrency": - mean_latency_ms = float(run_data["mean_e2el_ms"]) # type: ignore - return request_throughput * mean_latency_ms / 1000 - - assert_never(sla_variable) - - -def _estimate_sla_bounds( - server: ServerProcess | None, - bench_cmd: list[str], - *, - serve_comb: ParameterSweepItem, - bench_comb: ParameterSweepItem, - sla_comb: SLASweepItem, - base_path: Path, - num_runs: int, - dry_run: bool, - sla_variable: SLAVariable, - init_value: int, - max_value: int, -): - sla_data = list[dict[str, object]]() - - max_passing: int = 0 - min_failing: int = 0 - - val: int = init_value - assert val > 0 - - while True: - print(f"Testing {sla_variable}: {val} req/s") - - iter_data = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb | {sla_variable: val}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), - num_runs=num_runs, - dry_run=dry_run, +@dataclass +class SweepServeArgs: + serve_cmd: list[str] + bench_cmd: list[str] + after_bench_cmd: list[str] + show_stdout: bool + serve_params: ParameterSweep + bench_params: ParameterSweep + output_dir: Path + num_runs: int + dry_run: bool + resume: str | None + + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + serve_cmd = shlex.split(args.serve_cmd) + bench_cmd = shlex.split(args.bench_cmd) + after_bench_cmd = ( + [] if args.after_bench_cmd is None else shlex.split(args.after_bench_cmd) ) - assert iter_data is not None - sla_data.extend(iter_data) - - iter_data_mean = { - k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore - for k in sla_comb - } - - sla_results = [ - criterion.print_and_validate(iter_data_mean, k) - for k, criterion in sla_comb.items() - ] - - if all(sla_results): - print("SLA criteria are met.") - max_passing = val - val *= 2 + if args.serve_params: + serve_params = ParameterSweep.read_json(args.serve_params) else: - print("SLA criteria are not met.") - min_failing = val - break - - if val >= max_value: - break - - return sla_data, (max_passing, min_failing) - + # i.e.: run serve_cmd without any modification + serve_params = ParameterSweep.from_records([{}]) -def _find_sla_value( - server: ServerProcess | None, - bench_cmd: list[str], - *, - serve_comb: ParameterSweepItem, - bench_comb: ParameterSweepItem, - sla_comb: SLASweepItem, - base_path: Path, - num_runs: int, - dry_run: bool, - sla_variable: SLAVariable, - min_value: int, - max_value: int, -): - sla_data = list[dict[str, object]]() - - left: int = min_value - right: int = max_value - - while True: - val = (left + right) // 2 - print(f"Testing {sla_variable}: {val} req/s") - - iter_data = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb | {sla_variable: val}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), - num_runs=num_runs, - dry_run=dry_run, - ) - - assert iter_data is not None - sla_data.extend(iter_data) - - iter_data_mean = { - k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore 
- for k in sla_comb - } - - sla_results = [ - criterion.print_and_validate(iter_data_mean, k) - for k, criterion in sla_comb.items() - ] - - if all(sla_results): - print("SLA criteria are met.") - left = val + if args.bench_params: + bench_params = ParameterSweep.read_json(args.bench_params) else: - print("SLA criteria are not met.") - right = val - - if right - left <= 1: - break - - return sla_data, left - - -def _search_sla( - server: ServerProcess | None, - bench_cmd: list[str], - *, - serve_comb: ParameterSweepItem, - bench_comb: ParameterSweepItem, - sla_comb: SLASweepItem, - sla_variable: SLAVariable, - sla_inf_value: int = 65536, # The value that represents infinite QPS - base_path: Path, - num_runs: int, - dry_run: bool, -): - print("[SLA START]") - print(f"SLA criteria: {sla_comb.as_text()}") - - sla_data_0 = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb | {sla_variable: sla_inf_value}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value), - num_runs=num_runs, - dry_run=dry_run, - ) - if sla_data_0 is None: - assert dry_run - print("Omitting SLA search.") - print("[SLA END]") - return None + # i.e.: run bench_cmd without any modification + bench_params = ParameterSweep.from_records([{}]) - sla_init_value = math.ceil( - sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0) - / len(sla_data_0) - ) - print(f"Initial {sla_variable} to search: {sla_init_value} req/s.") - - sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - sla_variable=sla_variable, - init_value=sla_init_value, - max_value=sla_inf_value, - ) - print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.") - - sla_data_2, sla_value = _find_sla_value( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - sla_variable=sla_variable, - min_value=sla_min, - max_value=sla_max, - ) + num_runs = args.num_runs + if num_runs < 1: + raise ValueError("`num_runs` should be at least 1.") - sla_data = sla_data_0 + sla_data_1 + sla_data_2 - print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.") - - with _get_sla_iter_path( - base_path, - sla_comb, - sla_variable, - sla_value=None, - ).open("w") as f: - json.dump(sla_data, f, indent=4) - - print("[SLA END]") - - return sla_data - - -def run_slas( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: ParameterSweep, - bench_params: ParameterSweep, - sla_params: SLASweep, - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, -): - if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params): - raise ValueError( - f"You should not override `{sla_variable}` in `bench_params` in SLA mode, " - "since it is supposed to be determined automatically." 
- ) - - all_data = list[dict[str, object]]() - for serve_comb in serve_params: - with ( - _run_server( - serve_cmd, - after_bench_cmd, - show_stdout=show_stdout, - serve_overrides=serve_comb, - dry_run=dry_run, - ) - if _sla_needs_server( - serve_comb, - bench_params, - sla_params, - sla_variable, - output_dir, - ) - else contextlib.nullcontext() - ) as server: - for bench_comb in bench_params: - for sla_comb in sla_params: - base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) - - comb_data = _search_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - sla_variable=sla_variable, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - ) - - if comb_data is not None: - all_data.extend(comb_data) - - if dry_run: - return None - - combined_df = pd.DataFrame.from_records(all_data) - combined_df.to_csv(output_dir / "summary.csv") - - return combined_df - - -def _run_main( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: ParameterSweep, - bench_params: ParameterSweep, - sla_params: SLASweep, - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, -): - if sla_params: - return run_slas( + return cls( serve_cmd=serve_cmd, bench_cmd=bench_cmd, after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, + show_stdout=args.show_stdout, serve_params=serve_params, bench_params=bench_params, - sla_params=sla_params, - sla_variable=sla_variable, - output_dir=output_dir, + output_dir=Path(args.output_dir), num_runs=num_runs, - dry_run=dry_run, + dry_run=args.dry_run, + resume=args.resume, ) - return run_combs( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, - serve_params=serve_params, - bench_params=bench_params, - output_dir=output_dir, - num_runs=num_runs, - dry_run=dry_run, - ) + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser.add_argument( + "--serve-cmd", + type=str, + required=True, + help="The command used to run the server: `vllm serve ...`", + ) + parser.add_argument( + "--bench-cmd", + type=str, + required=True, + help="The command used to run the benchmark: `vllm bench serve ...`", + ) + parser.add_argument( + "--after-bench-cmd", + type=str, + default=None, + help="After a benchmark run is complete, invoke this command instead of " + "the default `ServerWrapper.clear_cache()`.", + ) + parser.add_argument( + "--show-stdout", + action="store_true", + help="If set, logs the standard output of subcommands. " + "Useful for debugging but can be quite spammy.", + ) + parser.add_argument( + "--serve-params", + type=str, + default=None, + help="Path to JSON file containing a list of parameter combinations " + "for the `vllm serve` command. " + "If both `serve_params` and `bench_params` are given, " + "this script will iterate over their Cartesian product.", + ) + parser.add_argument( + "--bench-params", + type=str, + default=None, + help="Path to JSON file containing a list of parameter combinations " + "for the `vllm bench serve` command. 
" + "If both `serve_params` and `bench_params` are given, " + "this script will iterate over their Cartesian product.", + ) + parser.add_argument( + "-o", + "--output-dir", + type=str, + default="results", + help="The directory to which results are written.", + ) + parser.add_argument( + "--num-runs", + type=int, + default=3, + help="Number of runs per parameter combination.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If set, prints the commands to run, " + "then exits without executing them.", + ) + parser.add_argument( + "--resume", + type=str, + default=None, + help="Set this to the name of a directory under `output_dir` (which is a " + "timestamp) to resume a previous execution of this script, i.e., only run " + "parameter combinations for which there are still no output files.", + ) + return parser -def run_main( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: ParameterSweep, - bench_params: ParameterSweep, - sla_params: SLASweep, - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, - resume: str | None, -): - timestamp = resume or datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = output_dir / timestamp - if resume and not output_dir.exists(): +def run_main(args: SweepServeArgs): + timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = args.output_dir / timestamp + + if args.resume and not output_dir.exists(): raise ValueError(f"Cannot resume from non-existent directory ({output_dir})") try: - return _run_main( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, - serve_params=serve_params, - bench_params=bench_params, - sla_params=sla_params, - sla_variable=sla_variable, + return run_combs( + serve_cmd=args.serve_cmd, + bench_cmd=args.bench_cmd, + after_bench_cmd=args.after_bench_cmd, + show_stdout=args.show_stdout, + serve_params=args.serve_params, + bench_params=args.bench_params, output_dir=output_dir, - num_runs=num_runs, - dry_run=dry_run, + num_runs=args.num_runs, + dry_run=args.dry_run, ) except BaseException as exc: raise RuntimeError( @@ -682,144 +372,14 @@ def run_main( ) from exc -def main(): - parser = argparse.ArgumentParser( - description="Run vLLM server benchmark under multiple settings." - ) - parser.add_argument( - "--serve-cmd", - type=str, - required=True, - help="The command used to run the server: `vllm serve ...`", - ) - parser.add_argument( - "--bench-cmd", - type=str, - required=True, - help="The command used to run the benchmark: `vllm bench serve ...`", - ) - parser.add_argument( - "--after-bench-cmd", - type=str, - default=None, - help="After a benchmark run is complete, invoke this command instead of the " - "default `ServerWrapper.clear_cache()`.", - ) - parser.add_argument( - "--show-stdout", - action="store_true", - help="If set, logs the standard output of subcommands. " - "Useful for debugging but can be quite spammy.", - ) - parser.add_argument( - "--serve-params", - type=str, - default=None, - help="Path to JSON file containing a list of parameter combinations " - "for the `vllm serve` command. " - "If both `serve_params` and `bench_params` are given, " - "this script will iterate over their Cartesian product.", - ) - parser.add_argument( - "--bench-params", - type=str, - default=None, - help="Path to JSON file containing a list of parameter combinations " - "for the `vllm bench serve` command. 
" - "If both `serve_params` and `bench_params` are given, " - "this script will iterate over their Cartesian product.", - ) - parser.add_argument( - "--sla-params", - type=str, - default=None, - help="Path to JSON file containing a list of SLA constraints to satisfy. " - 'Each constraint is expressed in `{"": ""}` format, ' - 'e.g.: `{"p99_e2el_ms": "<=500"}` means that ' - "the E2E latency should be less than 500ms 99% of the time. " - "Setting this option runs this script in SLA mode, which searches for the " - "maximum `sla_variable` that satisfies the constraints for each combination " - "of `serve_params`, `bench_params`, and `sla_params`.", - ) - parser.add_argument( - "--sla-variable", - type=str, - choices=get_args(SLAVariable), - default="request_rate", - help="Whether to tune request rate or maximum concurrency to satisfy " - "the SLA constraints.", - ) - parser.add_argument( - "-o", - "--output-dir", - type=str, - default="results", - help="The directory to which results are written.", - ) - parser.add_argument( - "--num-runs", - type=int, - default=3, - help="Number of runs per parameter combination.", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="If set, prints the commands to run, then exits without executing them.", - ) - parser.add_argument( - "--resume", - type=str, - default=None, - help="Set this to the name of a directory under `output_dir` (which is a " - "timestamp) to resume a previous execution of this script, i.e., only run " - "parameter combinations for which there are still no output files.", - ) - - args = parser.parse_args() +def main(args: argparse.Namespace): + run_main(SweepServeArgs.from_cli_args(args)) - serve_cmd = shlex.split(args.serve_cmd) - bench_cmd = shlex.split(args.bench_cmd) - after_bench_cmd = ( - [] if args.after_bench_cmd is None else shlex.split(args.after_bench_cmd) - ) - if args.serve_params: - serve_params = ParameterSweep.read_json(args.serve_params) - else: - # i.e.: run serve_cmd without any modification - serve_params = ParameterSweep.from_records([{}]) - - if args.bench_params: - bench_params = ParameterSweep.read_json(args.bench_params) - else: - # i.e.: run bench_cmd without any modification - bench_params = ParameterSweep.from_records([{}]) - - if args.sla_params: - sla_params = SLASweep.read_json(args.sla_params) - else: - sla_params = SLASweep.from_records([]) - - num_runs = args.num_runs - if num_runs < 1: - raise ValueError("`num_runs` should be at least 1.") - - run_main( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=args.show_stdout, - serve_params=serve_params, - bench_params=bench_params, - sla_params=sla_params, - sla_variable=args.sla_variable, - output_dir=Path(args.output_dir), - num_runs=num_runs, - dry_run=args.dry_run, - resume=args.resume, +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run vLLM server benchmark under multiple settings." 
) + SweepServeArgs.add_cli_args(parser) - -if __name__ == "__main__": - main() + main(parser.parse_args()) diff --git a/vllm/benchmarks/sweep/serve_sla.py b/vllm/benchmarks/sweep/serve_sla.py new file mode 100644 index 000000000000..62e2917dc22b --- /dev/null +++ b/vllm/benchmarks/sweep/serve_sla.py @@ -0,0 +1,483 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import contextlib +import json +import math +from dataclasses import asdict, dataclass +from datetime import datetime +from pathlib import Path +from typing import Literal, get_args + +import pandas as pd +from typing_extensions import assert_never + +from .param_sweep import ParameterSweep, ParameterSweepItem +from .serve import SweepServeArgs, run_benchmark, run_server +from .server import ServerProcess +from .sla_sweep import SLASweep, SLASweepItem +from .utils import sanitize_filename + + +def _get_sla_base_path( + output_dir: Path, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, +): + parts = list[str]() + if serve_comb: + parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) + if bench_comb: + parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) + + return output_dir / sanitize_filename("-".join(parts)) + + +def _get_sla_iter_path( + base_path: Path, + sla_comb: SLASweepItem, + sla_variable: str, + sla_value: int | None, +): + if sla_value is None: + prefix = sla_comb.as_text(sep="-") + return base_path / f"SLA--{prefix}.json" + + return base_path / f"{sla_variable}={sla_value}" + + +def _get_sla_run_path(iter_path: Path, run_number: int | None): + if run_number is None: + return iter_path / "summary.json" + + return iter_path / f"run={run_number}.json" + + +def _sla_needs_server( + serve_comb: ParameterSweepItem, + bench_combs: ParameterSweep, + sla_combs: SLASweep, + sla_variable: str, + output_dir: Path, +): + for bench_comb in bench_combs: + base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) + for sla_comb in sla_combs: + if not _get_sla_iter_path( + base_path, + sla_comb, + sla_variable, + sla_value=None, + ).exists(): + return True + + return False + + +def run_sla( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + iter_path: Path, + num_runs: int, + dry_run: bool, +): + iter_data = list[dict[str, object]]() + + for run_number in range(num_runs): + run_data = run_benchmark( + server, + bench_cmd, + serve_overrides=serve_comb, + bench_overrides=bench_comb, + run_number=run_number, + output_path=_get_sla_run_path(iter_path, run_number), + dry_run=dry_run, + ) + + if run_data is not None: + iter_data.append(run_data) + + if dry_run: + return None + + with _get_sla_run_path(iter_path, run_number=None).open("w") as f: + json.dump(iter_data, f, indent=4) + + return iter_data + + +SLAVariable = Literal["request_rate", "max_concurrency"] + + +def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable): + request_throughput = float(run_data["request_throughput"]) # type: ignore + if sla_variable == "request_rate": + return request_throughput + if sla_variable == "max_concurrency": + mean_latency_ms = float(run_data["mean_e2el_ms"]) # type: ignore + return request_throughput * mean_latency_ms / 1000 + + assert_never(sla_variable) + + +def _estimate_sla_bounds( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + 
base_path: Path, + num_runs: int, + dry_run: bool, + sla_variable: SLAVariable, + init_value: int, + max_value: int, +): + sla_data = list[dict[str, object]]() + + max_passing: int = 0 + min_failing: int = 0 + + val: int = init_value + assert val > 0 + + while True: + print(f"Testing {sla_variable}: {val} req/s") + + iter_data = run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: val}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), + num_runs=num_runs, + dry_run=dry_run, + ) + + assert iter_data is not None + sla_data.extend(iter_data) + + iter_data_mean = { + k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore + for k in sla_comb + } + + sla_results = [ + criterion.print_and_validate(iter_data_mean, k) + for k, criterion in sla_comb.items() + ] + + if all(sla_results): + print("SLA criteria are met.") + max_passing = val + val *= 2 + else: + print("SLA criteria are not met.") + min_failing = val + break + + if val >= max_value: + break + + return sla_data, (max_passing, min_failing) + + +def _find_sla_value( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, + sla_variable: SLAVariable, + min_value: int, + max_value: int, +): + sla_data = list[dict[str, object]]() + + left: int = min_value + right: int = max_value + + while True: + val = (left + right) // 2 + print(f"Testing {sla_variable}: {val} req/s") + + iter_data = run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: val}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), + num_runs=num_runs, + dry_run=dry_run, + ) + + assert iter_data is not None + sla_data.extend(iter_data) + + iter_data_mean = { + k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore + for k in sla_comb + } + + sla_results = [ + criterion.print_and_validate(iter_data_mean, k) + for k, criterion in sla_comb.items() + ] + + if all(sla_results): + print("SLA criteria are met.") + left = val + else: + print("SLA criteria are not met.") + right = val + + if right - left <= 1: + break + + return sla_data, left + + +def search_sla( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + sla_variable: SLAVariable, + sla_inf_value: int = 65536, # The value that represents infinite QPS + base_path: Path, + num_runs: int, + dry_run: bool, +): + print("[SLA START]") + print(f"SLA criteria: {sla_comb.as_text()}") + + sla_data_0 = run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: sla_inf_value}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value), + num_runs=num_runs, + dry_run=dry_run, + ) + if sla_data_0 is None: + assert dry_run + print("Omitting SLA search.") + print("[SLA END]") + return None + + sla_init_value = math.ceil( + sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0) + / len(sla_data_0) + ) + print(f"Initial {sla_variable} to search: {sla_init_value} req/s.") + + sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + sla_variable=sla_variable, + 
init_value=sla_init_value, + max_value=sla_inf_value, + ) + print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.") + + sla_data_2, sla_value = _find_sla_value( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + sla_variable=sla_variable, + min_value=sla_min, + max_value=sla_max, + ) + + sla_data = sla_data_0 + sla_data_1 + sla_data_2 + print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.") + + with _get_sla_iter_path( + base_path, + sla_comb, + sla_variable, + sla_value=None, + ).open("w") as f: + json.dump(sla_data, f, indent=4) + + print("[SLA END]") + + return sla_data + + +def run_slas( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + sla_params: SLASweep, + sla_variable: SLAVariable, + output_dir: Path, + num_runs: int, + dry_run: bool, +): + if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params): + raise ValueError( + f"You should not override `{sla_variable}` in `bench_params` in SLA mode, " + "since it is supposed to be determined automatically." + ) + + all_data = list[dict[str, object]]() + for serve_comb in serve_params: + with ( + run_server( + serve_cmd, + after_bench_cmd, + show_stdout=show_stdout, + serve_overrides=serve_comb, + dry_run=dry_run, + ) + if _sla_needs_server( + serve_comb, + bench_params, + sla_params, + sla_variable, + output_dir, + ) + else contextlib.nullcontext() + ) as server: + for bench_comb in bench_params: + for sla_comb in sla_params: + base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) + + comb_data = search_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + sla_variable=sla_variable, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + ) + + if comb_data is not None: + all_data.extend(comb_data) + + if dry_run: + return None + + combined_df = pd.DataFrame.from_records(all_data) + combined_df.to_csv(output_dir / "summary.csv") + + return combined_df + + +@dataclass +class SweepServeSLAArgs(SweepServeArgs): + sla_params: SLASweep + sla_variable: SLAVariable + + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + base_args = super().from_cli_args(args) + + if args.sla_params: + sla_params = SLASweep.read_json(args.sla_params) + else: + sla_params = SLASweep.from_records([]) + + return cls( + **asdict(base_args), + sla_params=sla_params, + sla_variable=args.sla_variable, + ) + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser = super().add_cli_args(parser) + + parser.add_argument( + "--sla-params", + type=str, + required=True, + help="Path to JSON file containing a list of SLA constraints to satisfy. " + 'Each constraint is expressed in `{"": ""}` format, ' + 'e.g.: `{"p99_e2el_ms": "<=500"}` means that ' + "the E2E latency should be less than 500ms 99%% of the time. 
" + "Setting this option runs this script in SLA mode, which searches for " + "the maximum `sla_variable` that satisfies the constraints for " + "each combination of `serve_params`, `bench_params`, and `sla_params`.", + ) + parser.add_argument( + "--sla-variable", + type=str, + choices=get_args(SLAVariable), + default="request_rate", + help="Whether to tune request rate or maximum concurrency to satisfy " + "the SLA constraints.", + ) + + return parser + + +def run_main(args: SweepServeSLAArgs): + timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = args.output_dir / timestamp + + if args.resume and not output_dir.exists(): + raise ValueError(f"Cannot resume from non-existent directory ({output_dir})") + + try: + return run_slas( + serve_cmd=args.serve_cmd, + bench_cmd=args.bench_cmd, + after_bench_cmd=args.after_bench_cmd, + show_stdout=args.show_stdout, + serve_params=args.serve_params, + bench_params=args.bench_params, + sla_params=args.sla_params, + sla_variable=args.sla_variable, + output_dir=output_dir, + num_runs=args.num_runs, + dry_run=args.dry_run, + ) + except BaseException as exc: + raise RuntimeError( + f"The script was terminated early. Use `--resume {timestamp}` " + f"to continue the script from its last checkpoint." + ) from exc + + +def main(args: argparse.Namespace): + run_main(SweepServeSLAArgs.from_cli_args(args)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Tune a variable to meet SLAs under multiple settings." + ) + SweepServeSLAArgs.add_cli_args(parser) + + main(parser.parse_args()) From 3fa0d4c10d91683813bdd7aa4776834410a292a1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 01:51:06 +0000 Subject: [PATCH 40/48] Update Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 15 +++++++++++---- vllm/benchmarks/sweep/sla_sweep.py | 8 ++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 8cdbe5980e8f..fc8098d67a2c 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -43,13 +43,19 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: raise NotImplementedError +class PlotEqualTo(PlotFilterBase): + @override + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + return df[df[self.var] == self.target] + + class PlotLessThan(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] < self.target] -class PlotLessThanOrEqual(PlotFilterBase): +class PlotLessThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] <= self.target] @@ -61,7 +67,7 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] > self.target] -class PlotGreaterThanOrEqual(PlotFilterBase): +class PlotGreaterThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] >= self.target] @@ -69,8 +75,9 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: # NOTE: The ordering is important! 
Match longer op_keys first PLOT_FILTERS: dict[str, type[PlotFilterBase]] = { - "<=": PlotLessThanOrEqual, - ">=": PlotGreaterThanOrEqual, + "==": PlotEqualTo, + "<=": PlotLessThanOrEqualTo, + ">=": PlotGreaterThanOrEqualTo, "<": PlotLessThan, ">": PlotGreaterThan, } diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py index 6a58b7149a02..a15e165b894f 100644 --- a/vllm/benchmarks/sweep/sla_sweep.py +++ b/vllm/benchmarks/sweep/sla_sweep.py @@ -46,7 +46,7 @@ def format_cond(self, lhs: str) -> str: return f"{lhs}<{self.target:.2f}" -class SLALessThanOrEqual(SLACriterionBase): +class SLALessThanOrEqualTo(SLACriterionBase): @override def validate(self, actual: float) -> bool: return actual <= self.target @@ -66,7 +66,7 @@ def format_cond(self, lhs: str) -> str: return f"{lhs}>{self.target:.2f}" -class SLAGreaterThanOrEqual(SLACriterionBase): +class SLAGreaterThanOrEqualTo(SLACriterionBase): @override def validate(self, actual: float) -> bool: return actual >= self.target @@ -78,8 +78,8 @@ def format_cond(self, lhs: str) -> str: # NOTE: The ordering is important! Match longer op_keys first SLA_CRITERIA: dict[str, type[SLACriterionBase]] = { - "<=": SLALessThanOrEqual, - ">=": SLAGreaterThanOrEqual, + "<=": SLALessThanOrEqualTo, + ">=": SLAGreaterThanOrEqualTo, "<": SLALessThan, ">": SLAGreaterThan, } From a3d10958e2e8c9287608c40e6e22f8833011c5aa Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 02:27:31 +0000 Subject: [PATCH 41/48] Improve error message Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index fc8098d67a2c..ba005723e1da 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -235,6 +235,24 @@ def _plot_fig( f"Cannot find {var_y=!r} in parameter sweep results. " f"Available variables: {df.columns.tolist()}" ) + for k in row_by: + if k not in df.columns: + raise ValueError( + f"Cannot find row_by={k!r} in parameter sweep results. " + f"Available variables: {df.columns.tolist()}" + ) + for k in col_by: + if k not in df.columns: + raise ValueError( + f"Cannot find col_by={k!r} in parameter sweep results. " + f"Available variables: {df.columns.tolist()}" + ) + for k in curve_by: + if k not in df.columns: + raise ValueError( + f"Cannot find curve_by={k!r} in parameter sweep results. " + f"Available variables: {df.columns.tolist()}" + ) df = filter_by.apply(df) df = bin_by.apply(df) From a4adbda5eaf42c99cc3c3aca4cab34fdb3ed635d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 02:51:06 +0000 Subject: [PATCH 42/48] Allow strings Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index ba005723e1da..8fcde751f9d3 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -24,14 +24,14 @@ def parse_str(cls, s: str): for op_key in PLOT_FILTERS: if op_key in s: key, value = s.split(op_key) - return PLOT_FILTERS[op_key](key, float(value.removeprefix(op_key))) + return PLOT_FILTERS[op_key](key, value.removeprefix(op_key)) else: raise ValueError( f"Invalid operator for plot filter '{s}'. 
" f"Valid operators are: {set(PLOT_FILTERS)}", ) - def __init__(self, var: str, target: float) -> None: + def __init__(self, var: str, target: str) -> None: super().__init__() self.var = var @@ -52,25 +52,25 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: class PlotLessThan(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: - return df[df[self.var] < self.target] + return df[df[self.var] < float(self.target)] class PlotLessThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: - return df[df[self.var] <= self.target] + return df[df[self.var] <= float(self.target)] class PlotGreaterThan(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: - return df[df[self.var] > self.target] + return df[df[self.var] > float(self.target)] class PlotGreaterThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: - return df[df[self.var] >= self.target] + return df[df[self.var] >= float(self.target)] # NOTE: The ordering is important! Match longer op_keys first From 6357b84d3d8f77bf620c2ef2349ef6debc1942b1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 02:57:15 +0000 Subject: [PATCH 43/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 8fcde751f9d3..d483a2272dab 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -46,7 +46,12 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: class PlotEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: - return df[df[self.var] == self.target] + try: + target = float(self.target) + except ValueError: + target = self.target + + return df[df[self.var] == target] class PlotLessThan(PlotFilterBase): From f750fc5a4acf6a9718cf70f5ca936683e809f402 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 03:01:42 +0000 Subject: [PATCH 44/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 31 +++++++++++++++++------------- vllm/benchmarks/sweep/sla_sweep.py | 11 +++++++---- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index d483a2272dab..7390440075e6 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -4,6 +4,7 @@ import json from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor +from dataclasses import dataclass from functools import partial from pathlib import Path from types import TracebackType @@ -18,31 +19,33 @@ from .utils import sanitize_filename +@dataclass class PlotFilterBase(ABC): + var: str + target: str + @classmethod def parse_str(cls, s: str): for op_key in PLOT_FILTERS: if op_key in s: key, value = s.split(op_key) - return PLOT_FILTERS[op_key](key, value.removeprefix(op_key)) + return PLOT_FILTERS[op_key]( + key, + value.removeprefix(op_key).strip("'").strip('"'), + ) else: raise ValueError( f"Invalid operator for plot filter '{s}'. 
" f"Valid operators are: {set(PLOT_FILTERS)}", ) - def __init__(self, var: str, target: str) -> None: - super().__init__() - - self.var = var - self.target = target - @abstractmethod def apply(self, df: pd.DataFrame) -> pd.DataFrame: """Applies this filter to a DataFrame.""" raise NotImplementedError +@dataclass class PlotEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: @@ -54,24 +57,28 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] == target] +@dataclass class PlotLessThan(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] < float(self.target)] +@dataclass class PlotLessThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] <= float(self.target)] +@dataclass class PlotGreaterThan(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] > float(self.target)] +@dataclass class PlotGreaterThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: @@ -103,7 +110,11 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df +@dataclass class PlotBinner: + var: str + bin_size: float + @classmethod def parse_str(cls, s: str): for op_key in PLOT_BINNERS: @@ -116,12 +127,6 @@ def parse_str(cls, s: str): f"Valid operators are: {set(PLOT_BINNERS)}", ) - def __init__(self, var: str, bin_size: float) -> None: - super().__init__() - - self.var = var - self.bin_size = bin_size - def apply(self, df: pd.DataFrame) -> pd.DataFrame: """Applies this binner to a DataFrame.""" df = df.copy() diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py index a15e165b894f..ba7d78802e9d 100644 --- a/vllm/benchmarks/sweep/sla_sweep.py +++ b/vllm/benchmarks/sweep/sla_sweep.py @@ -3,15 +3,14 @@ import json import os from abc import ABC, abstractmethod +from dataclasses import dataclass from typing_extensions import override +@dataclass class SLACriterionBase(ABC): - def __init__(self, target: float) -> None: - super().__init__() - - self.target = target + target: float @abstractmethod def validate(self, actual: float) -> bool: @@ -36,6 +35,7 @@ def print_and_validate( return result +@dataclass class SLALessThan(SLACriterionBase): @override def validate(self, actual: float) -> bool: @@ -46,6 +46,7 @@ def format_cond(self, lhs: str) -> str: return f"{lhs}<{self.target:.2f}" +@dataclass class SLALessThanOrEqualTo(SLACriterionBase): @override def validate(self, actual: float) -> bool: @@ -56,6 +57,7 @@ def format_cond(self, lhs: str) -> str: return f"{lhs}<={self.target:.2f}" +@dataclass class SLAGreaterThan(SLACriterionBase): @override def validate(self, actual: float) -> bool: @@ -66,6 +68,7 @@ def format_cond(self, lhs: str) -> str: return f"{lhs}>{self.target:.2f}" +@dataclass class SLAGreaterThanOrEqualTo(SLACriterionBase): @override def validate(self, actual: float) -> bool: From 2d856ff3a83c81b5ec00bca3157eb9e7eb2fd9b7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 03:08:09 +0000 Subject: [PATCH 45/48] Ordering Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 4 ++-- vllm/benchmarks/sweep/sla_sweep.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 7390440075e6..92485c09b416 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -36,7 +36,7 @@ def parse_str(cls, s: str): else: raise 
ValueError( f"Invalid operator for plot filter '{s}'. " - f"Valid operators are: {set(PLOT_FILTERS)}", + f"Valid operators are: {sorted(PLOT_FILTERS)}", ) @abstractmethod @@ -124,7 +124,7 @@ def parse_str(cls, s: str): else: raise ValueError( f"Invalid operator for plot binner '{s}'. " - f"Valid operators are: {set(PLOT_BINNERS)}", + f"Valid operators are: {sorted(PLOT_BINNERS)}", ) def apply(self, df: pd.DataFrame) -> pd.DataFrame: diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py index ba7d78802e9d..327e3c7c5897 100644 --- a/vllm/benchmarks/sweep/sla_sweep.py +++ b/vllm/benchmarks/sweep/sla_sweep.py @@ -123,7 +123,7 @@ def from_record(cls, record: dict[str, str]): raise ValueError( f"Invalid operator for " f"SLA constraint '{metric_key}={metric_value}'. " - f"Valid operators are: {set(SLA_CRITERIA)}", + f"Valid operators are: {sorted(SLA_CRITERIA)}", ) return cls(sla_criteria) From e6d4c7294c53a686c51163516a04d7a428241227 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 13:27:58 +0000 Subject: [PATCH 46/48] Don't split Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/serve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index e99052247e71..c4d271a0e4d9 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -108,9 +108,9 @@ def _get_comb_base_path( ): parts = list[str]() if serve_comb: - parts.extend(("SERVE-", serve_comb.as_text(sep="-").split("-"))) + parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) if bench_comb: - parts.extend(("BENCH-", bench_comb.as_text(sep="-").split("-"))) + parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) return output_dir / sanitize_filename("-".join(parts)) From 46d9f19ab2ebc864d8a6ff22444214dd640170b0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 13:28:51 +0000 Subject: [PATCH 47/48] Remove unnecessary quotes Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/utils.py b/vllm/benchmarks/sweep/utils.py index 5a9e7d932b59..49d7867eaf48 100644 --- a/vllm/benchmarks/sweep/utils.py +++ b/vllm/benchmarks/sweep/utils.py @@ -1,4 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project def sanitize_filename(filename: str) -> str: - return filename.replace("/", "_").replace("..", "__") + return filename.replace("/", "_").replace("..", "__").strip("'").strip('"') From ceabbc81fd265f867a212c788f4f9757a737c005 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 22 Oct 2025 03:29:44 +0000 Subject: [PATCH 48/48] Update with benchmark overrides as well Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/serve.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index c4d271a0e4d9..6e408dac0b49 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -41,6 +41,19 @@ def run_server( print("[END SERVER]") +def _update_run_data( + run_data: dict[str, object], + serve_overrides: ParameterSweepItem, + bench_overrides: ParameterSweepItem, + run_number: int, +): + run_data["run_number"] = run_number + run_data.update(serve_overrides) + run_data.update(bench_overrides) + + return run_data + + def run_benchmark( server: ServerProcess | None, bench_cmd: list[str], @@ -73,7 +86,12 @@ 
def run_benchmark( with output_path.open("rb") as f: run_data = json.load(f) - return run_data + return _update_run_data( + run_data, + serve_overrides, + bench_overrides, + run_number, + ) if server is None: if not dry_run: @@ -90,8 +108,12 @@ def run_benchmark( with output_path.open("rb") as f: run_data = json.load(f) - run_data["run_number"] = run_number - run_data.update(serve_overrides) + run_data = _update_run_data( + run_data, + serve_overrides, + bench_overrides, + run_number, + ) with output_path.open("w") as f: json.dump(run_data, f, indent=4)
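
As a standalone sketch of the filter behavior after this series (FILTERS and parse_filter below are illustrative stand-ins for PLOT_FILTERS and PlotFilterBase.parse_str, trimmed to two operators):

from dataclasses import dataclass

import pandas as pd


@dataclass
class LessThanOrEqualTo:
    var: str
    target: str

    def apply(self, df: pd.DataFrame) -> pd.DataFrame:
        # Comparison filters coerce the parsed target to float.
        return df[df[self.var] <= float(self.target)]


@dataclass
class LessThan:
    var: str
    target: str

    def apply(self, df: pd.DataFrame) -> pd.DataFrame:
        return df[df[self.var] < float(self.target)]


# Longer operator keys must come first so that "<=" is matched before "<".
FILTERS = {"<=": LessThanOrEqualTo, "<": LessThan}


def parse_filter(s: str):
    for op_key, filter_cls in FILTERS.items():
        if op_key in s:
            key, _, value = s.partition(op_key)
            # Quotes around the target are stripped, mirroring parse_str.
            return filter_cls(key, value.strip("'").strip('"'))
    raise ValueError(
        f"Invalid operator for plot filter {s!r}. "
        f"Valid operators are: {sorted(FILTERS)}"
    )


df = pd.DataFrame({"max_concurrency": [8, 16, 32]})
print(parse_filter("max_concurrency<=16").apply(df))  # keeps rows 8 and 16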
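
The SLA criteria follow the same dataclass pattern; one criterion, reduced to a self-contained sketch (the real classes derive from SLACriterionBase, and the metric name used for lhs here is made up):

from dataclasses import dataclass


@dataclass
class SLALessThanOrEqualTo:
    target: float

    def validate(self, actual: float) -> bool:
        return actual <= self.target

    def format_cond(self, lhs: str) -> str:
        return f"{lhs}<={self.target:.2f}"


crit = SLALessThanOrEqualTo(target=0.5)
print(crit.format_cond("p99_e2el"))  # p99_e2el<=0.50 ("p99_e2el" is illustrative)
print(crit.validate(0.42))  # True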
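
The sanitize_filename change only strips quotes at the ends of the final name; quotes in the middle are left alone:

def sanitize_filename(filename: str) -> str:
    return filename.replace("/", "_").replace("..", "__").strip("'").strip('"')


print(sanitize_filename("'meta-llama/Llama-2-7b-chat-hf'"))
# meta-llama_Llama-2-7b-chat-hf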
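
And the net effect of _update_run_data on a run's saved JSON, with made-up values (ParameterSweepItem is assumed to behave like a mapping of parameter names to values here):

run_data = {"request_throughput": 123.4}  # as loaded from the run's JSON
serve_overrides = {"max_num_seqs": 256}
bench_overrides = {"random_input_len": 1024}

run_data["run_number"] = 0
run_data.update(serve_overrides)
run_data.update(bench_overrides)  # previously only serve overrides were merged

print(run_data)
# {'request_throughput': 123.4, 'run_number': 0,
#  'max_num_seqs': 256, 'random_input_len': 1024}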