From b5eac5245d766d9a480503d50efb03a97c482e67 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 19 Oct 2025 08:53:55 +0000 Subject: [PATCH 01/48] [Benchmark] Add plot utility for parameter sweep Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 25 +- vllm/benchmarks/sweep/__init__.py | 0 vllm/benchmarks/sweep/param_sweep.py | 83 +++ vllm/benchmarks/sweep/plot.py | 180 ++++++ vllm/benchmarks/sweep/serve.py | 824 +++++++++++++++++++++++++++ vllm/benchmarks/sweep/server.py | 114 ++++ vllm/benchmarks/sweep/sla_sweep.py | 129 +++++ 7 files changed, 1349 insertions(+), 6 deletions(-) create mode 100644 vllm/benchmarks/sweep/__init__.py create mode 100644 vllm/benchmarks/sweep/param_sweep.py create mode 100644 vllm/benchmarks/sweep/plot.py create mode 100644 vllm/benchmarks/sweep/serve.py create mode 100644 vllm/benchmarks/sweep/server.py create mode 100644 vllm/benchmarks/sweep/sla_sweep.py diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 52a16d7bdbff..157241d331bb 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -7,7 +7,7 @@ toc_depth: 4 vLLM provides comprehensive benchmarking tools for performance testing and evaluation: - **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing -- **[Batch Scripts](#batch-scripts)**: Run `vllm bench` against multiple configurations conveniently +- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations - **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development - **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives @@ -925,11 +925,11 @@ throughput numbers correctly is also adjusted. -## Batch Scripts +## Parameter Sweeps -### Batch Serving Script +### Online Benchmark -[`vllm/benchmarks/serve_multi.py`](../../vllm/benchmarks/serve_multi.py) automatically starts `vllm serve` and runs `vllm bench serve` over multiple configurations. +[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` over multiple configurations. #### Batch Mode @@ -996,7 +996,7 @@ The basic purpose of this script is to evaluate vLLM under different settings. 
Example command:

```bash
-python vllm/benchmarks/serve_multi.py \
+python vllm/benchmarks/sweep/serve.py \
 --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
 --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
 --serve-params benchmarks/serve_hparams.json \
 --bench-params benchmarks/bench_hparams.json \
 -o benchmarks/results
```

@@ -1044,7 +1044,7 @@ For example, to ensure E2E latency within different target values for 99% of req
 Example command:
 
 ```bash
-python vllm/benchmarks/serve_multi.py \
+python vllm/benchmarks/sweep/serve.py \
 --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
 --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
 --serve-params benchmarks/serve_hparams.json \
 --bench-params benchmarks/bench_hparams.json \
 --sla-params benchmarks/sla_hparams.json \
 -o benchmarks/results
```

@@ -1066,6 +1066,19 @@ The algorithm for adjusting the SLA variable is as follows:
 For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
 
+### Visualizer
+
+[`vllm/benchmarks/sweep/plot.py`](../../vllm/benchmarks/sweep/plot.py) can be used to plot performance curves from parameter sweep results.
+
+Example command:
+
+```bash
+python vllm/benchmarks/sweep/plot.py benchmarks/results/ --curve-by api_server_count,max_num_batched_tokens --fig-by random_input_len,random_output_len
+```
+
+!!! tip
+    You can use `--dry-run` to preview the figures to be created without actually plotting them.
+
 ## Performance Benchmarks
 
 The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
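The `--serve-params` and `--bench-params` files referenced in the docs above are plain JSON lists of flag-override dictionaries; the script benchmarks every combination from their Cartesian product. As a minimal sketch of what `benchmarks/serve_hparams.json` might contain (the key names below are illustrative, not prescriptive):

```json
[
    {"max_num_seqs": 32},
    {"max_num_seqs": 64, "enable_prefix_caching": false}
]
```

Keys may be spelled with either `_` or `-`, since the sweep script tries both spellings when matching CLI flags, and boolean values toggle between the `--flag` and `--no-flag` forms. An SLA file has the same shape, with each entry mapping a metric to a constraint string, e.g. `[{"p99_e2el_ms": "<=500"}]`.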
diff --git a/vllm/benchmarks/sweep/__init__.py b/vllm/benchmarks/sweep/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py
new file mode 100644
index 000000000000..bddf0b98ae29
--- /dev/null
+++ b/vllm/benchmarks/sweep/param_sweep.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import os
+from typing import Any
+
+
+class ParameterSweep(list["ParameterSweepItem"]):
+    @classmethod
+    def read_json(cls, filepath: os.PathLike):
+        with open(filepath, "rb") as f:
+            records = json.load(f)
+
+        return cls.from_records(records)
+
+    @classmethod
+    def from_records(cls, records: list[dict[str, object]]):
+        if not isinstance(records, list):
+            raise TypeError(
+                f"The parameter sweep should be a list of dictionaries, "
+                f"but found type: {type(records)}"
+            )
+
+        return cls(ParameterSweepItem.from_record(record) for record in records)
+
+
+class ParameterSweepItem(dict[str, object]):
+    @classmethod
+    def from_record(cls, record: dict[str, object]):
+        if not isinstance(record, dict):
+            raise TypeError(
+                f"Each item in the parameter sweep should be a dictionary, "
+                f"but found type: {type(record)}"
+            )
+
+        return cls(record)
+
+    def __or__(self, other: dict[str, Any]):
+        return type(self)(super().__or__(other))
+
+    # In JSON, we prefer "_"
+    def _iter_param_key_candidates(self, param_key: str):
+        yield param_key
+        yield param_key.replace("-", "_")
+        yield param_key.replace("_", "-")
+
+    # In CLI, we prefer "-"
+    def _iter_cmd_key_candidates(self, param_key: str):
+        for k in reversed(tuple(self._iter_param_key_candidates(param_key))):
+            yield "--" + k
+
+    def _normalize_cmd_key(self, param_key: str):
+        return next(self._iter_cmd_key_candidates(param_key))
+
+    def has_param(self, param_key: str) -> bool:
+        return any(k in self for k in self._iter_param_key_candidates(param_key))
+
+    def apply_to_cmd(self, cmd: list[str]) -> list[str]:
+        cmd = list(cmd)
+
+        for k, v in self.items():
+            for k_candidate in self._iter_cmd_key_candidates(k):
+                try:
+                    k_idx = cmd.index(k_candidate)
+
+                    if isinstance(v, bool):
+                        cmd[k_idx] = self._normalize_cmd_key(k if v else "no-" + k)
+                    else:
+                        cmd[k_idx + 1] = str(v)
+
+                    break
+                except ValueError:
+                    continue
+            else:
+                if isinstance(v, bool):
+                    cmd.append(self._normalize_cmd_key(k if v else "no-" + k))
+                else:
+                    cmd.extend([self._normalize_cmd_key(k), str(v)])
+
+        return cmd
+
+    def as_text(self, sep: str = ", ") -> str:
+        return sep.join(f"{k}={v}" for k, v in self.items())
diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
new file mode 100644
index 000000000000..6c9e2c9d6179
--- /dev/null
+++ b/vllm/benchmarks/sweep/plot.py
@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import json
+from pathlib import Path
+
+import pandas as pd
+import seaborn as sns
+
+from vllm.utils.collections import full_groupby
+
+
+def _json_load_bytes(path: Path) -> list[dict[str, object]]:
+    with path.open("rb") as f:
+        return json.load(f)
+
+
+def _plot_fig(
+    fig_path: Path,
+    fig_data: list[dict[str, object]],
+    curve_by: list[str],
+    *,
+    var_x: str,
+    var_y: str,
+    bin_y: float,
+    dry_run: bool,
+):
+    print("[BEGIN FIGURE]")
+    print(f"Output file: {fig_path}")
+
+    if dry_run:
+        print("[END FIGURE]")
+        return
+
+    df = pd.DataFrame.from_records(fig_data)
+    df[var_y] = df[var_y] // bin_y * bin_y
+
+    if len(curve_by) <= 3:
+        hue, style, size, *_ = (*curve_by, None, None)
+        ax = sns.lineplot(
+            df,
+            x=var_x,
+            y=var_y,
+            hue=hue,
+            style=style,
+            size=size,
+            markers=True,
+        )
+    else:
+        df["params"] = df[list(curve_by)].agg("-".join, axis=1)
+        ax = sns.lineplot(
+            df,
+            x=var_x,
+            y=var_y,
+            hue="params",
+            markers=True,
+        )
+
+    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
+
+    fig = ax.get_figure()
+    assert fig is not None
+
+    fig.tight_layout()
+    fig.savefig(fig_path)
+
+    print("[END FIGURE]")
+
+
+def plot(
+    output_dir: Path,
+    fig_by: list[str],
+    curve_by: list[str],
+    *,
+    var_x: str,
+    var_y: str,
+    bin_y: float,
+    dry_run: bool,
+):
+    all_data = [
+        run_data
+        for path in output_dir.rglob("**/summary.json")
+        for run_data in _json_load_bytes(path)
+    ]
+
+    for fig_group, fig_data in full_groupby(
+        all_data,
+        key=lambda item: tuple((k, str(item[k])) for k in fig_by),
+    ):
+        fig_path = output_dir / (
+            "-".join(
+                (
+                    "FIGURE",
+                    *(f"{k}={v}" for k, v in fig_group),
+                )
+            )
+            .replace("/", "_")
+            .replace("..", "__")  # Sanitize
+            + ".png"
+        )
+
+        _plot_fig(
+            fig_path,
+            fig_data,
+            curve_by,
+            var_x=var_x,
+            var_y=var_y,
+            bin_y=bin_y,
+            dry_run=dry_run,
+        )
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Plot performance curves from parameter sweep results."
+    )
+    parser.add_argument(
+        "OUTPUT_DIR",
+        type=str,
+        default="results",
+        help="The directory containing the results to plot. "
+        "Figures will be saved to the same directory.",
+    )
+    parser.add_argument(
+        "--curve-by",
+        type=str,
+        required=True,
+        help="A comma-separated list of variables, such that a separate curve "
+        "is created for each combination of these variables.",
+    )
+    parser.add_argument(
+        "--fig-by",
+        type=str,
+        default="",
+        help="A comma-separated list of variables, such that a separate figure "
+        "is created for each combination of these variables.",
+    )
+    parser.add_argument(
+        "--var-x",
+        type=str,
+        default="request_throughput",
+        help="The variable for the x-axis.",
+    )
+    parser.add_argument(
+        "--var-y",
+        type=str,
+        default="p99_e2el_ms",
+        help="The variable for the y-axis",
+    )
+    parser.add_argument(
+        "--bin-y",
+        type=float,
+        default=1,
+        help="Points with y-axis values in the same bin are grouped togther "
+        "to reduce noise.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="If set, prints the location of the figures without drawing them.",
+    )
+
+    args = parser.parse_args()
+
+    curve_by = [] if not args.curve_by else args.curve_by.split(",")
+    fig_by = [] if not args.fig_by else args.fig_by.split(",")
+
+    plot(
+        output_dir=Path(args.OUTPUT_DIR),
+        fig_by=fig_by,
+        curve_by=curve_by,
+        var_x=args.var_x,
+        var_y=args.var_y,
+        bin_y=args.bin_y,
+        dry_run=args.dry_run,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
new file mode 100644
index 000000000000..960c229c4999
--- /dev/null
+++ b/vllm/benchmarks/sweep/serve.py
@@ -0,0 +1,824 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import contextlib
+import json
+import math
+import shlex
+from datetime import datetime
+from pathlib import Path
+from typing import Literal, get_args
+
+import pandas as pd
+from typing_extensions import assert_never
+
+from .param_sweep import ParameterSweep, ParameterSweepItem
+from .server import ServerProcess
+from .sla_sweep import SLASweep, SLASweepItem
+
+
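As an editorial aside (not part of the patch): a small usage sketch of the `ParameterSweepItem` class defined above, assuming the package layout introduced in this series is importable:

```python
from vllm.benchmarks.sweep.param_sweep import ParameterSweepItem

item = ParameterSweepItem.from_record(
    {"max_num_seqs": 64, "enable_prefix_caching": False}
)

# Missing flags are appended; a boolean False becomes a --no- prefixed flag
cmd = item.apply_to_cmd(["vllm", "serve", "meta-llama/Llama-2-7b-chat-hf"])
print(cmd)
# ['vllm', 'serve', 'meta-llama/Llama-2-7b-chat-hf',
#  '--max-num-seqs', '64', '--no-enable-prefix-caching']

# Flags already present in the command are overridden in place
cmd = item.apply_to_cmd(
    ["vllm", "serve", "meta-llama/Llama-2-7b-chat-hf", "--max-num-seqs", "16"]
)
print(cmd)
# ['vllm', 'serve', 'meta-llama/Llama-2-7b-chat-hf',
#  '--max-num-seqs', '64', '--no-enable-prefix-caching']
```

This is the mechanism `serve.py` below uses to derive the concrete `vllm serve` and `vllm bench serve` commands for each parameter combination.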
+@contextlib.contextmanager +def _run_server( + serve_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_overrides: ParameterSweepItem, + dry_run: bool, +): + server_cmd = serve_overrides.apply_to_cmd(serve_cmd) + + print("[BEGIN SERVER]") + print(f"Server overrides: {serve_overrides}") + print(f"Server command: {server_cmd}") + + if dry_run: + yield None + print("[END SERVER]") + return + + with ServerProcess(server_cmd, after_bench_cmd, show_stdout=show_stdout) as server: + yield server + + print("[END SERVER]") + + +def _run_benchmark( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_overrides: ParameterSweepItem, + bench_overrides: ParameterSweepItem, + run_number: int, + output_path: Path, + dry_run: bool, +): + benchmark_cmd = [ + *bench_overrides.apply_to_cmd(bench_cmd), + "--save-result", + "--result-dir", + str(output_path.parent), + "--result-filename", + output_path.name, + ] + + print("[BEGIN BENCHMARK]") + print(f"Benchmark overrides: {bench_overrides}") + print(f"Run Number: {run_number}") + print(f"Benchmark command: {benchmark_cmd}") + print(f"Output file: {output_path}") + + run_data: dict[str, object] + + if output_path.exists(): + print("Found existing results. Skipping.") + + with output_path.open("rb") as f: + run_data = json.load(f) + return run_data + + if server is None: + assert dry_run + print("[END BENCHMARK]") + return None + + output_path.parent.mkdir(parents=True, exist_ok=True) + + server.run_subcommand(benchmark_cmd) + server.after_bench() + + with output_path.open("rb") as f: + run_data = json.load(f) + + run_data["run_number"] = run_number + run_data.update(serve_overrides) + + with output_path.open("w") as f: + json.dump(run_data, f, indent=4) + + print("[END BENCHMARK]") + + return run_data + + +def _get_comb_base_path( + output_dir: Path, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, +): + return output_dir / "-".join( + ( + "SERVE", + serve_comb.as_text(sep="-"), + "BENCH", + bench_comb.as_text(sep="-"), + ) + ).replace("/", "_").replace("..", "__") # Sanitize + + +def _get_comb_run_path(base_path: Path, run_number: int | None): + if run_number is None: + return base_path / "summary.json" + + return base_path / f"run={run_number}.json" + + +def _comb_needs_server( + serve_comb: ParameterSweepItem, + bench_combs: ParameterSweep, + output_dir: Path, +): + for bench_comb in bench_combs: + base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) + if not _get_comb_run_path(base_path, run_number=None).exists(): + return True + + return False + + +def _run_comb( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, +): + comb_data = list[dict[str, object]]() + + for run_number in range(num_runs): + run_data = _run_benchmark( + server, + bench_cmd, + serve_overrides=serve_comb, + bench_overrides=bench_comb, + run_number=run_number, + output_path=_get_comb_run_path(base_path, run_number), + dry_run=dry_run, + ) + + if run_data is not None: + comb_data.append(run_data) + + if dry_run: + return None + + with _get_comb_run_path(base_path, run_number=None).open("w") as f: + json.dump(comb_data, f, indent=4) + + return comb_data + + +def run_combs( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + output_dir: Path, + num_runs: int, + 
dry_run: bool, +): + all_data = list[dict[str, object]]() + for serve_comb in serve_params: + with ( + _run_server( + serve_cmd, + after_bench_cmd, + show_stdout=show_stdout, + serve_overrides=serve_comb, + dry_run=dry_run, + ) + if _comb_needs_server(serve_comb, bench_params, output_dir) + else contextlib.nullcontext() + ) as server: + for bench_comb in bench_params: + base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) + + comb_data = _run_comb( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + ) + + if comb_data is not None: + all_data.extend(comb_data) + + if dry_run: + return None + + combined_df = pd.DataFrame.from_records(all_data) + combined_df.to_csv(output_dir / "summary.csv") + + return combined_df + + +def _get_sla_base_path( + output_dir: Path, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, +): + return output_dir / "-".join( + ( + "SERVE", + serve_comb.as_text(sep="-"), + "BENCH", + bench_comb.as_text(sep="-"), + ) + ).replace("/", "_").replace("..", "__") # Sanitize + + +def _get_sla_iter_path( + base_path: Path, + sla_comb: SLASweepItem, + sla_variable: str, + sla_value: int | None, +): + if sla_value is None: + prefix = sla_comb.as_text(sep="-") + return base_path / f"SLA-{prefix}.json" + + return base_path / f"{sla_variable}={sla_value}" + + +def _get_sla_run_path(iter_path: Path, run_number: int | None): + if run_number is None: + return iter_path / "summary.json" + + return iter_path / f"run={run_number}.json" + + +def _sla_needs_server( + serve_comb: ParameterSweepItem, + bench_combs: ParameterSweep, + sla_combs: SLASweep, + sla_variable: str, + output_dir: Path, +): + for bench_comb in bench_combs: + base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) + for sla_comb in sla_combs: + if not _get_sla_iter_path( + base_path, + sla_comb, + sla_variable, + sla_value=None, + ).exists(): + return True + + return False + + +def _run_sla( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + iter_path: Path, + num_runs: int, + dry_run: bool, +): + iter_data = list[dict[str, object]]() + + for run_number in range(num_runs): + run_data = _run_benchmark( + server, + bench_cmd, + serve_overrides=serve_comb, + bench_overrides=bench_comb, + run_number=run_number, + output_path=_get_sla_run_path(iter_path, run_number), + dry_run=dry_run, + ) + + if run_data is not None: + iter_data.append(run_data) + + if dry_run: + return None + + with _get_sla_run_path(iter_path, run_number=None).open("w") as f: + json.dump(iter_data, f, indent=4) + + return iter_data + + +SLAVariable = Literal["request_rate", "max_concurrency"] + + +def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable): + request_throughput = float(run_data["request_throughput"]) # type: ignore + if sla_variable == "request_rate": + return request_throughput + if sla_variable == "max_concurrency": + mean_latency_ms = float(run_data["mean_e2el_ms"]) # type: ignore + return request_throughput * mean_latency_ms / 1000 + + assert_never(sla_variable) + + +def _estimate_sla_bounds( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, + sla_variable: SLAVariable, + init_value: int, + max_value: int, +): + sla_data = list[dict[str, object]]() + + 
max_passing: int = 0 + min_failing: int = 0 + + val: int = init_value + assert val > 0 + + while True: + print(f"Testing {sla_variable}: {val} req/s") + + iter_data = _run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: val}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), + num_runs=num_runs, + dry_run=dry_run, + ) + + assert iter_data is not None + sla_data.extend(iter_data) + + iter_data_mean = { + k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore + for k in sla_comb + } + + sla_results = [ + criterion.print_and_validate(iter_data_mean, k) + for k, criterion in sla_comb.items() + ] + + if all(sla_results): + print("SLA criteria are met.") + max_passing = val + val *= 2 + else: + print("SLA criteria are not met.") + min_failing = val + break + + if val >= max_value: + break + + return sla_data, (max_passing, min_failing) + + +def _find_sla_value( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, + sla_variable: SLAVariable, + min_value: int, + max_value: int, +): + sla_data = list[dict[str, object]]() + + left: int = min_value + right: int = max_value + + while True: + val = (left + right) // 2 + print(f"Testing {sla_variable}: {val} req/s") + + iter_data = _run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: val}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), + num_runs=num_runs, + dry_run=dry_run, + ) + + assert iter_data is not None + sla_data.extend(iter_data) + + iter_data_mean = { + k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore + for k in sla_comb + } + + sla_results = [ + criterion.print_and_validate(iter_data_mean, k) + for k, criterion in sla_comb.items() + ] + + if all(sla_results): + print("SLA criteria are met.") + left = val + else: + print("SLA criteria are not met.") + right = val + + if right - left <= 1: + break + + return sla_data, left + + +def _search_sla( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + sla_variable: SLAVariable, + sla_inf_value: int = 65536, # The value that represents infinite QPS + base_path: Path, + num_runs: int, + dry_run: bool, +): + print("[SLA START]") + print(f"SLA criteria: {sla_comb.as_text()}") + + sla_data_0 = _run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: sla_inf_value}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value), + num_runs=num_runs, + dry_run=dry_run, + ) + if sla_data_0 is None: + assert dry_run + print("Omitting SLA search.") + print("[SLA END]") + return None + + sla_init_value = math.ceil( + sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0) + / len(sla_data_0) + ) + print(f"Initial {sla_variable} to search: {sla_init_value} req/s.") + + sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + sla_variable=sla_variable, + init_value=sla_init_value, + max_value=sla_inf_value, + ) + print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.") + + sla_data_2, sla_value = _find_sla_value( 
+ server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + sla_variable=sla_variable, + min_value=sla_min, + max_value=sla_max, + ) + + sla_data = sla_data_0 + sla_data_1 + sla_data_2 + print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.") + + with _get_sla_iter_path( + base_path, + sla_comb, + sla_variable, + sla_value=None, + ).open("w") as f: + json.dump(sla_data, f, indent=4) + + print("[SLA END]") + + return sla_data + + +def run_slas( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + sla_params: SLASweep, + sla_variable: SLAVariable, + output_dir: Path, + num_runs: int, + dry_run: bool, +): + if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params): + raise ValueError( + f"You should not override `{sla_variable}` in `bench_params` in SLA mode, " + "since it is supposed to be determined automatically." + ) + + all_data = list[dict[str, object]]() + for serve_comb in serve_params: + with ( + _run_server( + serve_cmd, + after_bench_cmd, + show_stdout=show_stdout, + serve_overrides=serve_comb, + dry_run=dry_run, + ) + if _sla_needs_server( + serve_comb, + bench_params, + sla_params, + sla_variable, + output_dir, + ) + else contextlib.nullcontext() + ) as server: + for bench_comb in bench_params: + for sla_comb in sla_params: + base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) + + comb_data = _search_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + sla_variable=sla_variable, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + ) + + if comb_data is not None: + all_data.extend(comb_data) + + if dry_run: + return None + + combined_df = pd.DataFrame.from_records(all_data) + combined_df.to_csv(output_dir / "summary.csv") + + return combined_df + + +def _run_main( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + sla_params: SLASweep, + sla_variable: SLAVariable, + output_dir: Path, + num_runs: int, + dry_run: bool, +): + if sla_params: + return run_slas( + serve_cmd=serve_cmd, + bench_cmd=bench_cmd, + after_bench_cmd=after_bench_cmd, + show_stdout=show_stdout, + serve_params=serve_params, + bench_params=bench_params, + sla_params=sla_params, + sla_variable=sla_variable, + output_dir=output_dir, + num_runs=num_runs, + dry_run=dry_run, + ) + + return run_combs( + serve_cmd=serve_cmd, + bench_cmd=bench_cmd, + after_bench_cmd=after_bench_cmd, + show_stdout=show_stdout, + serve_params=serve_params, + bench_params=bench_params, + output_dir=output_dir, + num_runs=num_runs, + dry_run=dry_run, + ) + + +def run_main( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + sla_params: SLASweep, + sla_variable: SLAVariable, + output_dir: Path, + num_runs: int, + dry_run: bool, + resume: str | None, +): + timestamp = resume or datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = output_dir / timestamp + + if resume and not output_dir.exists(): + raise ValueError(f"Cannot resume from non-existent directory ({output_dir})") + + try: + return _run_main( + serve_cmd=serve_cmd, + bench_cmd=bench_cmd, + after_bench_cmd=after_bench_cmd, + 
+            show_stdout=show_stdout,
+            serve_params=serve_params,
+            bench_params=bench_params,
+            sla_params=sla_params,
+            sla_variable=sla_variable,
+            output_dir=output_dir,
+            num_runs=num_runs,
+            dry_run=dry_run,
+        )
+    except BaseException as exc:
+        raise RuntimeError(
+            f"The script was terminated early. Use `--resume {timestamp}` "
+            f"to continue the script from its last checkpoint."
+        ) from exc
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run vLLM server benchmark under multiple settings."
+    )
+    parser.add_argument(
+        "--serve-cmd",
+        type=str,
+        required=True,
+        help="The command used to run the server: `vllm serve ...`",
+    )
+    parser.add_argument(
+        "--bench-cmd",
+        type=str,
+        required=True,
+        help="The command used to run the benchmark: `vllm bench serve ...`",
+    )
+    parser.add_argument(
+        "--after-bench-cmd",
+        type=str,
+        default=None,
+        help="After a benchmark run is complete, invoke this command instead of the "
+        "default `ServerProcess.reset_caches()`.",
+    )
+    parser.add_argument(
+        "--show-stdout",
+        action="store_true",
+        help="If set, logs the standard output of subcommands. "
+        "Useful for debugging but can be quite spammy.",
+    )
+    parser.add_argument(
+        "--serve-params",
+        type=str,
+        default=None,
+        help="Path to JSON file containing a list of parameter combinations "
+        "for the `vllm serve` command. "
+        "If both `serve_params` and `bench_params` are given, "
+        "this script will iterate over their Cartesian product.",
+    )
+    parser.add_argument(
+        "--bench-params",
+        type=str,
+        default=None,
+        help="Path to JSON file containing a list of parameter combinations "
+        "for the `vllm bench serve` command. "
+        "If both `serve_params` and `bench_params` are given, "
+        "this script will iterate over their Cartesian product.",
+    )
+    parser.add_argument(
+        "--sla-params",
+        type=str,
+        default=None,
+        help="Path to JSON file containing a list of SLA constraints to satisfy. "
+        'Each constraint is expressed in `{"<metric_key>": "<op><value>"}` format, '
+        'e.g.: `{"p99_e2el_ms": "<=500"}` means that '
+        "the E2E latency should be less than 500ms 99%% of the time. 
" + "Setting this option runs this script in SLA mode, which searches for the " + "maximum `sla_variable` that satisfies the constraints for each combination " + "of `serve_params`, `bench_params`, and `sla_params`.", + ) + parser.add_argument( + "--sla-variable", + type=str, + choices=get_args(SLAVariable), + default="request_rate", + help="Whether to tune request rate or maximum concurrency to satisfy " + "the SLA constraints.", + ) + parser.add_argument( + "-o", + "--output-dir", + type=str, + default="results", + help="The directory to which results are written.", + ) + parser.add_argument( + "--num-runs", + type=int, + default=3, + help="Number of runs per parameter combination.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If set, prints the commands to run then exits without running them.", + ) + parser.add_argument( + "--resume", + type=str, + default=None, + help="Set this to the name of a directory under `output_dir` (which is a " + "timestamp) to resume a previous execution of this script, i.e., only run " + "parameter combinations for which there are still no output files.", + ) + + args = parser.parse_args() + + serve_cmd = shlex.split(args.serve_cmd) + bench_cmd = shlex.split(args.bench_cmd) + after_bench_cmd = ( + [] if args.after_bench_cmd is None else shlex.split(args.after_bench_cmd) + ) + + if args.serve_params: + serve_params = ParameterSweep.read_json(args.serve_params) + else: + # i.e.: run serve_cmd without any modification + serve_params = ParameterSweep.from_records([{}]) + + if args.bench_params: + bench_params = ParameterSweep.read_json(args.bench_params) + else: + # i.e.: run bench_cmd without any modification + bench_params = ParameterSweep.from_records([{}]) + + if args.sla_params: + sla_params = SLASweep.read_json(args.sla_params) + else: + sla_params = SLASweep.from_records([]) + + num_runs = args.num_runs + if num_runs < 1: + raise ValueError("`num_runs` should be at least 1.") + + run_main( + serve_cmd=serve_cmd, + bench_cmd=bench_cmd, + after_bench_cmd=after_bench_cmd, + show_stdout=args.show_stdout, + serve_params=serve_params, + bench_params=bench_params, + sla_params=sla_params, + sla_variable=args.sla_variable, + output_dir=Path(args.output_dir), + num_runs=num_runs, + dry_run=args.dry_run, + resume=args.resume, + ) + + +if __name__ == "__main__": + main() diff --git a/vllm/benchmarks/sweep/server.py b/vllm/benchmarks/sweep/server.py new file mode 100644 index 000000000000..f17578726415 --- /dev/null +++ b/vllm/benchmarks/sweep/server.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib +import os +import signal +import subprocess +from types import TracebackType + +import requests +from typing_extensions import Self + + +class ServerProcess: + def __init__( + self, + server_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + ) -> None: + super().__init__() + + self.server_cmd = server_cmd + self.after_bench_cmd = after_bench_cmd + self.show_stdout = show_stdout + + def __enter__(self) -> Self: + self.start() + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + self.stop() + + def start(self): + # Create new process for clean termination + self._server_process = subprocess.Popen( + self.server_cmd, + start_new_session=True, + stdout=None if self.show_stdout else subprocess.DEVNULL, + # Need 
`VLLM_SERVER_DEV_MODE=1` for `_reset_caches` + env=os.environ | {"VLLM_SERVER_DEV_MODE": "1"}, + ) + + def stop(self): + server_process = self._server_process + + if server_process.poll() is None: + # In case only some processes have been terminated + with contextlib.suppress(ProcessLookupError): + # We need to kill both API Server and Engine processes + os.killpg(os.getpgid(server_process.pid), signal.SIGKILL) + + def run_subcommand(self, cmd: list[str]): + return subprocess.run( + cmd, + stdout=None if self.show_stdout else subprocess.DEVNULL, + check=True, + ) + + def after_bench(self) -> None: + if not self.after_bench_cmd: + self.reset_caches() + return + + self.run_subcommand(self.after_bench_cmd) + + def _get_vllm_server_address(self) -> str: + server_cmd = self.server_cmd + + for host_key in ("--host",): + if host_key in server_cmd: + host = server_cmd[server_cmd.index(host_key) + 1] + break + else: + host = "localhost" + + for port_key in ("-p", "--port"): + if port_key in server_cmd: + port = int(server_cmd[server_cmd.index(port_key) + 1]) + break + else: + port = 8000 # The default value in vllm serve + + return f"http://{host}:{port}" + + def reset_caches(self) -> None: + server_cmd = self.server_cmd + + # Use `.endswith()` to match `/bin/...` + if server_cmd[0].endswith("vllm"): + server_address = self._get_vllm_server_address() + print(f"Resetting caches at {server_address}") + + res = requests.post(f"{server_address}/reset_prefix_cache") + res.raise_for_status() + + res = requests.post(f"{server_address}/reset_mm_cache") + res.raise_for_status() + elif server_cmd[0].endswith("infinity_emb"): + if "--vector-disk-cache" in server_cmd: + raise NotImplementedError( + "Infinity server uses caching but does not expose a method " + "to reset the cache" + ) + else: + raise NotImplementedError( + f"No implementation of `reset_caches` for `{server_cmd[0]}` server. " + "Please specify a custom command via `--after-bench-cmd`." 
+ ) diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py new file mode 100644 index 000000000000..6a58b7149a02 --- /dev/null +++ b/vllm/benchmarks/sweep/sla_sweep.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json +import os +from abc import ABC, abstractmethod + +from typing_extensions import override + + +class SLACriterionBase(ABC): + def __init__(self, target: float) -> None: + super().__init__() + + self.target = target + + @abstractmethod + def validate(self, actual: float) -> bool: + """Return `True` if this criterion is met; otherwise `False`.""" + raise NotImplementedError + + @abstractmethod + def format_cond(self, lhs: str) -> str: + raise NotImplementedError + + def print_and_validate( + self, + metrics: dict[str, float], + metrics_key: str, + ) -> bool: + metric = metrics[metrics_key] + result = self.validate(metric) + + cond = self.format_cond(f"{metrics_key} = {metric:.2f}") + print(f"Validating SLA: {cond} | " + ("PASSED" if result else "FAILED")) + + return result + + +class SLALessThan(SLACriterionBase): + @override + def validate(self, actual: float) -> bool: + return actual < self.target + + @override + def format_cond(self, lhs: str) -> str: + return f"{lhs}<{self.target:.2f}" + + +class SLALessThanOrEqual(SLACriterionBase): + @override + def validate(self, actual: float) -> bool: + return actual <= self.target + + @override + def format_cond(self, lhs: str) -> str: + return f"{lhs}<={self.target:.2f}" + + +class SLAGreaterThan(SLACriterionBase): + @override + def validate(self, actual: float) -> bool: + return actual > self.target + + @override + def format_cond(self, lhs: str) -> str: + return f"{lhs}>{self.target:.2f}" + + +class SLAGreaterThanOrEqual(SLACriterionBase): + @override + def validate(self, actual: float) -> bool: + return actual >= self.target + + @override + def format_cond(self, lhs: str) -> str: + return f"{lhs}>={self.target:.2f}" + + +# NOTE: The ordering is important! Match longer op_keys first +SLA_CRITERIA: dict[str, type[SLACriterionBase]] = { + "<=": SLALessThanOrEqual, + ">=": SLAGreaterThanOrEqual, + "<": SLALessThan, + ">": SLAGreaterThan, +} + + +class SLASweep(list["SLASweepItem"]): + @classmethod + def read_json(cls, filepath: os.PathLike): + with open(filepath, "rb") as f: + records = json.load(f) + + return cls.from_records(records) + + @classmethod + def from_records(cls, records: list[dict[str, str]]): + if not isinstance(records, list): + raise TypeError( + f"The SLA sweep should be a list of dictionaries, " + f"but found type: {type(records)}" + ) + + return cls(SLASweepItem.from_record(record) for record in records) + + +class SLASweepItem(dict[str, SLACriterionBase]): + @classmethod + def from_record(cls, record: dict[str, str]): + sla_criteria: dict[str, SLACriterionBase] = {} + + for metric_key, metric_value in record.items(): + for op_key in SLA_CRITERIA: + if metric_value.startswith(op_key): + sla_criteria[metric_key] = SLA_CRITERIA[op_key]( + float(metric_value.removeprefix(op_key)) + ) + break + else: + raise ValueError( + f"Invalid operator for " + f"SLA constraint '{metric_key}={metric_value}'. 
" + f"Valid operators are: {set(SLA_CRITERIA)}", + ) + + return cls(sla_criteria) + + def as_text(self, sep: str = ", ") -> str: + return sep.join(v.format_cond(k) for k, v in self.items()) From 7f93c36d6bea4c04ab9f78133f31fd3f1202d295 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 03:45:03 +0000 Subject: [PATCH 02/48] Update Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 6c9e2c9d6179..2cc8369bfef6 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -22,7 +22,8 @@ def _plot_fig( *, var_x: str, var_y: str, - bin_y: float, + max_x: float | None, + bin_x: float | None, dry_run: bool, ): print("[BEGIN FIGURE]") @@ -33,7 +34,12 @@ def _plot_fig( return df = pd.DataFrame.from_records(fig_data) - df[var_y] = df[var_y] // bin_y * bin_y + + if max_x is not None: + df = df[df[var_x] <= max_x] + + if bin_x is not None: + df[var_x] = df[var_x] // bin_x * bin_x if len(curve_by) <= 3: hue, style, size, *_ = (*curve_by, None, None) @@ -74,7 +80,8 @@ def plot( *, var_x: str, var_y: str, - bin_y: float, + max_x: float | None, + bin_x: float | None, dry_run: bool, ): all_data = [ @@ -105,7 +112,8 @@ def plot( curve_by, var_x=var_x, var_y=var_y, - bin_y=bin_y, + max_x=max_x, + bin_x=bin_x, dry_run=dry_run, ) @@ -148,10 +156,16 @@ def main(): help="The variable for the y-axis", ) parser.add_argument( - "--bin-y", + "--max-x", + type=float, + default=None, + help="The maximum value to plot for the x-axis.", + ) + parser.add_argument( + "--bin-x", type=float, - default=1, - help="Points with y-axis values in the same bin are grouped togther " + default=None, + help="Group together points with x-axis values in the same bin " "to reduce noise.", ) parser.add_argument( @@ -171,7 +185,8 @@ def main(): curve_by=curve_by, var_x=args.var_x, var_y=args.var_y, - bin_y=args.bin_y, + max_x=args.max_x, + bin_x=args.bin_x, dry_run=args.dry_run, ) From d52e9b9dce8ec58d5662b494f9d6af21cdd5f371 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 03:59:47 +0000 Subject: [PATCH 03/48] Add log plot Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 2cc8369bfef6..684ee38c00be 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -24,6 +24,7 @@ def _plot_fig( var_y: str, max_x: float | None, bin_x: float | None, + log_y: bool, dry_run: bool, ): print("[BEGIN FIGURE]") @@ -62,6 +63,9 @@ def _plot_fig( markers=True, ) + if log_y: + ax.set_yscale("log") + sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) fig = ax.get_figure() @@ -82,6 +86,7 @@ def plot( var_y: str, max_x: float | None, bin_x: float | None, + log_y: bool, dry_run: bool, ): all_data = [ @@ -114,6 +119,7 @@ def plot( var_y=var_y, max_x=max_x, bin_x=bin_x, + log_y=log_y, dry_run=dry_run, ) @@ -168,6 +174,11 @@ def main(): help="Group together points with x-axis values in the same bin " "to reduce noise.", ) + parser.add_argument( + "--log-y", + action="store_true", + help="Use logarithmic scaling for the y-axis.", + ) parser.add_argument( "--dry-run", action="store_true", @@ -187,6 +198,7 @@ def main(): var_y=args.var_y, max_x=args.max_x, bin_x=args.bin_x, + log_y=args.log_y, dry_run=args.dry_run, ) From 2f96852ccb97a545dd6e101e5a7278d174786100 Mon 
Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 04:20:51 +0000
Subject: [PATCH 04/48] Fix multifigure

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/sweep/plot.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 684ee38c00be..5905807baa61 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -4,6 +4,7 @@
 import json
 from pathlib import Path
 
+import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 
@@ -73,6 +74,7 @@ def _plot_fig(
 
     fig.tight_layout()
     fig.savefig(fig_path)
+    plt.close(fig)
 
     print("[END FIGURE]")
 

From fcf156b3cea9460805c3ab5c75dd2c10a24153aa Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 04:27:36 +0000
Subject: [PATCH 05/48] Update command

Signed-off-by: DarkLight1337
---
 docs/contributing/benchmarks.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md
index 157241d331bb..ae8582113f26 100644
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -1073,7 +1073,11 @@
 Example command:
 
 ```bash
-python vllm/benchmarks/sweep/plot.py benchmarks/results/ --curve-by api_server_count,max_num_batched_tokens --fig-by random_input_len,random_output_len
+python vllm/benchmarks/sweep/plot.py benchmarks/results/ \
+    --var-x max_concurrency \
+    --max-x 1024 \
+    --curve-by api_server_count,max_num_batched_tokens \
+    --fig-by random_input_len,random_output_len
 ```
 
 !!! tip

From ad14a53c4d3a11da55aca756c37ab3921815812e Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 04:32:22 +0000
Subject: [PATCH 06/48] Add title

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/sweep/plot.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 5905807baa61..fc4a94d26ee1 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -18,6 +18,7 @@ def _json_load_bytes(path: Path) -> list[dict[str, object]]:
 
 def _plot_fig(
     fig_path: Path,
+    fig_title: str,
     fig_data: list[dict[str, object]],
     curve_by: list[str],
     *,
@@ -64,6 +65,8 @@ def _plot_fig(
             markers=True,
         )
 
+    ax.set_title(fig_title)
+
     if log_y:
         ax.set_yscale("log")
 
@@ -101,6 +104,8 @@ def plot(
         all_data,
         key=lambda item: tuple((k, str(item[k])) for k in fig_by),
     ):
+        fig_group = tuple(fig_group)
+
         fig_path = output_dir / (
             "-".join(
                 (
@@ -112,9 +117,13 @@ def plot(
             .replace("..", "__")  # Sanitize
             + ".png"
         )
+        fig_title = (
+            ", ".join(f"{k}={v}" for k, v in fig_group) if fig_group else "(All data)"
+        )
 
         _plot_fig(
             fig_path,
+            fig_title,
             fig_data,
             curve_by,
             var_x=var_x,

From 08fab86ad8bd4809f0d90faab428619e54a92c80 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 04:41:39 +0000
Subject: [PATCH 07/48] Support file prefix

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/sweep/plot.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index fc4a94d26ee1..bbdc534fcfaf 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -92,6 +92,7 @@ def plot(
     max_x: float | None,
     bin_x: float | None,
     log_y: bool,
+    file_prefix: str,
     dry_run: bool,
 ):
     all_data = [
@@ -107,7 +108,8 @@ def plot(
         fig_group = tuple(fig_group)
 
         fig_path = output_dir / (
-            "-".join(
+            file_prefix
+            + "-".join(
                 (
                     "FIGURE",
                     *(f"{k}={v}" for 
k, v in fig_group), @@ -190,6 +192,13 @@ def main(): action="store_true", help="Use logarithmic scaling for the y-axis.", ) + parser.add_argument( + "--file-prefix", + type=str, + default="", + help="If set, prepends this to the filename of the saved figures to " + "distinguish them from other runs of this script.", + ) parser.add_argument( "--dry-run", action="store_true", @@ -210,6 +219,7 @@ def main(): max_x=args.max_x, bin_x=args.bin_x, log_y=args.log_y, + file_prefix=args.file_prefix, dry_run=args.dry_run, ) From bc04f307b0d78accb92f7d583171fd13647a1889 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 04:42:22 +0000 Subject: [PATCH 08/48] Separate Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index bbdc534fcfaf..51a8305ac830 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -109,6 +109,7 @@ def plot( fig_path = output_dir / ( file_prefix + + ("--" if file_prefix else "") + "-".join( ( "FIGURE", From 1e13493a83eb324db92b3ef9502477c7675d4cc7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 04:44:13 +0000 Subject: [PATCH 09/48] Improve separation Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 +- vllm/benchmarks/sweep/serve.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 51a8305ac830..0291be80de38 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -112,7 +112,7 @@ def plot( + ("--" if file_prefix else "") + "-".join( ( - "FIGURE", + "FIGURE-", *(f"{k}={v}" for k, v in fig_group), ) ) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index 960c229c4999..5751e326578b 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -108,9 +108,9 @@ def _get_comb_base_path( ): return output_dir / "-".join( ( - "SERVE", + "SERVE-", serve_comb.as_text(sep="-"), - "BENCH", + "BENCH-", bench_comb.as_text(sep="-"), ) ).replace("/", "_").replace("..", "__") # Sanitize @@ -228,9 +228,9 @@ def _get_sla_base_path( ): return output_dir / "-".join( ( - "SERVE", + "SERVE-", serve_comb.as_text(sep="-"), - "BENCH", + "BENCH-", bench_comb.as_text(sep="-"), ) ).replace("/", "_").replace("..", "__") # Sanitize @@ -244,7 +244,7 @@ def _get_sla_iter_path( ): if sla_value is None: prefix = sla_comb.as_text(sep="-") - return base_path / f"SLA-{prefix}.json" + return base_path / f"SLA--{prefix}.json" return base_path / f"{sla_variable}={sla_value}" From c848b10945018f5a0d074afb59825f1e2f4c4351 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 08:58:05 +0000 Subject: [PATCH 10/48] Set by directory, not prefix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 0291be80de38..ff72802431cc 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -84,6 +84,7 @@ def _plot_fig( def plot( output_dir: Path, + fig_dir: Path, fig_by: list[str], curve_by: list[str], *, @@ -92,7 +93,6 @@ def plot( max_x: float | None, bin_x: float | None, log_y: bool, - file_prefix: str, dry_run: bool, ): all_data = [ @@ -107,10 +107,8 @@ def plot( ): fig_group = tuple(fig_group) - fig_path = output_dir / ( - file_prefix - + ("--" if 
file_prefix else "") - + "-".join( + fig_path = fig_dir / ( + "-".join( ( "FIGURE-", *(f"{k}={v}" for k, v in fig_group), @@ -146,8 +144,15 @@ def main(): "OUTPUT_DIR", type=str, default="results", - help="The directory containing the results to plot. " - "Figures will be saved to the same directory.", + help="The directory containing the results to plot, " + "i.e., the `--output-dir` argument to the parameter sweep script.", + ) + parser.add_argument( + "--fig-dir", + type=str, + default=None, + help="The directory to save the figures. " + "By default, this is set to `OUTPUT_DIR`.", ) parser.add_argument( "--curve-by", @@ -193,13 +198,6 @@ def main(): action="store_true", help="Use logarithmic scaling for the y-axis.", ) - parser.add_argument( - "--file-prefix", - type=str, - default="", - help="If set, prepends this to the filename of the saved figures to " - "distinguish them from other runs of this script.", - ) parser.add_argument( "--dry-run", action="store_true", @@ -213,6 +211,7 @@ def main(): plot( output_dir=Path(args.OUTPUT_DIR), + fig_dir=Path(args.fig_dir or args.OUTPUT_DIR), fig_by=fig_by, curve_by=curve_by, var_x=args.var_x, @@ -220,7 +219,6 @@ def main(): max_x=args.max_x, bin_x=args.bin_x, log_y=args.log_y, - file_prefix=args.file_prefix, dry_run=args.dry_run, ) From f7f36f2767d1b04aa4aec11ad7c0f77940e2a37d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 09:25:04 +0000 Subject: [PATCH 11/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index ff72802431cc..4b38ab49e3ca 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -101,6 +101,8 @@ def plot( for run_data in _json_load_bytes(path) ] + fig_dir.mkdir(parents=True, exist_ok=True) + for fig_group, fig_data in full_groupby( all_data, key=lambda item: tuple((k, str(item[k])) for k in fig_by), From c6cb78a2b05ba401f06d7ac4b92c0cf2bb3512e5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 09:32:18 +0000 Subject: [PATCH 12/48] Plot in parallel Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 71 ++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 4b38ab49e3ca..759705bbc2cb 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json +from concurrent.futures import Future, ProcessPoolExecutor from pathlib import Path import matplotlib.pyplot as plt @@ -103,39 +104,49 @@ def plot( fig_dir.mkdir(parents=True, exist_ok=True) - for fig_group, fig_data in full_groupby( - all_data, - key=lambda item: tuple((k, str(item[k])) for k in fig_by), - ): - fig_group = tuple(fig_group) - - fig_path = fig_dir / ( - "-".join( - ( - "FIGURE-", - *(f"{k}={v}" for k, v in fig_group), + with ProcessPoolExecutor() as pool: + tasks = list[Future[None]]() + + for fig_group, fig_data in full_groupby( + all_data, + key=lambda item: tuple((k, str(item[k])) for k in fig_by), + ): + fig_group = tuple(fig_group) + + fig_path = fig_dir / ( + "-".join( + ( + "FIGURE-", + *(f"{k}={v}" for k, v in fig_group), + ) ) + .replace("/", "_") + .replace("..", "__") # Sanitize + + ".png" + ) + fig_title = ( + ", ".join(f"{k}={v}" for k, v in fig_group) + if fig_group + else "(All data)" ) - .replace("/", "_") - 
.replace("..", "__") # Sanitize - + ".png" - ) - fig_title = ( - ", ".join(f"{k}={v}" for k, v in fig_group) if fig_group else "(All data)" - ) - _plot_fig( - fig_path, - fig_title, - fig_data, - curve_by, - var_x=var_x, - var_y=var_y, - max_x=max_x, - bin_x=bin_x, - log_y=log_y, - dry_run=dry_run, - ) + task = pool.submit( + _plot_fig, + fig_path, + fig_title, + fig_data, + curve_by, + var_x=var_x, + var_y=var_y, + max_x=max_x, + bin_x=bin_x, + log_y=log_y, + dry_run=dry_run, + ) + tasks.append(task) + + for f in tasks: + f.result() def main(): From 4cc5e90a91556426862847c377c05854e9cb1c9f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 09:39:30 +0000 Subject: [PATCH 13/48] Clean up Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 96 +++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 37 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 759705bbc2cb..d268d58b6d87 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -2,7 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json -from concurrent.futures import Future, ProcessPoolExecutor +from collections.abc import Iterable +from concurrent.futures import ProcessPoolExecutor +from functools import partial from pathlib import Path import matplotlib.pyplot as plt @@ -83,6 +85,51 @@ def _plot_fig( print("[END FIGURE]") +def _plot_fig_by_group( + fig_dir: Path, + fig_group_data: tuple[Iterable[tuple[str, str]], list[dict[str, object]]], + curve_by: list[str], + *, + var_x: str, + var_y: str, + max_x: float | None, + bin_x: float | None, + log_y: bool, + dry_run: bool, +): + fig_group, fig_data = fig_group_data + + fig_group = tuple(fig_group) + + fig_path = fig_dir / ( + "-".join( + ( + "FIGURE-", + *(f"{k}={v}" for k, v in fig_group), + ) + ) + .replace("/", "_") + .replace("..", "__") # Sanitize + + ".png" + ) + fig_title = ( + ", ".join(f"{k}={v}" for k, v in fig_group) if fig_group else "(All data)" + ) + + return _plot_fig( + fig_path, + fig_title, + fig_data, + curve_by, + var_x=var_x, + var_y=var_y, + max_x=max_x, + bin_x=bin_x, + log_y=log_y, + dry_run=dry_run, + ) + + def plot( output_dir: Path, fig_dir: Path, @@ -105,48 +152,23 @@ def plot( fig_dir.mkdir(parents=True, exist_ok=True) with ProcessPoolExecutor() as pool: - tasks = list[Future[None]]() - - for fig_group, fig_data in full_groupby( - all_data, - key=lambda item: tuple((k, str(item[k])) for k in fig_by), - ): - fig_group = tuple(fig_group) - - fig_path = fig_dir / ( - "-".join( - ( - "FIGURE-", - *(f"{k}={v}" for k, v in fig_group), - ) - ) - .replace("/", "_") - .replace("..", "__") # Sanitize - + ".png" - ) - fig_title = ( - ", ".join(f"{k}={v}" for k, v in fig_group) - if fig_group - else "(All data)" - ) - - task = pool.submit( - _plot_fig, - fig_path, - fig_title, - fig_data, - curve_by, + pool.map( + partial( + _plot_fig_by_group, + fig_dir, + curve_by=curve_by, var_x=var_x, var_y=var_y, max_x=max_x, bin_x=bin_x, log_y=log_y, dry_run=dry_run, - ) - tasks.append(task) - - for f in tasks: - f.result() + ), + full_groupby( + all_data, + key=lambda item: tuple((k, str(item[k])) for k in fig_by), + ), + ) def main(): From 4af1e1af74d50a61ab4cf32c6d7565b0a071134e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 09:45:20 +0000 Subject: [PATCH 14/48] Don't silently fail Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 23 ++++++++++++++++++----- 1 file changed, 18 
insertions(+), 5 deletions(-)

diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index d268d58b6d87..bdc0bbefea76 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -35,11 +35,17 @@ def _plot_fig(
     print("[BEGIN FIGURE]")
     print(f"Output file: {fig_path}")
 
-    if dry_run:
-        print("[END FIGURE]")
-        return
-
     df = pd.DataFrame.from_records(fig_data)
+    if var_x not in df.columns:
+        raise ValueError(
+            f"Cannot find {var_x=!r} in parameter sweep results. "
+            f"Available variables: {df.columns}"
+        )
+    if var_y not in df.columns:
+        raise ValueError(
+            f"Cannot find {var_y=!r} in parameter sweep results. "
+            f"Available variables: {df.columns}"
+        )
 
     if max_x is not None:
         df = df[df[var_x] <= max_x]
@@ -47,6 +53,10 @@ def _plot_fig(
     if bin_x is not None:
         df[var_x] = df[var_x] // bin_x * bin_x
 
+    if dry_run:
+        print("[END FIGURE]")
+        return
+
     if len(curve_by) <= 3:
         hue, style, size, *_ = (*curve_by, None, None)
         ax = sns.lineplot(
@@ -152,7 +162,7 @@ def plot(
     fig_dir.mkdir(parents=True, exist_ok=True)
 
     with ProcessPoolExecutor() as pool:
-        pool.map(
+        out = pool.map(
             partial(
                 _plot_fig_by_group,
                 fig_dir,
@@ -170,6 +180,9 @@ def plot(
             ),
         )
 
+        # Collect the results
+        all(out)
+
 
 def main():

From 7d82607ea758009392a82452f8652b9ab2e1427b Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 09:45:53 +0000
Subject: [PATCH 15/48] Pretty

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/sweep/plot.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index bdc0bbefea76..b98dbf801214 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -39,12 +39,12 @@ def _plot_fig(
     if var_x not in df.columns:
         raise ValueError(
             f"Cannot find {var_x=!r} in parameter sweep results. "
-            f"Available variables: {df.columns}"
+            f"Available variables: {df.columns.tolist()}"
         )
     if var_y not in df.columns:
         raise ValueError(
             f"Cannot find {var_y=!r} in parameter sweep results. "
-            f"Available variables: {df.columns}"
+            f"Available variables: {df.columns.tolist()}"
         )

From 8150f441b075d0166958ce297189e61992ebf192 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 20 Oct 2025 12:22:39 +0000
Subject: [PATCH 16/48] Fix nested

Signed-off-by: DarkLight1337
---
 vllm/benchmarks/sweep/param_sweep.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py
index bddf0b98ae29..90006e02ba6a 100644
--- a/vllm/benchmarks/sweep/param_sweep.py
+++ b/vllm/benchmarks/sweep/param_sweep.py
@@ -40,6 +40,14 @@ def __or__(self, other: dict[str, Any]):
         return type(self)(super().__or__(other))
 
     # In JSON, we prefer "_"
     def _iter_param_key_candidates(self, param_key: str):
+        # Inner config arguments are not converted by the CLI
+        if "." in param_key:
+            prefix, rest = param_key.split(".", 1)
+            for prefix_candidate in self._iter_param_key_candidates(prefix):
+                yield prefix_candidate + "." 
+ rest + + return + yield param_key yield param_key.replace("-", "_") yield param_key.replace("_", "-") From 6ace5b2bae67f06d0893ace1164fab4889f46e68 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 12:55:38 +0000 Subject: [PATCH 17/48] Raise error if no data found Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index b98dbf801214..34284b0ec8bd 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -159,6 +159,9 @@ def plot( for run_data in _json_load_bytes(path) ] + if not all_data: + raise ValueError(f"Did not find any parameter sweep results under {output_dir}") + fig_dir.mkdir(parents=True, exist_ok=True) with ProcessPoolExecutor() as pool: From b3eb7cdb74f44b9995536de06287efeb06e58872 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 13:06:49 +0000 Subject: [PATCH 18/48] Show the problematic data item Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 34284b0ec8bd..66093cb32caf 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -19,6 +19,13 @@ def _json_load_bytes(path: Path) -> list[dict[str, object]]: return json.load(f) +def _get_metric(run_data: dict[str, object], metric_key: str): + try: + return run_data[metric_key] + except KeyError as exc: + raise ValueError(f"Cannot find metric {metric_key!r} in {run_data=}") from exc + + def _plot_fig( fig_path: Path, fig_title: str, @@ -179,7 +186,7 @@ def plot( ), full_groupby( all_data, - key=lambda item: tuple((k, str(item[k])) for k in fig_by), + key=lambda item: tuple((k, str(_get_metric(item, k))) for k in fig_by), ), ) From 8154e084b50277c5fa453031037e0a92418ac2d3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 13:19:06 +0000 Subject: [PATCH 19/48] Convert to string first Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 66093cb32caf..4783a34e570d 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -76,7 +76,7 @@ def _plot_fig( markers=True, ) else: - df["params"] = df[list(curve_by)].agg("-".join, axis=1) + df["params"] = df[list(curve_by)].astype(str).agg("-".join, axis=1) ax = sns.lineplot( df, x=var_x, From d9fcb097e7ed5a0bb5fc70c41b490a291daba710 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 13:30:55 +0000 Subject: [PATCH 20/48] Be more clear Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 4783a34e570d..b722d2b23ffb 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -76,7 +76,11 @@ def _plot_fig( markers=True, ) else: - df["params"] = df[list(curve_by)].astype(str).agg("-".join, axis=1) + df["params"] = pd.concat( + [k + "=" + df[k].astype(str) for k in curve_by], + axis=1, + ).agg("-".join, axis=1) + ax = sns.lineplot( df, x=var_x, From 9c0e9faf3359c39281bf37a05dc3963ac646fe07 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:09:12 +0000 Subject: [PATCH 21/48] Use seaborn grid Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py 
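A minimal sketch (toy data, hypothetical column names) of the curve-label idiom used in the two patches above; the `astype(str)` conversion is what keeps the row-wise join from failing on numeric columns:

```python
import pandas as pd

# Toy frame standing in for parameter sweep results; the column names are
# illustrative, not taken from an actual run.
df = pd.DataFrame(
    {
        "api_server_count": [1, 2],  # numeric, hence astype(str) below
        "max_num_batched_tokens": [2048, 8192],
    }
)
curve_by = ["api_server_count", "max_num_batched_tokens"]

# Each key becomes a "key=value" string column; the columns are then joined
# row-wise into one readable label per data point.
labels = pd.concat(
    [k + "=" + df[k].astype(str) for k in curve_by],
    axis=1,
).agg("-".join, axis=1)

print(labels.tolist())
# ['api_server_count=1-max_num_batched_tokens=2048',
#  'api_server_count=2-max_num_batched_tokens=8192']
```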
| 244 +++++++++++++++++++++------------- 1 file changed, 151 insertions(+), 93 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index b722d2b23ffb..f2dfb6ee1c73 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -2,14 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json -from collections.abc import Iterable from concurrent.futures import ProcessPoolExecutor from functools import partial from pathlib import Path +from types import TracebackType import matplotlib.pyplot as plt import pandas as pd import seaborn as sns +from typing_extensions import Self from vllm.utils.collections import full_groupby @@ -26,10 +27,51 @@ def _get_metric(run_data: dict[str, object], metric_key: str): raise ValueError(f"Cannot find metric {metric_key!r} in {run_data=}") from exc +def _get_group(run_data: dict[str, object], group_keys: list[str]): + return tuple((k, str(_get_metric(run_data, k))) for k in group_keys) + + +def _get_fig_path( + fig_dir: Path, + group: tuple[tuple[str, str], ...], +): + return fig_dir / ( + "-".join( + ( + "FIGURE-", + *(f"{k}={v}" for k, v in group), + ) + ) + .replace("/", "_") + .replace("..", "__") # Sanitize + + ".png" + ) + + +def _get_fig_title(group: tuple[tuple[str, str], ...]): + return ", ".join(f"{k}={v}" for k, v in group) if group else "(All)" + + +class DummyExecutor: + map = map + + def __enter__(self) -> Self: + return self + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + return None + + def _plot_fig( - fig_path: Path, - fig_title: str, - fig_data: list[dict[str, object]], + fig_dir: Path, + fig_group_data: tuple[tuple[tuple[str, str], ...], list[dict[str, object]]], + row_by: list[str], + col_by: list[str], curve_by: list[str], *, var_x: str, @@ -39,10 +81,31 @@ def _plot_fig( log_y: bool, dry_run: bool, ): + fig_group, fig_data = fig_group_data + + row_groups = full_groupby( + fig_data, + key=lambda item: _get_group(item, row_by), + ) + num_rows = len(row_groups) + num_cols = max( + len(full_groupby(row_data, key=lambda item: _get_group(item, col_by))) + for _, row_data in row_groups + ) + + fig_path = _get_fig_path(fig_dir, fig_group) + print("[BEGIN FIGURE]") + print(f"Group: {dict(fig_group)}") + print(f"Grid: {num_rows} rows x {num_cols} cols") print(f"Output file: {fig_path}") + if dry_run: + print("[END FIGURE]") + return + df = pd.DataFrame.from_records(fig_data) + if var_x not in df.columns: raise ValueError( f"Cannot find {var_x=!r} in parameter sweep results. 
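A short illustration of why the `DummyExecutor` above can stand in for a `ProcessPoolExecutor`: the builtin `map` is a type rather than a function, so the class attribute is not bound as an instance method and `executor.map(f, xs)` is exactly `map(f, xs)`. Since the builtin `map` is lazy, the results still have to be drained, hence the `all(...)` later in the file. A sketch, assuming the module from this series is importable:

```python
from vllm.benchmarks.sweep.plot import DummyExecutor

with DummyExecutor() as executor:
    # No worker processes are spawned; map() here is the lazy builtin,
    # so nothing runs until the iterator is consumed.
    results = executor.map(lambda x: x * 2, [1, 2, 3])
    print(list(results))  # [2, 4, 6]
```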
" @@ -60,14 +123,35 @@ def _plot_fig( if bin_x is not None: df[var_x] = df[var_x] // bin_x * bin_x - if dry_run: - print("[END FIGURE]") - return + df["row_group"] = ( + pd.concat( + [k + "=" + df[k].astype(str) for k in row_by], + axis=1, + ).agg("-".join, axis=1) + if row_by + else "(All)" + ) + + df["col_group"] = ( + pd.concat( + [k + "=" + df[k].astype(str) for k in col_by], + axis=1, + ).agg("-".join, axis=1) + if col_by + else "(All)" + ) + + g = sns.FacetGrid(df, row="row_group", col="col_group") + + g.set_titles("{row_name},{col_name}") + + if log_y: + g.set(yscale="log") if len(curve_by) <= 3: hue, style, size, *_ = (*curve_by, None, None) - ax = sns.lineplot( - df, + g.map_dataframe( + sns.lineplot, x=var_x, y=var_y, hue=hue, @@ -76,85 +160,37 @@ def _plot_fig( markers=True, ) else: - df["params"] = pd.concat( - [k + "=" + df[k].astype(str) for k in curve_by], - axis=1, - ).agg("-".join, axis=1) + df["curve_group"] = ( + pd.concat( + [k + "=" + df[k].astype(str) for k in curve_by], + axis=1, + ).agg("-".join, axis=1) + if curve_by + else "(All)" + ) - ax = sns.lineplot( - df, + g.map_dataframe( + sns.lineplot, x=var_x, y=var_y, - hue="params", + hue="curve_group", markers=True, ) - ax.set_title(fig_title) - - if log_y: - ax.set_yscale("log") - - sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) - - fig = ax.get_figure() - assert fig is not None + g.add_legend() - fig.tight_layout() - fig.savefig(fig_path) - plt.close(fig) + g.savefig(fig_path) + plt.close(g.figure) print("[END FIGURE]") -def _plot_fig_by_group( - fig_dir: Path, - fig_group_data: tuple[Iterable[tuple[str, str]], list[dict[str, object]]], - curve_by: list[str], - *, - var_x: str, - var_y: str, - max_x: float | None, - bin_x: float | None, - log_y: bool, - dry_run: bool, -): - fig_group, fig_data = fig_group_data - - fig_group = tuple(fig_group) - - fig_path = fig_dir / ( - "-".join( - ( - "FIGURE-", - *(f"{k}={v}" for k, v in fig_group), - ) - ) - .replace("/", "_") - .replace("..", "__") # Sanitize - + ".png" - ) - fig_title = ( - ", ".join(f"{k}={v}" for k, v in fig_group) if fig_group else "(All data)" - ) - - return _plot_fig( - fig_path, - fig_title, - fig_data, - curve_by, - var_x=var_x, - var_y=var_y, - max_x=max_x, - bin_x=bin_x, - log_y=log_y, - dry_run=dry_run, - ) - - def plot( output_dir: Path, fig_dir: Path, fig_by: list[str], + row_by: list[str], + col_by: list[str], curve_by: list[str], *, var_x: str, @@ -175,27 +211,31 @@ def plot( fig_dir.mkdir(parents=True, exist_ok=True) - with ProcessPoolExecutor() as pool: - out = pool.map( - partial( - _plot_fig_by_group, - fig_dir, - curve_by=curve_by, - var_x=var_x, - var_y=var_y, - max_x=max_x, - bin_x=bin_x, - log_y=log_y, - dry_run=dry_run, - ), - full_groupby( - all_data, - key=lambda item: tuple((k, str(_get_metric(item, k))) for k in fig_by), - ), - ) + fig_groups = full_groupby( + all_data, + key=lambda item: _get_group(item, fig_by), + ) - # Collect the results - all(out) + with DummyExecutor() if len(fig_groups) <= 1 else ProcessPoolExecutor() as executor: + # Resolve the iterable to ensure that the workers are run + all( + executor.map( + partial( + _plot_fig, + fig_dir, + row_by=row_by, + col_by=col_by, + curve_by=curve_by, + var_x=var_x, + var_y=var_y, + max_x=max_x, + bin_x=bin_x, + log_y=log_y, + dry_run=dry_run, + ), + fig_groups, + ) + ) def main(): @@ -223,6 +263,20 @@ def main(): help="A comma-separated list of variables, such that a separate curve " "is created for each combination of these variables.", ) + parser.add_argument( + 
"--col-by", + type=str, + default="", + help="A comma-separated list of variables, such that a separate column " + "is created for each combination of these variables.", + ) + parser.add_argument( + "--row-by", + type=str, + default="", + help="A comma-separated list of variables, such that a separate row " + "is created for each combination of these variables.", + ) parser.add_argument( "--fig-by", type=str, @@ -269,12 +323,16 @@ def main(): args = parser.parse_args() curve_by = [] if not args.curve_by else args.curve_by.split(",") + row_by = [] if not args.row_by else args.row_by.split(",") + col_by = [] if not args.col_by else args.col_by.split(",") fig_by = [] if not args.fig_by else args.fig_by.split(",") plot( output_dir=Path(args.OUTPUT_DIR), fig_dir=Path(args.fig_dir or args.OUTPUT_DIR), fig_by=fig_by, + row_by=row_by, + col_by=col_by, curve_by=curve_by, var_x=args.var_x, var_y=args.var_y, From f1810ccc5232c933aae938c121db530e0dc991bb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:15:54 +0000 Subject: [PATCH 22/48] Clean up Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index f2dfb6ee1c73..f722c19d95a6 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -143,13 +143,20 @@ def _plot_fig( g = sns.FacetGrid(df, row="row_group", col="col_group") - g.set_titles("{row_name},{col_name}") + if row_by and col_by: + g.set_titles("{row_name},{col_name}") + elif row_by: + g.set_titles("{row_name}") + elif col_by: + g.set_titles("{col_name}") + else: + g.set_titles("") if log_y: g.set(yscale="log") if len(curve_by) <= 3: - hue, style, size, *_ = (*curve_by, None, None) + hue, style, size, *_ = (*curve_by, None, None, None) g.map_dataframe( sns.lineplot, x=var_x, @@ -259,7 +266,7 @@ def main(): parser.add_argument( "--curve-by", type=str, - required=True, + default=None, help="A comma-separated list of variables, such that a separate curve " "is created for each combination of these variables.", ) From dac464b6665fbc054baf928513d9c612b3e65162 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:17:37 +0000 Subject: [PATCH 23/48] Clean Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index f722c19d95a6..192709abe015 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -127,7 +127,7 @@ def _plot_fig( pd.concat( [k + "=" + df[k].astype(str) for k in row_by], axis=1, - ).agg("-".join, axis=1) + ).agg("\n".join, axis=1) if row_by else "(All)" ) @@ -136,7 +136,7 @@ def _plot_fig( pd.concat( [k + "=" + df[k].astype(str) for k in col_by], axis=1, - ).agg("-".join, axis=1) + ).agg("\n".join, axis=1) if col_by else "(All)" ) @@ -144,7 +144,7 @@ def _plot_fig( g = sns.FacetGrid(df, row="row_group", col="col_group") if row_by and col_by: - g.set_titles("{row_name},{col_name}") + g.set_titles("{row_name}\n{col_name}") elif row_by: g.set_titles("{row_name}") elif col_by: @@ -157,6 +157,7 @@ def _plot_fig( if len(curve_by) <= 3: hue, style, size, *_ = (*curve_by, None, None, None) + g.map_dataframe( sns.lineplot, x=var_x, @@ -171,7 +172,7 @@ def _plot_fig( pd.concat( [k + "=" + df[k].astype(str) for k in curve_by], axis=1, - ).agg("-".join, axis=1) + ).agg("\n".join, axis=1) if curve_by else "(All)" ) From 
73c911b28be6b803a65c4c463c478a5f433666b7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:19:59 +0000 Subject: [PATCH 24/48] TODO Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 192709abe015..9ed3e4ca09e5 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -31,10 +31,7 @@ def _get_group(run_data: dict[str, object], group_keys: list[str]): return tuple((k, str(_get_metric(run_data, k))) for k in group_keys) -def _get_fig_path( - fig_dir: Path, - group: tuple[tuple[str, str], ...], -): +def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...]): return fig_dir / ( "-".join( ( @@ -48,10 +45,6 @@ def _get_fig_path( ) -def _get_fig_title(group: tuple[tuple[str, str], ...]): - return ", ".join(f"{k}={v}" for k, v in group) if group else "(All)" - - class DummyExecutor: map = map @@ -117,6 +110,8 @@ def _plot_fig( f"Available variables: {df.columns.tolist()}" ) + # TODO: Support syntax + # e.g. request_rate<=1024%2 means max of 1024 and bin size of 2 if max_x is not None: df = df[df[var_x] <= max_x] From eef9c40907382f5e94268a1e6a79c24b14f717d3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:24:20 +0000 Subject: [PATCH 25/48] Clean Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 +- vllm/benchmarks/sweep/serve.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 9ed3e4ca09e5..ffc15af46021 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -35,7 +35,7 @@ def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...]): return fig_dir / ( "-".join( ( - "FIGURE-", + "FIGURE" + ("-" if group else ""), *(f"{k}={v}" for k, v in group), ) ) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index 5751e326578b..5599a5dbc78b 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -108,9 +108,9 @@ def _get_comb_base_path( ): return output_dir / "-".join( ( - "SERVE-", + "SERVE" + ("-" if serve_comb else ""), serve_comb.as_text(sep="-"), - "BENCH-", + "BENCH" + ("-" if bench_comb else ""), bench_comb.as_text(sep="-"), ) ).replace("/", "_").replace("..", "__") # Sanitize @@ -228,9 +228,9 @@ def _get_sla_base_path( ): return output_dir / "-".join( ( - "SERVE-", + "SERVE" + ("-" if serve_comb else ""), serve_comb.as_text(sep="-"), - "BENCH-", + "BENCH" + ("-" if bench_comb else ""), bench_comb.as_text(sep="-"), ) ).replace("/", "_").replace("..", "__") # Sanitize From aa9615149480f07cb64d69d549b11bfe858ad3fb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:47:35 +0000 Subject: [PATCH 26/48] Generalized filter and binning Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 5 +- vllm/benchmarks/sweep/plot.py | 201 +++++++++++++++++++++++++++----- 2 files changed, 172 insertions(+), 34 deletions(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index ae8582113f26..6ff475422e9f 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -1075,9 +1075,10 @@ Example command: ```bash python vllm/benchmarks/sweep/plot.py benchmarks/results/ \ --var-x max_concurrency \ - --max-x 1024 \ --curve-by api_server_count,max_num_batched_tokens \ - --file-by random_input_len,random_output_len + 
--row-by random_input_len \ + --col-by random_output_len \ + --filter-by 'max_concurrency<=1024' ``` !!! tip diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index ffc15af46021..53887c82ee49 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json +from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor from functools import partial from pathlib import Path @@ -10,11 +11,130 @@ import matplotlib.pyplot as plt import pandas as pd import seaborn as sns -from typing_extensions import Self +from typing_extensions import Self, override from vllm.utils.collections import full_groupby +class PlotFilterBase(ABC): + @classmethod + def parse_str(cls, s: str): + for op_key in PLOT_FILTERS: + if op_key in s: + key, value = s.split(op_key) + return PLOT_FILTERS[op_key](key, float(value.removeprefix(op_key))) + else: + raise ValueError( + f"Invalid operator for plot filter '{s}'. " + f"Valid operators are: {set(PLOT_FILTERS)}", + ) + + def __init__(self, var: str, target: float) -> None: + super().__init__() + + self.var = var + self.target = target + + @abstractmethod + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + """Applies this filter to a DataFrame.""" + raise NotImplementedError + + +class PlotLessThan(PlotFilterBase): + @override + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + return df[df[self.var] < self.target] + + +class PlotLessThanOrEqual(PlotFilterBase): + @override + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + return df[df[self.var] <= self.target] + + +class PlotGreaterThan(PlotFilterBase): + @override + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + return df[df[self.var] > self.target] + + +class PlotGreaterThanOrEqual(PlotFilterBase): + @override + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + return df[df[self.var] >= self.target] + + +# NOTE: The ordering is important! Match longer op_keys first +PLOT_FILTERS: dict[str, type[PlotFilterBase]] = { + "<=": PlotLessThanOrEqual, + ">=": PlotGreaterThanOrEqual, + "<": PlotLessThan, + ">": PlotGreaterThan, +} + + +class PlotFilters(list[PlotFilterBase]): + @classmethod + def parse_str(cls, s: str): + if not s: + return cls() + + return cls(PlotFilterBase.parse_str(e) for e in s.split(",")) + + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + for item in self: + df = item.apply(df) + + return df + + +class PlotBinner: + @classmethod + def parse_str(cls, s: str): + for op_key in PLOT_BINNERS: + if op_key in s: + key, value = s.split(op_key) + return PLOT_BINNERS[op_key](key, float(value.removeprefix(op_key))) + else: + raise ValueError( + f"Invalid operator for plot binner '{s}'. 
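A quick sanity check of the comparison parser defined above, runnable once this module is importable. Because `PLOT_FILTERS` matches `<=` before `<`, `'max_concurrency<=1024'` parses as a single less-than-or-equal filter rather than `<` with the unparseable target `'=1024'`:

```python
import pandas as pd

from vllm.benchmarks.sweep.plot import PlotFilters

df = pd.DataFrame({"max_concurrency": [256, 1024, 4096]})

filters = PlotFilters.parse_str("max_concurrency<=1024")
print(filters.apply(df)["max_concurrency"].tolist())  # [256, 1024]
```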
" + f"Valid operators are: {set(PLOT_BINNERS)}", + ) + + def __init__(self, var: str, bin_size: float) -> None: + super().__init__() + + self.var = var + self.bin_size = bin_size + + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + """Applies this binner to a DataFrame.""" + df = df.copy() + df[self.var] = df[self.var] // self.bin_size * self.bin_size + return df + + +PLOT_BINNERS: dict[str, type[PlotBinner]] = { + "@": PlotBinner, +} + + +class PlotBinners(list[PlotBinner]): + @classmethod + def parse_str(cls, s: str): + if not s: + return cls() + + return cls(PlotBinner.parse_str(e) for e in s.split(",")) + + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + for item in self: + df = item.apply(df) + + return df + + def _json_load_bytes(path: Path) -> list[dict[str, object]]: with path.open("rb") as f: return json.load(f) @@ -69,9 +189,10 @@ def _plot_fig( *, var_x: str, var_y: str, - max_x: float | None, - bin_x: float | None, - log_y: bool, + filter_by: PlotFilters, + bin_by: PlotBinners, + scale_x: str | None, + scale_y: str | None, dry_run: bool, ): fig_group, fig_data = fig_group_data @@ -110,13 +231,8 @@ def _plot_fig( f"Available variables: {df.columns.tolist()}" ) - # TODO: Support syntax - # e.g. request_rate<=1024%2 means max of 1024 and bin size of 2 - if max_x is not None: - df = df[df[var_x] <= max_x] - - if bin_x is not None: - df[var_x] = df[var_x] // bin_x * bin_x + df = filter_by.apply(df) + df = bin_by.apply(df) df["row_group"] = ( pd.concat( @@ -147,8 +263,10 @@ def _plot_fig( else: g.set_titles("") - if log_y: - g.set(yscale="log") + if scale_x: + g.set(xscale=scale_x) + if scale_y: + g.set(yscale=scale_y) if len(curve_by) <= 3: hue, style, size, *_ = (*curve_by, None, None, None) @@ -198,9 +316,10 @@ def plot( *, var_x: str, var_y: str, - max_x: float | None, - bin_x: float | None, - log_y: bool, + filter_by: PlotFilters, + bin_by: PlotBinners, + scale_x: str | None, + scale_y: str | None, dry_run: bool, ): all_data = [ @@ -231,9 +350,10 @@ def plot( curve_by=curve_by, var_x=var_x, var_y=var_y, - max_x=max_x, - bin_x=bin_x, - log_y=log_y, + filter_by=filter_by, + bin_by=bin_by, + scale_x=scale_x, + scale_y=scale_y, dry_run=dry_run, ), fig_groups, @@ -300,22 +420,38 @@ def main(): help="The variable for the y-axis", ) parser.add_argument( - "--max-x", - type=float, - default=None, - help="The maximum value to plot for the x-axis.", + "--filter-by", + type=str, + default="", + help="A comma-separated list of statements indicating values to filter by. " + "This is useful to remove outliers. " + "Example: `max_concurrency<1000,max_num_batched_tokens<=4096` means " + "plot only the points where `max_concurrency` is less than 1000 and " + "`max_num_batched_tokens` is no greater than 4096.", ) parser.add_argument( - "--bin-x", - type=float, + "--bin-by", + type=str, + default="", + help="A comma-separated list of statements indicating values to bin by. " + "This is useful to avoid plotting points that are too close together. " + "Example: `request_throughput%1` means " + "use a bin size of 1 for the `request_throughput` variable.", + ) + parser.add_argument( + "--scale-x", + type=str, default=None, - help="Group together points with x-axis values in the same bin " - "to reduce noise.", + help="The scale to use for the x-axis. " + "Currently only accepts string values such as 'log' and 'sqrt'. 
" + "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", ) parser.add_argument( - "--log-y", + "--scale-y", action="store_true", - help="Use logarithmic scaling for the y-axis.", + help="The scale to use for the y-axis. " + "Currently only accepts string values such as 'log' and 'sqrt'. " + "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", ) parser.add_argument( "--dry-run", @@ -339,9 +475,10 @@ def main(): curve_by=curve_by, var_x=args.var_x, var_y=args.var_y, - max_x=args.max_x, - bin_x=args.bin_x, - log_y=args.log_y, + filter_by=PlotFilters.parse_str(args.filter_by), + bin_by=PlotBinners.parse_str(args.bin_by), + scale_x=args.scale_x, + scale_y=args.scale_y, dry_run=args.dry_run, ) From 0b984965a17d36c46ab480e75c699ca01d2e51fe Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:49:08 +0000 Subject: [PATCH 27/48] Remove old script Signed-off-by: DarkLight1337 --- vllm/benchmarks/serve_multi.py | 1157 -------------------------------- 1 file changed, 1157 deletions(-) delete mode 100644 vllm/benchmarks/serve_multi.py diff --git a/vllm/benchmarks/serve_multi.py b/vllm/benchmarks/serve_multi.py deleted file mode 100644 index e8524473aedd..000000000000 --- a/vllm/benchmarks/serve_multi.py +++ /dev/null @@ -1,1157 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import contextlib -import json -import math -import os -import shlex -import signal -import subprocess -from abc import ABC, abstractmethod -from datetime import datetime -from pathlib import Path -from typing import Literal, get_args - -import pandas as pd -import requests -import seaborn as sns -from typing_extensions import assert_never, override - -_BAD_PARAMS_TYPE_MSG = ( - "The parameters to vary should be expressed as a JSON list of dictionaries." 
-) - - -def _parse_params(params: list[dict[str, object]]): - if not isinstance(params, list): - raise TypeError(f"{_BAD_PARAMS_TYPE_MSG} Found JSON type {type(params)}") - - for comb in params: - if not isinstance(comb, dict): - raise TypeError(f"{_BAD_PARAMS_TYPE_MSG} Found item type {type(comb)}") - - return params - - -class SLACriterionBase(ABC): - def __init__(self, target: float) -> None: - super().__init__() - - self.target = target - - @abstractmethod - def validate(self, actual: float) -> bool: - """Return `True` if this criterion is met; otherwise `False`.""" - raise NotImplementedError - - @abstractmethod - def format_cond(self, lhs: str) -> str: - raise NotImplementedError - - def print_and_validate( - self, - metrics: dict[str, float], - metrics_key: str, - ) -> bool: - metric = metrics[metrics_key] - result = self.validate(metric) - - cond = self.format_cond(f"{metrics_key} = {metric:.2f}") - print(f"Validating SLA: {cond} | " + ("PASSED" if result else "FAILED")) - - return result - - -class SLALessThan(SLACriterionBase): - @override - def validate(self, actual: float) -> bool: - return actual < self.target - - @override - def format_cond(self, lhs: str) -> str: - return f"{lhs}<{self.target:.2f}" - - -class SLALessThanOrEqual(SLACriterionBase): - @override - def validate(self, actual: float) -> bool: - return actual <= self.target - - @override - def format_cond(self, lhs: str) -> str: - return f"{lhs}<={self.target:.2f}" - - -class SLAGreaterThan(SLACriterionBase): - @override - def validate(self, actual: float) -> bool: - return actual > self.target - - @override - def format_cond(self, lhs: str) -> str: - return f"{lhs}>{self.target:.2f}" - - -class SLAGreaterThanOrEqual(SLACriterionBase): - @override - def validate(self, actual: float) -> bool: - return actual >= self.target - - @override - def format_cond(self, lhs: str) -> str: - return f"{lhs}>={self.target:.2f}" - - -# NOTE: The ordering is important! Match longer op_keys first -SLA_CRITERIA: dict[str, type[SLACriterionBase]] = { - "<=": SLALessThanOrEqual, - ">=": SLAGreaterThanOrEqual, - "<": SLALessThan, - ">": SLAGreaterThan, -} - - -def _parse_sla_item(sla_item: dict[str, str]): - sla_criteria: dict[str, SLACriterionBase] = {} - - for metric_key, metric_value in sla_item.items(): - for op_key in SLA_CRITERIA: - if metric_value.startswith(op_key): - sla_criteria[metric_key] = SLA_CRITERIA[op_key]( - float(metric_value.removeprefix(op_key)) - ) - break - else: - raise ValueError( - f"Invalid operator for SLA constraint '{metric_key}={metric_value}'. 
" - f"Valid operators are: {set(SLA_CRITERIA)}", - ) - - return sla_criteria - - -def _parse_sla(sla: list[dict[str, str]]): - return [_parse_sla_item(item) for item in sla] - - -# In JSON, we prefer "_" -def _iter_param_key_candidates(param_key: str): - yield param_key - yield param_key.replace("-", "_") - yield param_key.replace("_", "-") - - -# In CLI, we prefer "-" -def _iter_cmd_key_candidates(param_key: str): - for k in reversed(tuple(_iter_param_key_candidates(param_key))): - yield "--" + k - - -def _normalize_cmd_key(param_key: str): - return next(_iter_cmd_key_candidates(param_key)) - - -def _override_args(cmd: list[str], params: dict[str, object]): - cmd = list(cmd) - - for k, v in params.items(): - for k_candidate in _iter_cmd_key_candidates(k): - try: - k_idx = cmd.index(k_candidate) - - if isinstance(v, bool): - cmd[k_idx] = _normalize_cmd_key(k if v else "no-" + k) - else: - cmd[k_idx + 1] = str(v) - - break - except ValueError: - continue - else: - if isinstance(v, bool): - cmd.append(_normalize_cmd_key(k if v else "no-" + k)) - else: - cmd.extend([_normalize_cmd_key(k), str(v)]) - - return cmd - - -class ServerWrapper: - def __init__( - self, - server_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - ) -> None: - super().__init__() - - self.server_cmd = server_cmd - self.after_bench_cmd = after_bench_cmd - self.show_stdout = show_stdout - - def run_subcommand(self, cmd: list[str]): - return subprocess.run( - cmd, - stdout=None if self.show_stdout else subprocess.DEVNULL, - check=True, - ) - - def after_bench(self) -> None: - if not self.after_bench_cmd: - self.reset_caches() - return - - self.run_subcommand(self.after_bench_cmd) - - def _get_vllm_server_address(self) -> str: - server_cmd = self.server_cmd - - for host_key in ("--host",): - if host_key in server_cmd: - host = server_cmd[server_cmd.index(host_key) + 1] - break - else: - host = "localhost" - - for port_key in ("-p", "--port"): - if port_key in server_cmd: - port = int(server_cmd[server_cmd.index(port_key) + 1]) - break - else: - port = 8000 # The default value in vllm serve - - return f"http://{host}:{port}" - - def reset_caches(self) -> None: - server_cmd = self.server_cmd - - # Use `.endswith()` to match `/bin/...` - if server_cmd[0].endswith("vllm"): - server_address = self._get_vllm_server_address() - print(f"Resetting caches at {server_address}") - - res = requests.post(f"{server_address}/reset_prefix_cache") - res.raise_for_status() - - res = requests.post(f"{server_address}/reset_mm_cache") - res.raise_for_status() - elif server_cmd[0].endswith("infinity_emb"): - if "--vector-disk-cache" in server_cmd: - raise NotImplementedError( - "Infinity server uses caching but does not expose a method " - "to reset the cache" - ) - else: - raise NotImplementedError( - f"No implementation of `reset_caches` for `{server_cmd[0]}` server. " - "Please specify a custom command via `--after-bench-cmd`." 
- ) - - -@contextlib.contextmanager -def _run_server( - serve_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_overrides: dict[str, object], - dry_run: bool, -): - server_cmd = _override_args(serve_cmd, serve_overrides) - - print("[BEGIN SERVER]") - print(f"Server overrides: {serve_overrides}") - print(f"Server command: {server_cmd}") - - if dry_run: - yield None - print("[END SERVER]") - return - - # Create new process group for clean termination - server_process = subprocess.Popen( - server_cmd, - start_new_session=True, - stdout=None if show_stdout else subprocess.DEVNULL, - # Need VLLM_SERVER_DEV_MODE=1 for `_reset_caches` - env={**os.environ, "VLLM_SERVER_DEV_MODE": "1"}, - ) - - try: - yield ServerWrapper( - server_cmd, - after_bench_cmd, - show_stdout=show_stdout, - ) - finally: - if server_process.poll() is None: - # In case only some processes have been terminated - with contextlib.suppress(ProcessLookupError): - # We need to kill both API Server and Engine processes - os.killpg(os.getpgid(server_process.pid), signal.SIGKILL) - - print("[END SERVER]") - - -def _run_benchmark( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_overrides: dict[str, object], - bench_overrides: dict[str, object], - run_number: int, - output_path: Path, - dry_run: bool, -): - benchmark_cmd = [ - *_override_args(bench_cmd, bench_overrides), - "--save-result", - "--result-dir", - str(output_path.parent), - "--result-filename", - output_path.name, - ] - - print("[BEGIN BENCHMARK]") - print(f"Benchmark overrides: {bench_overrides}") - print(f"Run Number: {run_number}") - print(f"Benchmark command: {benchmark_cmd}") - print(f"Output file: {output_path}") - - run_data: dict[str, object] - - if output_path.exists(): - print("Found existing results. 
Skipping.") - - with output_path.open("rb") as f: - run_data = json.load(f) - return run_data - - if server is None: - assert dry_run - print("[END BENCHMARK]") - return None - - output_path.parent.mkdir(parents=True, exist_ok=True) - - server.run_subcommand(benchmark_cmd) - server.after_bench() - - with output_path.open("rb") as f: - run_data = json.load(f) - - run_data["run_number"] = run_number - run_data.update(serve_overrides) - - with output_path.open("w") as f: - json.dump(run_data, f, indent=4) - - print("[END BENCHMARK]") - - return run_data - - -def _get_comb_base_path( - output_dir: Path, - serve_comb: dict[str, object], - bench_comb: dict[str, object], -): - return output_dir / "-".join( - ( - "SERVE", - *(f"{k}={v}" for k, v in serve_comb.items()), - "BENCH", - *(f"{k}={v}" for k, v in bench_comb.items()), - ) - ).replace("/", "_").replace("..", "__") # Sanitize - - -def _get_comb_run_path(base_path: Path, run_number: int | None): - if run_number is None: - return base_path / "summary.json" - - return base_path / f"run={run_number}.json" - - -def _comb_needs_server( - serve_comb: dict[str, object], - bench_combs: list[dict[str, object]], - output_dir: Path, -): - for bench_comb in bench_combs: - base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) - if not _get_comb_run_path(base_path, run_number=None).exists(): - return True - - return False - - -def _run_comb( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_comb: dict[str, object], - bench_comb: dict[str, object], - base_path: Path, - num_runs: int, - dry_run: bool, -): - comb_data = list[dict[str, object]]() - - for run_number in range(num_runs): - run_data = _run_benchmark( - server, - bench_cmd, - serve_overrides=serve_comb, - bench_overrides=bench_comb, - run_number=run_number, - output_path=_get_comb_run_path(base_path, run_number), - dry_run=dry_run, - ) - - if run_data is not None: - comb_data.append(run_data) - - if dry_run: - return None - - with _get_comb_run_path(base_path, run_number=None).open("w") as f: - json.dump(comb_data, f, indent=4) - - return comb_data - - -def run_combs( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: list[dict[str, object]], - bench_params: list[dict[str, object]], - output_dir: Path, - num_runs: int, - dry_run: bool, -): - all_data = list[dict[str, object]]() - for serve_comb in serve_params: - with ( - _run_server( - serve_cmd, - after_bench_cmd, - show_stdout=show_stdout, - serve_overrides=serve_comb, - dry_run=dry_run, - ) - if _comb_needs_server(serve_comb, bench_params, output_dir) - else contextlib.nullcontext() - ) as server: - for bench_comb in bench_params: - base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) - - comb_data = _run_comb( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - ) - - if comb_data is not None: - all_data.extend(comb_data) - - if dry_run: - return None - - combined_df = pd.DataFrame.from_records(all_data) - combined_df.to_csv(output_dir / "summary.csv") - - return combined_df - - -def _get_sla_base_path( - output_dir: Path, - serve_comb: dict[str, object], - bench_comb: dict[str, object], -): - return output_dir / "-".join( - ( - "SERVE", - *(f"{k}={v}" for k, v in serve_comb.items()), - "BENCH", - *(f"{k}={v}" for k, v in bench_comb.items()), - ) - ).replace("/", "_").replace("..", "__") # Sanitize - - -def _get_sla_iter_path( - base_path: 
Path, - sla_comb: dict[str, SLACriterionBase], - sla_variable: str, - sla_value: int | None, -): - if sla_value is None: - prefix = "-".join(v.format_cond(k) for k, v in sla_comb.items()) - return base_path / f"SLA-{prefix}.json" - - return base_path / f"{sla_variable}={sla_value}" - - -def _get_sla_run_path(iter_path: Path, run_number: int | None): - if run_number is None: - return iter_path / "summary.json" - - return iter_path / f"run={run_number}.json" - - -def _sla_needs_server( - serve_comb: dict[str, object], - bench_combs: list[dict[str, object]], - sla_combs: list[dict[str, SLACriterionBase]], - sla_variable: str, - output_dir: Path, -): - for bench_comb in bench_combs: - base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) - for sla_comb in sla_combs: - if not _get_sla_iter_path( - base_path, - sla_comb, - sla_variable, - sla_value=None, - ).exists(): - return True - - return False - - -def _run_sla( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_comb: dict[str, object], - bench_comb: dict[str, object], - iter_path: Path, - num_runs: int, - dry_run: bool, -): - iter_data = list[dict[str, object]]() - - for run_number in range(num_runs): - run_data = _run_benchmark( - server, - bench_cmd, - serve_overrides=serve_comb, - bench_overrides=bench_comb, - run_number=run_number, - output_path=_get_sla_run_path(iter_path, run_number), - dry_run=dry_run, - ) - - if run_data is not None: - iter_data.append(run_data) - - if dry_run: - return None - - with _get_sla_run_path(iter_path, run_number=None).open("w") as f: - json.dump(iter_data, f, indent=4) - - return iter_data - - -SLAVariable = Literal["request_rate", "max_concurrency"] - - -def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable): - request_throughput = float(run_data["request_throughput"]) # type: ignore - if sla_variable == "request_rate": - return request_throughput - if sla_variable == "max_concurrency": - mean_latency_ms = float(run_data["mean_e2el_ms"]) # type: ignore - return request_throughput * mean_latency_ms / 1000 - - assert_never(sla_variable) - - -def _estimate_sla_bounds( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_comb: dict[str, object], - bench_comb: dict[str, object], - sla_comb: dict[str, SLACriterionBase], - base_path: Path, - num_runs: int, - dry_run: bool, - sla_variable: SLAVariable, - init_value: int, - max_value: int, -): - sla_data = list[dict[str, object]]() - - max_passing: int = 0 - min_failing: int = 0 - - val: int = init_value - assert val > 0 - - while True: - print(f"Testing {sla_variable}: {val} req/s") - - iter_data = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb={**bench_comb, sla_variable: val}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), - num_runs=num_runs, - dry_run=dry_run, - ) - - assert iter_data is not None - sla_data.extend(iter_data) - - iter_data_mean = { - k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore - for k in sla_comb - } - - sla_results = [ - criterion.print_and_validate(iter_data_mean, k) - for k, criterion in sla_comb.items() - ] - - if all(sla_results): - print("SLA criteria are met.") - max_passing = val - val *= 2 - else: - print("SLA criteria are not met.") - min_failing = val - break - - if val >= max_value: - break - - return sla_data, (max_passing, min_failing) - - -def _find_sla_value( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_comb: dict[str, object], - bench_comb: 
dict[str, object], - sla_comb: dict[str, SLACriterionBase], - base_path: Path, - num_runs: int, - dry_run: bool, - sla_variable: SLAVariable, - min_value: int, - max_value: int, -): - sla_data = list[dict[str, object]]() - - left: int = min_value - right: int = max_value - - while True: - val = (left + right) // 2 - print(f"Testing {sla_variable}: {val} req/s") - - iter_data = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb={**bench_comb, sla_variable: val}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), - num_runs=num_runs, - dry_run=dry_run, - ) - - assert iter_data is not None - sla_data.extend(iter_data) - - iter_data_mean = { - k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore - for k in sla_comb - } - - sla_results = [ - criterion.print_and_validate(iter_data_mean, k) - for k, criterion in sla_comb.items() - ] - - if all(sla_results): - print("SLA criteria are met.") - left = val - else: - print("SLA criteria are not met.") - right = val - - if right - left <= 1: - break - - return sla_data, left - - -def _search_sla( - server: ServerWrapper | None, - bench_cmd: list[str], - *, - serve_comb: dict[str, object], - bench_comb: dict[str, object], - sla_comb: dict[str, SLACriterionBase], - sla_variable: SLAVariable, - sla_inf_value: int = 65536, # The value that represents infinite QPS - base_path: Path, - num_runs: int, - dry_run: bool, -): - print("[SLA START]") - print(f"SLA criteria: {', '.join(v.format_cond(k) for k, v in sla_comb.items())}") - - sla_data_0 = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb={**bench_comb, sla_variable: sla_inf_value}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value), - num_runs=num_runs, - dry_run=dry_run, - ) - if sla_data_0 is None: - assert dry_run - print("Omitting SLA search.") - print("[SLA END]") - return None - - sla_init_value = math.ceil( - sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0) - / len(sla_data_0) - ) - print(f"Initial {sla_variable} to search: {sla_init_value} req/s.") - - sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - sla_variable=sla_variable, - init_value=sla_init_value, - max_value=sla_inf_value, - ) - print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.") - - sla_data_2, sla_value = _find_sla_value( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - sla_variable=sla_variable, - min_value=sla_min, - max_value=sla_max, - ) - - sla_data = sla_data_0 + sla_data_1 + sla_data_2 - print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.") - - with _get_sla_iter_path( - base_path, - sla_comb, - sla_variable, - sla_value=None, - ).open("w") as f: - json.dump(sla_data, f, indent=4) - - print("[SLA END]") - - return sla_data - - -def _plot_throughput_latency_curve( - all_data: list[dict[str, object]], - serve_combs: list[dict[str, object]], - bench_comb: dict[str, object], - output_dir: Path, -): - fig_path = output_dir / "-".join( - ( - "BENCH", - *(f"{k}={v}" for k, v in bench_comb.items()), - ) - ).replace("/", "_").replace("..", "__") # Sanitize - - df = pd.DataFrame.from_records( - [item for item in all_data if all(item[k] == bench_comb[k] for k in bench_comb)] - ) - - # Group together 
points with similar throughput - df["request_throughput"] = df["request_throughput"].round() - - # Preserve the key order using dictionary - all_comb_keys = {k: None for comb in serve_combs for k in comb} - for k in all_comb_keys: - df[k] = df[k].astype(str) - - keys_per_comb = [comb.keys() for comb in serve_combs] - if ( - all(ks == keys_per_comb[0] for ks in keys_per_comb) - and len(keys_per_comb[0]) <= 3 - ): - hue, style, size, *_ = (*keys_per_comb[0], None, None) - ax = sns.lineplot( - df, - x="request_throughput", - y="p99_e2el_ms", - hue=hue, - style=style, - size=size, - markers=True, - ) - else: - df["category"] = df[list(all_comb_keys)].agg("-".join, axis=1) - ax = sns.lineplot( - df, - x="request_throughput", - y="p99_e2el_ms", - hue="category", - markers=True, - ) - - sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) - - fig = ax.get_figure() - assert fig is not None - - fig.tight_layout() - fig.savefig(fig_path) - - -def _plot_throughput_latency_curves( - all_data: list[dict[str, object]], - serve_combs: list[dict[str, object]], - bench_combs: list[dict[str, object]], - output_dir: Path, -): - for bench_comb in bench_combs: - _plot_throughput_latency_curve(all_data, serve_combs, bench_comb, output_dir) - - -def run_slas( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: list[dict[str, object]], - bench_params: list[dict[str, object]], - sla_params: list[dict[str, SLACriterionBase]], - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, -): - if any( - k in bench_comb - for bench_comb in bench_params - for k in _iter_param_key_candidates(sla_variable) - ): - raise ValueError( - f"You should not override `{sla_variable}` in `bench_params` in SLA mode, " - "since it is supposed to be determined automatically." 
- ) - - all_data = list[dict[str, object]]() - for serve_comb in serve_params: - with ( - _run_server( - serve_cmd, - after_bench_cmd, - show_stdout=show_stdout, - serve_overrides=serve_comb, - dry_run=dry_run, - ) - if _sla_needs_server( - serve_comb, - bench_params, - sla_params, - sla_variable, - output_dir, - ) - else contextlib.nullcontext() - ) as server: - for bench_comb in bench_params: - for sla_comb in sla_params: - base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) - - comb_data = _search_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - sla_variable=sla_variable, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - ) - - if comb_data is not None: - all_data.extend(comb_data) - - if dry_run: - return None - - combined_df = pd.DataFrame.from_records(all_data) - combined_df.to_csv(output_dir / "summary.csv") - - _plot_throughput_latency_curves(all_data, serve_params, bench_params, output_dir) - - return combined_df - - -def _run_main( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: list[dict[str, object]], - bench_params: list[dict[str, object]], - sla_params: list[dict[str, SLACriterionBase]], - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, -): - if sla_params: - return run_slas( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, - serve_params=serve_params, - bench_params=bench_params, - sla_params=sla_params, - sla_variable=sla_variable, - output_dir=output_dir, - num_runs=num_runs, - dry_run=dry_run, - ) - - return run_combs( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, - serve_params=serve_params, - bench_params=bench_params, - output_dir=output_dir, - num_runs=num_runs, - dry_run=dry_run, - ) - - -def run_main( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: list[dict[str, object]], - bench_params: list[dict[str, object]], - sla_params: list[dict[str, SLACriterionBase]], - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, - resume: str | None, -): - timestamp = resume or datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = output_dir / timestamp - - if resume and not output_dir.exists(): - raise ValueError(f"Cannot resume from non-existent directory ({output_dir})") - - try: - return _run_main( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, - serve_params=serve_params, - bench_params=bench_params, - sla_params=sla_params, - sla_variable=sla_variable, - output_dir=output_dir, - num_runs=num_runs, - dry_run=dry_run, - ) - except BaseException as exc: - raise RuntimeError( - f"The script was terminated early. Use `--resume {timestamp}` " - f"to continue the script from its last checkpoint." - ) from exc - - -def main(): - parser = argparse.ArgumentParser( - description="Run vLLM server benchmark on a parameter grid of settings." 
- ) - parser.add_argument( - "--serve-cmd", - type=str, - required=True, - help="The command used to run the server: `vllm serve ...`", - ) - parser.add_argument( - "--bench-cmd", - type=str, - required=True, - help="The command used to run the benchmark: `vllm bench serve ...`", - ) - parser.add_argument( - "--after-bench-cmd", - type=str, - default=None, - help="After a benchmark run is complete, invoke this command instead of the " - "default `ServerWrapper.clear_cache()`.", - ) - parser.add_argument( - "--show-stdout", - action="store_true", - help="If set, logs the standard output of subcommands. " - "Useful for debugging but can be quite spammy.", - ) - parser.add_argument( - "--serve-params", - type=str, - default=None, - help="Path to JSON file containing a list of parameter combinations " - "for the `vllm serve` command. " - "If both `serve_params` and `bench_params` are given, " - "this script will iterate over their Cartesian product.", - ) - parser.add_argument( - "--bench-params", - type=str, - default=None, - help="Path to JSON file containing a list of parameter combinations " - "for the `vllm bench serve` command. " - "If both `serve_params` and `bench_params` are given, " - "this script will iterate over their Cartesian product.", - ) - parser.add_argument( - "--sla-params", - type=str, - default=None, - help="Path to JSON file containing a list of SLA constraints to satisfy. " - 'Each constraint is expressed in `{"": ""}` format, ' - 'e.g.: `{"p99_e2el_ms": "<=500"}` means that ' - "the E2E latency should be less than 500ms 99% of the time. " - "Setting this option runs this script in SLA mode, which searches for the " - "maximum `sla_variable` that satisfies the constraints for each combination " - "of `serve_params`, `bench_params`, and `sla_params`.", - ) - parser.add_argument( - "--sla-variable", - type=str, - choices=get_args(SLAVariable), - default="request_rate", - help="Whether to tune request rate or maximum concurrency to satisfy " - "the SLA constraints.", - ) - parser.add_argument( - "-o", - "--output-dir", - type=str, - default="results", - help="The directory to which results are written.", - ) - parser.add_argument( - "--num-runs", - type=int, - default=3, - help="Number of runs per parameter combination.", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="If set, prints the commands to run then exits without running them.", - ) - parser.add_argument( - "--resume", - type=str, - default=None, - help="Set this to the name of a directory under `output_dir` (which is a " - "timestamp) to resume a previous execution of this script, i.e., only run " - "parameter combinations for which there are still no output files.", - ) - - args = parser.parse_args() - - serve_cmd = shlex.split(args.serve_cmd) - bench_cmd = shlex.split(args.bench_cmd) - after_bench_cmd = ( - [] if args.after_bench_cmd is None else shlex.split(args.after_bench_cmd) - ) - - serve_params: list[dict[str, object]] - if args.serve_params: - with open(args.serve_params, "rb") as f: - serve_params = _parse_params(json.load(f)) - else: - # i.e.: run serve_cmd without any modification - serve_params = [{}] - - bench_params: list[dict[str, object]] - if args.bench_params: - with open(args.bench_params, "rb") as f: - bench_params = _parse_params(json.load(f)) - else: - # i.e.: run bench_cmd without any modification - bench_params = [{}] - - sla_params: list[dict[str, SLACriterionBase]] - if args.sla_params: - with open(args.sla_params, "rb") as f: - sla_params = 
_parse_sla(json.load(f)) - else: - sla_params = [] - - num_runs = args.num_runs - if num_runs < 1: - raise ValueError("`num_runs` should be at least 1.") - - run_main( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=args.show_stdout, - serve_params=serve_params, - bench_params=bench_params, - sla_params=sla_params, - sla_variable=args.sla_variable, - output_dir=Path(args.output_dir), - num_runs=num_runs, - dry_run=args.dry_run, - resume=args.resume, - ) - - -if __name__ == "__main__": - main() From ae9d02133ff163e9b61945d738a08385a190a0cf Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:49:43 +0000 Subject: [PATCH 28/48] Update import Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 53887c82ee49..8637c9dfdec1 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -13,7 +13,7 @@ import seaborn as sns from typing_extensions import Self, override -from vllm.utils.collections import full_groupby +from vllm.utils.collection_utils import full_groupby class PlotFilterBase(ABC): From beb3854be6bc08e57ca535c8f979b7cf31ee3e9d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 15:57:36 +0000 Subject: [PATCH 29/48] Clean up Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 6 +++--- vllm/benchmarks/sweep/param_sweep.py | 2 +- vllm/benchmarks/sweep/plot.py | 20 ++++++++---------- vllm/benchmarks/sweep/serve.py | 31 ++++++++++++++-------------- vllm/benchmarks/sweep/utils.py | 4 ++++ 5 files changed, 32 insertions(+), 31 deletions(-) create mode 100644 vllm/benchmarks/sweep/utils.py diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 6ff475422e9f..225eb73c142f 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -996,7 +996,7 @@ The basic purpose of this script is to evaluate vLLM under different settings. 
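As a concrete illustration, the `--serve-params` and `--bench-params` files are JSON lists of dictionaries, one dictionary per parameter combination. A hypothetical way to generate one (the file name and parameter values here are illustrative only):

```python
import json

# Hypothetical parameter grid; each dict is one combination to sweep over.
serve_params = [
    {"max_num_seqs": 32, "max_num_batched_tokens": 1024},
    {"max_num_seqs": 64, "max_num_batched_tokens": 2048},
]

with open("benchmarks/serve_hparams.json", "w") as f:
    json.dump(serve_params, f, indent=4)
```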
F Example command: ```bash -python vllm/benchmarks/sweep/serve.py \ +python -m vllm.benchmarks.sweep.serve \ --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \ --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \ --serve-params benchmarks/serve_hparams.json \ @@ -1044,7 +1044,7 @@ For example, to ensure E2E latency within different target values for 99% of req Example command: ```bash -python vllm/benchmarks/sweep/serve.py \ +python -m vllm.benchmarks.sweep.serve \ --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \ --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \ --serve-params benchmarks/serve_hparams.json \ @@ -1073,7 +1073,7 @@ The algorithm for adjusting the SLA variable is as follows: Example command: ```bash -python vllm/benchmarks/sweep/plot.py benchmarks/results/ \ +python -m vllm.benchmarks.sweep.plot benchmarks/results/ \ --var-x max_concurrency \ --curve-by api_server_count,max_num_batched_tokens \ --row-by random_input_len \ diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py index 90006e02ba6a..986561ed8502 100644 --- a/vllm/benchmarks/sweep/param_sweep.py +++ b/vllm/benchmarks/sweep/param_sweep.py @@ -36,7 +36,7 @@ def from_record(cls, record: dict[str, object]): return cls(record) def __or__(self, other: dict[str, Any]): - return type(self)(self | other) + return type(self)(super().__or__(other)) # In JSON, we prefer "_" def _iter_param_key_candidates(self, param_key: str): diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 8637c9dfdec1..35e0c7e88e5a 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -15,6 +15,8 @@ from vllm.utils.collection_utils import full_groupby +from .utils import sanitize_filename + class PlotFilterBase(ABC): @classmethod @@ -152,17 +154,13 @@ def _get_group(run_data: dict[str, object], group_keys: list[str]): def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...]): - return fig_dir / ( - "-".join( - ( - "FIGURE" + ("-" if group else ""), - *(f"{k}={v}" for k, v in group), - ) - ) - .replace("/", "_") - .replace("..", "__") # Sanitize - + ".png" - ) + parts = list[str]() + if group: + parts.extend(("FIGURE-", *(f"{k}={v}" for k, v in group))) + else: + parts.append("figure") + + return fig_dir / sanitize_filename("-".join(parts) + ".png") class DummyExecutor: diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index 5599a5dbc78b..cc64b3af6230 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -15,6 +15,7 @@ from .param_sweep import ParameterSweep, ParameterSweepItem from .server import ServerProcess from .sla_sweep import SLASweep, SLASweepItem +from .utils import sanitize_filename @contextlib.contextmanager @@ -106,14 +107,13 @@ def _get_comb_base_path( serve_comb: ParameterSweepItem, bench_comb: ParameterSweepItem, ): - return output_dir / "-".join( - ( - "SERVE" + ("-" if serve_comb else ""), - serve_comb.as_text(sep="-"), - "BENCH" + ("-" if bench_comb else ""), - bench_comb.as_text(sep="-"), - ) - ).replace("/", "_").replace("..", "__") # Sanitize + parts = list[str]() + if serve_comb: + parts.extend(("SERVE-", *serve_comb.as_text(sep="-"))) + if bench_comb: + 
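The `__or__` change above avoids a subtle trap: inside `__or__`, writing `self | other` re-enters the same method and recurses forever, while `super().__or__(other)` runs `dict`'s implementation. A minimal reproduction of the fixed pattern (Python 3.9+ dict union):

```python
class Params(dict):
    def __or__(self, other):
        # `type(self)(self | other)` here would recurse infinitely;
        # delegating to dict.__or__ performs the actual merge.
        return type(self)(super().__or__(other))

merged = Params({"a": 1}) | {"b": 2}
print(type(merged).__name__, dict(merged))  # Params {'a': 1, 'b': 2}
```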
parts.extend(("BENCH-", *bench_comb.as_text(sep="-"))) + + return output_dir / sanitize_filename("-".join(parts)) def _get_comb_run_path(base_path: Path, run_number: int | None): @@ -226,14 +226,13 @@ def _get_sla_base_path( serve_comb: ParameterSweepItem, bench_comb: ParameterSweepItem, ): - return output_dir / "-".join( - ( - "SERVE" + ("-" if serve_comb else ""), - serve_comb.as_text(sep="-"), - "BENCH" + ("-" if bench_comb else ""), - bench_comb.as_text(sep="-"), - ) - ).replace("/", "_").replace("..", "__") # Sanitize + parts = list[str]() + if serve_comb: + parts.extend(("SERVE-", *serve_comb.as_text(sep="-"))) + if bench_comb: + parts.extend(("BENCH-", *bench_comb.as_text(sep="-"))) + + return output_dir / sanitize_filename("-".join(parts)) def _get_sla_iter_path( diff --git a/vllm/benchmarks/sweep/utils.py b/vllm/benchmarks/sweep/utils.py new file mode 100644 index 000000000000..5a9e7d932b59 --- /dev/null +++ b/vllm/benchmarks/sweep/utils.py @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +def sanitize_filename(filename: str) -> str: + return filename.replace("/", "_").replace("..", "__") From 5f36c62774e701bcc00046a7a4031bf5f2cc4e3c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:01:22 +0000 Subject: [PATCH 30/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/serve.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index cc64b3af6230..fb966ce41ab9 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -109,9 +109,9 @@ def _get_comb_base_path( ): parts = list[str]() if serve_comb: - parts.extend(("SERVE-", *serve_comb.as_text(sep="-"))) + parts.extend(("SERVE-", serve_comb.as_text(sep="-").split("-"))) if bench_comb: - parts.extend(("BENCH-", *bench_comb.as_text(sep="-"))) + parts.extend(("BENCH-", bench_comb.as_text(sep="-").split("-"))) return output_dir / sanitize_filename("-".join(parts)) @@ -228,9 +228,9 @@ def _get_sla_base_path( ): parts = list[str]() if serve_comb: - parts.extend(("SERVE-", *serve_comb.as_text(sep="-"))) + parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) if bench_comb: - parts.extend(("BENCH-", *bench_comb.as_text(sep="-"))) + parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) return output_dir / sanitize_filename("-".join(parts)) From 8fbfd49f95e8c8ceda4d631db116efde0bc69817 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:11:13 +0000 Subject: [PATCH 31/48] Simplify Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 35e0c7e88e5a..f8cba47bfc37 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -373,9 +373,9 @@ def main(): parser.add_argument( "--fig-dir", type=str, - default=None, - help="The directory to save the figures. " - "By default, this is set to `OUTPUT_DIR`.", + default="", + help="The directory to save the figures, relative to `OUTPUT_DIR`. 
" + "By default, the same directory is used.", ) parser.add_argument( "--curve-by", @@ -459,14 +459,18 @@ def main(): args = parser.parse_args() + output_dir = Path(args.OUTPUT_DIR) + if not output_dir.exists(): + raise ValueError(f"No parameter sweep results under {output_dir}") + curve_by = [] if not args.curve_by else args.curve_by.split(",") row_by = [] if not args.row_by else args.row_by.split(",") col_by = [] if not args.col_by else args.col_by.split(",") fig_by = [] if not args.fig_by else args.fig_by.split(",") plot( - output_dir=Path(args.OUTPUT_DIR), - fig_dir=Path(args.fig_dir or args.OUTPUT_DIR), + output_dir=output_dir, + fig_dir=output_dir / args.fig_dir, fig_by=fig_by, row_by=row_by, col_by=col_by, From daee7a84f34dc670ba4385f51d9b97e5635a484c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:17:50 +0000 Subject: [PATCH 32/48] Fix legend Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index f8cba47bfc37..f31f6d685d82 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -278,6 +278,8 @@ def _plot_fig( size=size, markers=True, ) + + g.add_legend(title=hue) else: df["curve_group"] = ( pd.concat( @@ -296,7 +298,7 @@ def _plot_fig( markers=True, ) - g.add_legend() + g.add_legend() g.savefig(fig_path) plt.close(g.figure) From b9e08ffe0057249086fdc0e5a9d38c105ee4ba80 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:22:38 +0000 Subject: [PATCH 33/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index f31f6d685d82..7c9678c21e10 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -448,7 +448,8 @@ def main(): ) parser.add_argument( "--scale-y", - action="store_true", + type=str, + default=None, help="The scale to use for the y-axis. " "Currently only accepts string values such as 'log' and 'sqrt'. 
" "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", From ad4149b12693fe03152c909f3a73aeed6e57aa64 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:26:27 +0000 Subject: [PATCH 34/48] Reword Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 3 ++- vllm/benchmarks/sweep/serve.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 7c9678c21e10..691d2123dbab 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -457,7 +457,8 @@ def main(): parser.add_argument( "--dry-run", action="store_true", - help="If set, prints the location of the figures without drawing them.", + help="If set, prints the information about each figure to plot, " + "then exits without drawing them.", ) args = parser.parse_args() diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index fb966ce41ab9..c1fad83fdfb2 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -763,7 +763,7 @@ def main(): parser.add_argument( "--dry-run", action="store_true", - help="If set, prints the commands to run then exits without running them.", + help="If set, prints the commands to run, then exits without executing them.", ) parser.add_argument( "--resume", From c5eaf789c1dce76bb46e6df82dfa7b4724f450c1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:28:07 +0000 Subject: [PATCH 35/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 691d2123dbab..82ae3b294a45 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -118,7 +118,7 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: PLOT_BINNERS: dict[str, type[PlotBinner]] = { - "@": PlotBinner, + "%": PlotBinner, } From c7426c254c6aa80cfea14028cacde887752ea0ad Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:29:57 +0000 Subject: [PATCH 36/48] Reword Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 225eb73c142f..f119a03d28dc 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -1082,7 +1082,7 @@ python -m vllm.benchmarks.sweep.plot benchmarks/results/ \ ``` !!! tip - You can use `--dry-run` to preview the commands to be run. + You can use `--dry-run` to preview the figures to be plotted. 
## Performance Benchmarks From 49deaab208425bf7504cf0bb329008812b25110e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 16:32:16 +0000 Subject: [PATCH 37/48] Reorder Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 2 +- vllm/benchmarks/sweep/plot.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index f119a03d28dc..99ca3c5dd234 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -1075,9 +1075,9 @@ Example command: ```bash python -m vllm.benchmarks.sweep.plot benchmarks/results/ \ --var-x max_concurrency \ - --curve-by api_server_count,max_num_batched_tokens \ --row-by random_input_len \ --col-by random_output_len \ + --curve-by api_server_count,max_num_batched_tokens \ --filter-by 'max_concurrency<=1024' ``` diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 82ae3b294a45..eb8d997ab1da 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -380,31 +380,31 @@ def main(): "By default, the same directory is used.", ) parser.add_argument( - "--curve-by", + "--fig-by", type=str, - default=None, - help="A comma-separated list of variables, such that a separate curve " + default="", + help="A comma-separated list of variables, such that a separate figure " "is created for each combination of these variables.", ) parser.add_argument( - "--col-by", + "--row-by", type=str, default="", - help="A comma-separated list of variables, such that a separate column " + help="A comma-separated list of variables, such that a separate row " "is created for each combination of these variables.", ) parser.add_argument( - "--row-by", + "--col-by", type=str, default="", - help="A comma-separated list of variables, such that a separate row " + help="A comma-separated list of variables, such that a separate column " "is created for each combination of these variables.", ) parser.add_argument( - "--fig-by", + "--curve-by", type=str, - default="", - help="A comma-separated list of variables, such that a separate figure " + default=None, + help="A comma-separated list of variables, such that a separate curve " "is created for each combination of these variables.", ) parser.add_argument( From 0a4eb3693b61abef79c0446b3bd16eb4e28b31f7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 20 Oct 2025 17:37:36 +0000 Subject: [PATCH 38/48] Informative error Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/serve.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index c1fad83fdfb2..49014738911f 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -79,7 +79,9 @@ def _run_benchmark( return run_data if server is None: - assert dry_run + if not dry_run: + raise ValueError(f"Cannot find results at {output_path}") + print("[END BENCHMARK]") return None From 8afa4d3f69f7b770414bbaf1e6a3589d48d02e0a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 01:46:01 +0000 Subject: [PATCH 39/48] Separate out SLA tuner Signed-off-by: DarkLight1337 --- docs/contributing/benchmarks.md | 12 +- vllm/benchmarks/sweep/plot.py | 14 +- vllm/benchmarks/sweep/serve.py | 712 ++++++----------------------- vllm/benchmarks/sweep/serve_sla.py | 483 +++++++++++++++++++ 4 files changed, 632 insertions(+), 589 deletions(-) create mode 100644 vllm/benchmarks/sweep/serve_sla.py diff --git 
a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 99ca3c5dd234..89524ed3bc63 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -929,11 +929,9 @@ throughput numbers correctly is also adjusted. ### Online Benchmark -[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` over multiple configurations. +[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations. -#### Batch Mode - -The basic purpose of this script is to evaluate vLLM under different settings. Follows these steps to run the script: +Follow these steps to run the script: 1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option. 2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option. @@ -1018,9 +1016,9 @@ python -m vllm.benchmarks.sweep.serve \ !!! tip You can use the `--resume` option to continue the parameter sweep if one of the runs failed. -#### SLA Mode +### SLA Auto-Tuner -By passing SLA constraints via `--sla-params`, you can run this script in SLA mode, causing it to adjust either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints. +[`vllm/benchmarks/sweep/serve_sla.py`](../../vllm/benchmarks/sweep/serve_sla.py) is a wrapper over [`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`. For example, to ensure E2E latency within different target values for 99% of requests: @@ -1044,7 +1042,7 @@ For example, to ensure E2E latency within different target values for 99% of req Example command: ```bash -python -m vllm.benchmarks.sweep.serve \ +python -m vllm.benchmarks.sweep.serve_sla \ --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \ --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \ --serve-params benchmarks/serve_hparams.json \ diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index eb8d997ab1da..8cdbe5980e8f 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -361,10 +361,7 @@ def plot( ) -def main(): - parser = argparse.ArgumentParser( - description="Plot performance curves from parameter sweep results." - ) +def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "OUTPUT_DIR", type=str, @@ -461,8 +458,8 @@ def main(): "then exits without drawing them.", ) - args = parser.parse_args() +def main(args: argparse.Namespace): output_dir = Path(args.OUTPUT_DIR) if not output_dir.exists(): raise ValueError(f"No parameter sweep results under {output_dir}") @@ -490,4 +487,9 @@ def main(): if __name__ == "__main__": - main() + parser = argparse.ArgumentParser( + description="Plot performance curves from parameter sweep results." 
+ ) + add_cli_args(parser) + + main(parser.parse_args()) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index 49014738911f..e99052247e71 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -3,23 +3,20 @@ import argparse import contextlib import json -import math import shlex +from dataclasses import dataclass from datetime import datetime from pathlib import Path -from typing import Literal, get_args import pandas as pd -from typing_extensions import assert_never from .param_sweep import ParameterSweep, ParameterSweepItem from .server import ServerProcess -from .sla_sweep import SLASweep, SLASweepItem from .utils import sanitize_filename @contextlib.contextmanager -def _run_server( +def run_server( serve_cmd: list[str], after_bench_cmd: list[str], *, @@ -44,7 +41,7 @@ def _run_server( print("[END SERVER]") -def _run_benchmark( +def run_benchmark( server: ServerProcess | None, bench_cmd: list[str], *, @@ -138,7 +135,7 @@ def _comb_needs_server( return False -def _run_comb( +def run_comb( server: ServerProcess | None, bench_cmd: list[str], *, @@ -151,7 +148,7 @@ def _run_comb( comb_data = list[dict[str, object]]() for run_number in range(num_runs): - run_data = _run_benchmark( + run_data = run_benchmark( server, bench_cmd, serve_overrides=serve_comb, @@ -188,7 +185,7 @@ def run_combs( all_data = list[dict[str, object]]() for serve_comb in serve_params: with ( - _run_server( + run_server( serve_cmd, after_bench_cmd, show_stdout=show_stdout, @@ -201,7 +198,7 @@ def run_combs( for bench_comb in bench_params: base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb) - comb_data = _run_comb( + comb_data = run_comb( server, bench_cmd, serve_comb=serve_comb, @@ -223,457 +220,150 @@ def run_combs( return combined_df -def _get_sla_base_path( - output_dir: Path, - serve_comb: ParameterSweepItem, - bench_comb: ParameterSweepItem, -): - parts = list[str]() - if serve_comb: - parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) - if bench_comb: - parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) - - return output_dir / sanitize_filename("-".join(parts)) - - -def _get_sla_iter_path( - base_path: Path, - sla_comb: SLASweepItem, - sla_variable: str, - sla_value: int | None, -): - if sla_value is None: - prefix = sla_comb.as_text(sep="-") - return base_path / f"SLA--{prefix}.json" - - return base_path / f"{sla_variable}={sla_value}" - - -def _get_sla_run_path(iter_path: Path, run_number: int | None): - if run_number is None: - return iter_path / "summary.json" - - return iter_path / f"run={run_number}.json" - - -def _sla_needs_server( - serve_comb: ParameterSweepItem, - bench_combs: ParameterSweep, - sla_combs: SLASweep, - sla_variable: str, - output_dir: Path, -): - for bench_comb in bench_combs: - base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) - for sla_comb in sla_combs: - if not _get_sla_iter_path( - base_path, - sla_comb, - sla_variable, - sla_value=None, - ).exists(): - return True - - return False - - -def _run_sla( - server: ServerProcess | None, - bench_cmd: list[str], - *, - serve_comb: ParameterSweepItem, - bench_comb: ParameterSweepItem, - iter_path: Path, - num_runs: int, - dry_run: bool, -): - iter_data = list[dict[str, object]]() - - for run_number in range(num_runs): - run_data = _run_benchmark( - server, - bench_cmd, - serve_overrides=serve_comb, - bench_overrides=bench_comb, - run_number=run_number, - output_path=_get_sla_run_path(iter_path, run_number), - dry_run=dry_run, - ) - - if run_data is 
not None: - iter_data.append(run_data) - - if dry_run: - return None - - with _get_sla_run_path(iter_path, run_number=None).open("w") as f: - json.dump(iter_data, f, indent=4) - - return iter_data - - -SLAVariable = Literal["request_rate", "max_concurrency"] - - -def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable): - request_throughput = float(run_data["request_throughput"]) # type: ignore - if sla_variable == "request_rate": - return request_throughput - if sla_variable == "max_concurrency": - mean_latency_ms = float(run_data["mean_e2el_ms"]) # type: ignore - return request_throughput * mean_latency_ms / 1000 - - assert_never(sla_variable) - - -def _estimate_sla_bounds( - server: ServerProcess | None, - bench_cmd: list[str], - *, - serve_comb: ParameterSweepItem, - bench_comb: ParameterSweepItem, - sla_comb: SLASweepItem, - base_path: Path, - num_runs: int, - dry_run: bool, - sla_variable: SLAVariable, - init_value: int, - max_value: int, -): - sla_data = list[dict[str, object]]() - - max_passing: int = 0 - min_failing: int = 0 - - val: int = init_value - assert val > 0 - - while True: - print(f"Testing {sla_variable}: {val} req/s") - - iter_data = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb | {sla_variable: val}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), - num_runs=num_runs, - dry_run=dry_run, +@dataclass +class SweepServeArgs: + serve_cmd: list[str] + bench_cmd: list[str] + after_bench_cmd: list[str] + show_stdout: bool + serve_params: ParameterSweep + bench_params: ParameterSweep + output_dir: Path + num_runs: int + dry_run: bool + resume: str | None + + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + serve_cmd = shlex.split(args.serve_cmd) + bench_cmd = shlex.split(args.bench_cmd) + after_bench_cmd = ( + [] if args.after_bench_cmd is None else shlex.split(args.after_bench_cmd) ) - assert iter_data is not None - sla_data.extend(iter_data) - - iter_data_mean = { - k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore - for k in sla_comb - } - - sla_results = [ - criterion.print_and_validate(iter_data_mean, k) - for k, criterion in sla_comb.items() - ] - - if all(sla_results): - print("SLA criteria are met.") - max_passing = val - val *= 2 + if args.serve_params: + serve_params = ParameterSweep.read_json(args.serve_params) else: - print("SLA criteria are not met.") - min_failing = val - break - - if val >= max_value: - break - - return sla_data, (max_passing, min_failing) - + # i.e.: run serve_cmd without any modification + serve_params = ParameterSweep.from_records([{}]) -def _find_sla_value( - server: ServerProcess | None, - bench_cmd: list[str], - *, - serve_comb: ParameterSweepItem, - bench_comb: ParameterSweepItem, - sla_comb: SLASweepItem, - base_path: Path, - num_runs: int, - dry_run: bool, - sla_variable: SLAVariable, - min_value: int, - max_value: int, -): - sla_data = list[dict[str, object]]() - - left: int = min_value - right: int = max_value - - while True: - val = (left + right) // 2 - print(f"Testing {sla_variable}: {val} req/s") - - iter_data = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb | {sla_variable: val}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), - num_runs=num_runs, - dry_run=dry_run, - ) - - assert iter_data is not None - sla_data.extend(iter_data) - - iter_data_mean = { - k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore 
- for k in sla_comb - } - - sla_results = [ - criterion.print_and_validate(iter_data_mean, k) - for k, criterion in sla_comb.items() - ] - - if all(sla_results): - print("SLA criteria are met.") - left = val + if args.bench_params: + bench_params = ParameterSweep.read_json(args.bench_params) else: - print("SLA criteria are not met.") - right = val - - if right - left <= 1: - break - - return sla_data, left - - -def _search_sla( - server: ServerProcess | None, - bench_cmd: list[str], - *, - serve_comb: ParameterSweepItem, - bench_comb: ParameterSweepItem, - sla_comb: SLASweepItem, - sla_variable: SLAVariable, - sla_inf_value: int = 65536, # The value that represents infinite QPS - base_path: Path, - num_runs: int, - dry_run: bool, -): - print("[SLA START]") - print(f"SLA criteria: {sla_comb.as_text()}") - - sla_data_0 = _run_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb | {sla_variable: sla_inf_value}, - iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value), - num_runs=num_runs, - dry_run=dry_run, - ) - if sla_data_0 is None: - assert dry_run - print("Omitting SLA search.") - print("[SLA END]") - return None + # i.e.: run bench_cmd without any modification + bench_params = ParameterSweep.from_records([{}]) - sla_init_value = math.ceil( - sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0) - / len(sla_data_0) - ) - print(f"Initial {sla_variable} to search: {sla_init_value} req/s.") - - sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - sla_variable=sla_variable, - init_value=sla_init_value, - max_value=sla_inf_value, - ) - print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.") - - sla_data_2, sla_value = _find_sla_value( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - sla_variable=sla_variable, - min_value=sla_min, - max_value=sla_max, - ) + num_runs = args.num_runs + if num_runs < 1: + raise ValueError("`num_runs` should be at least 1.") - sla_data = sla_data_0 + sla_data_1 + sla_data_2 - print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.") - - with _get_sla_iter_path( - base_path, - sla_comb, - sla_variable, - sla_value=None, - ).open("w") as f: - json.dump(sla_data, f, indent=4) - - print("[SLA END]") - - return sla_data - - -def run_slas( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: ParameterSweep, - bench_params: ParameterSweep, - sla_params: SLASweep, - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, -): - if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params): - raise ValueError( - f"You should not override `{sla_variable}` in `bench_params` in SLA mode, " - "since it is supposed to be determined automatically." 
- ) - - all_data = list[dict[str, object]]() - for serve_comb in serve_params: - with ( - _run_server( - serve_cmd, - after_bench_cmd, - show_stdout=show_stdout, - serve_overrides=serve_comb, - dry_run=dry_run, - ) - if _sla_needs_server( - serve_comb, - bench_params, - sla_params, - sla_variable, - output_dir, - ) - else contextlib.nullcontext() - ) as server: - for bench_comb in bench_params: - for sla_comb in sla_params: - base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) - - comb_data = _search_sla( - server, - bench_cmd, - serve_comb=serve_comb, - bench_comb=bench_comb, - sla_comb=sla_comb, - sla_variable=sla_variable, - base_path=base_path, - num_runs=num_runs, - dry_run=dry_run, - ) - - if comb_data is not None: - all_data.extend(comb_data) - - if dry_run: - return None - - combined_df = pd.DataFrame.from_records(all_data) - combined_df.to_csv(output_dir / "summary.csv") - - return combined_df - - -def _run_main( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: ParameterSweep, - bench_params: ParameterSweep, - sla_params: SLASweep, - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, -): - if sla_params: - return run_slas( + return cls( serve_cmd=serve_cmd, bench_cmd=bench_cmd, after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, + show_stdout=args.show_stdout, serve_params=serve_params, bench_params=bench_params, - sla_params=sla_params, - sla_variable=sla_variable, - output_dir=output_dir, + output_dir=Path(args.output_dir), num_runs=num_runs, - dry_run=dry_run, + dry_run=args.dry_run, + resume=args.resume, ) - return run_combs( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, - serve_params=serve_params, - bench_params=bench_params, - output_dir=output_dir, - num_runs=num_runs, - dry_run=dry_run, - ) + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser.add_argument( + "--serve-cmd", + type=str, + required=True, + help="The command used to run the server: `vllm serve ...`", + ) + parser.add_argument( + "--bench-cmd", + type=str, + required=True, + help="The command used to run the benchmark: `vllm bench serve ...`", + ) + parser.add_argument( + "--after-bench-cmd", + type=str, + default=None, + help="After a benchmark run is complete, invoke this command instead of " + "the default `ServerWrapper.clear_cache()`.", + ) + parser.add_argument( + "--show-stdout", + action="store_true", + help="If set, logs the standard output of subcommands. " + "Useful for debugging but can be quite spammy.", + ) + parser.add_argument( + "--serve-params", + type=str, + default=None, + help="Path to JSON file containing a list of parameter combinations " + "for the `vllm serve` command. " + "If both `serve_params` and `bench_params` are given, " + "this script will iterate over their Cartesian product.", + ) + parser.add_argument( + "--bench-params", + type=str, + default=None, + help="Path to JSON file containing a list of parameter combinations " + "for the `vllm bench serve` command. 
" + "If both `serve_params` and `bench_params` are given, " + "this script will iterate over their Cartesian product.", + ) + parser.add_argument( + "-o", + "--output-dir", + type=str, + default="results", + help="The directory to which results are written.", + ) + parser.add_argument( + "--num-runs", + type=int, + default=3, + help="Number of runs per parameter combination.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="If set, prints the commands to run, " + "then exits without executing them.", + ) + parser.add_argument( + "--resume", + type=str, + default=None, + help="Set this to the name of a directory under `output_dir` (which is a " + "timestamp) to resume a previous execution of this script, i.e., only run " + "parameter combinations for which there are still no output files.", + ) + return parser -def run_main( - serve_cmd: list[str], - bench_cmd: list[str], - after_bench_cmd: list[str], - *, - show_stdout: bool, - serve_params: ParameterSweep, - bench_params: ParameterSweep, - sla_params: SLASweep, - sla_variable: SLAVariable, - output_dir: Path, - num_runs: int, - dry_run: bool, - resume: str | None, -): - timestamp = resume or datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = output_dir / timestamp - if resume and not output_dir.exists(): +def run_main(args: SweepServeArgs): + timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = args.output_dir / timestamp + + if args.resume and not output_dir.exists(): raise ValueError(f"Cannot resume from non-existent directory ({output_dir})") try: - return _run_main( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=show_stdout, - serve_params=serve_params, - bench_params=bench_params, - sla_params=sla_params, - sla_variable=sla_variable, + return run_combs( + serve_cmd=args.serve_cmd, + bench_cmd=args.bench_cmd, + after_bench_cmd=args.after_bench_cmd, + show_stdout=args.show_stdout, + serve_params=args.serve_params, + bench_params=args.bench_params, output_dir=output_dir, - num_runs=num_runs, - dry_run=dry_run, + num_runs=args.num_runs, + dry_run=args.dry_run, ) except BaseException as exc: raise RuntimeError( @@ -682,144 +372,14 @@ def run_main( ) from exc -def main(): - parser = argparse.ArgumentParser( - description="Run vLLM server benchmark under multiple settings." - ) - parser.add_argument( - "--serve-cmd", - type=str, - required=True, - help="The command used to run the server: `vllm serve ...`", - ) - parser.add_argument( - "--bench-cmd", - type=str, - required=True, - help="The command used to run the benchmark: `vllm bench serve ...`", - ) - parser.add_argument( - "--after-bench-cmd", - type=str, - default=None, - help="After a benchmark run is complete, invoke this command instead of the " - "default `ServerWrapper.clear_cache()`.", - ) - parser.add_argument( - "--show-stdout", - action="store_true", - help="If set, logs the standard output of subcommands. " - "Useful for debugging but can be quite spammy.", - ) - parser.add_argument( - "--serve-params", - type=str, - default=None, - help="Path to JSON file containing a list of parameter combinations " - "for the `vllm serve` command. " - "If both `serve_params` and `bench_params` are given, " - "this script will iterate over their Cartesian product.", - ) - parser.add_argument( - "--bench-params", - type=str, - default=None, - help="Path to JSON file containing a list of parameter combinations " - "for the `vllm bench serve` command. 
" - "If both `serve_params` and `bench_params` are given, " - "this script will iterate over their Cartesian product.", - ) - parser.add_argument( - "--sla-params", - type=str, - default=None, - help="Path to JSON file containing a list of SLA constraints to satisfy. " - 'Each constraint is expressed in `{"": ""}` format, ' - 'e.g.: `{"p99_e2el_ms": "<=500"}` means that ' - "the E2E latency should be less than 500ms 99% of the time. " - "Setting this option runs this script in SLA mode, which searches for the " - "maximum `sla_variable` that satisfies the constraints for each combination " - "of `serve_params`, `bench_params`, and `sla_params`.", - ) - parser.add_argument( - "--sla-variable", - type=str, - choices=get_args(SLAVariable), - default="request_rate", - help="Whether to tune request rate or maximum concurrency to satisfy " - "the SLA constraints.", - ) - parser.add_argument( - "-o", - "--output-dir", - type=str, - default="results", - help="The directory to which results are written.", - ) - parser.add_argument( - "--num-runs", - type=int, - default=3, - help="Number of runs per parameter combination.", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="If set, prints the commands to run, then exits without executing them.", - ) - parser.add_argument( - "--resume", - type=str, - default=None, - help="Set this to the name of a directory under `output_dir` (which is a " - "timestamp) to resume a previous execution of this script, i.e., only run " - "parameter combinations for which there are still no output files.", - ) - - args = parser.parse_args() +def main(args: argparse.Namespace): + run_main(SweepServeArgs.from_cli_args(args)) - serve_cmd = shlex.split(args.serve_cmd) - bench_cmd = shlex.split(args.bench_cmd) - after_bench_cmd = ( - [] if args.after_bench_cmd is None else shlex.split(args.after_bench_cmd) - ) - if args.serve_params: - serve_params = ParameterSweep.read_json(args.serve_params) - else: - # i.e.: run serve_cmd without any modification - serve_params = ParameterSweep.from_records([{}]) - - if args.bench_params: - bench_params = ParameterSweep.read_json(args.bench_params) - else: - # i.e.: run bench_cmd without any modification - bench_params = ParameterSweep.from_records([{}]) - - if args.sla_params: - sla_params = SLASweep.read_json(args.sla_params) - else: - sla_params = SLASweep.from_records([]) - - num_runs = args.num_runs - if num_runs < 1: - raise ValueError("`num_runs` should be at least 1.") - - run_main( - serve_cmd=serve_cmd, - bench_cmd=bench_cmd, - after_bench_cmd=after_bench_cmd, - show_stdout=args.show_stdout, - serve_params=serve_params, - bench_params=bench_params, - sla_params=sla_params, - sla_variable=args.sla_variable, - output_dir=Path(args.output_dir), - num_runs=num_runs, - dry_run=args.dry_run, - resume=args.resume, +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run vLLM server benchmark under multiple settings." 
) + SweepServeArgs.add_cli_args(parser) - -if __name__ == "__main__": - main() + main(parser.parse_args()) diff --git a/vllm/benchmarks/sweep/serve_sla.py b/vllm/benchmarks/sweep/serve_sla.py new file mode 100644 index 000000000000..62e2917dc22b --- /dev/null +++ b/vllm/benchmarks/sweep/serve_sla.py @@ -0,0 +1,483 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import contextlib +import json +import math +from dataclasses import asdict, dataclass +from datetime import datetime +from pathlib import Path +from typing import Literal, get_args + +import pandas as pd +from typing_extensions import assert_never + +from .param_sweep import ParameterSweep, ParameterSweepItem +from .serve import SweepServeArgs, run_benchmark, run_server +from .server import ServerProcess +from .sla_sweep import SLASweep, SLASweepItem +from .utils import sanitize_filename + + +def _get_sla_base_path( + output_dir: Path, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, +): + parts = list[str]() + if serve_comb: + parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) + if bench_comb: + parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) + + return output_dir / sanitize_filename("-".join(parts)) + + +def _get_sla_iter_path( + base_path: Path, + sla_comb: SLASweepItem, + sla_variable: str, + sla_value: int | None, +): + if sla_value is None: + prefix = sla_comb.as_text(sep="-") + return base_path / f"SLA--{prefix}.json" + + return base_path / f"{sla_variable}={sla_value}" + + +def _get_sla_run_path(iter_path: Path, run_number: int | None): + if run_number is None: + return iter_path / "summary.json" + + return iter_path / f"run={run_number}.json" + + +def _sla_needs_server( + serve_comb: ParameterSweepItem, + bench_combs: ParameterSweep, + sla_combs: SLASweep, + sla_variable: str, + output_dir: Path, +): + for bench_comb in bench_combs: + base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) + for sla_comb in sla_combs: + if not _get_sla_iter_path( + base_path, + sla_comb, + sla_variable, + sla_value=None, + ).exists(): + return True + + return False + + +def run_sla( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + iter_path: Path, + num_runs: int, + dry_run: bool, +): + iter_data = list[dict[str, object]]() + + for run_number in range(num_runs): + run_data = run_benchmark( + server, + bench_cmd, + serve_overrides=serve_comb, + bench_overrides=bench_comb, + run_number=run_number, + output_path=_get_sla_run_path(iter_path, run_number), + dry_run=dry_run, + ) + + if run_data is not None: + iter_data.append(run_data) + + if dry_run: + return None + + with _get_sla_run_path(iter_path, run_number=None).open("w") as f: + json.dump(iter_data, f, indent=4) + + return iter_data + + +SLAVariable = Literal["request_rate", "max_concurrency"] + + +def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable): + request_throughput = float(run_data["request_throughput"]) # type: ignore + if sla_variable == "request_rate": + return request_throughput + if sla_variable == "max_concurrency": + mean_latency_ms = float(run_data["mean_e2el_ms"]) # type: ignore + return request_throughput * mean_latency_ms / 1000 + + assert_never(sla_variable) + + +def _estimate_sla_bounds( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + 
base_path: Path, + num_runs: int, + dry_run: bool, + sla_variable: SLAVariable, + init_value: int, + max_value: int, +): + sla_data = list[dict[str, object]]() + + max_passing: int = 0 + min_failing: int = 0 + + val: int = init_value + assert val > 0 + + while True: + print(f"Testing {sla_variable}: {val} req/s") + + iter_data = run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: val}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), + num_runs=num_runs, + dry_run=dry_run, + ) + + assert iter_data is not None + sla_data.extend(iter_data) + + iter_data_mean = { + k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore + for k in sla_comb + } + + sla_results = [ + criterion.print_and_validate(iter_data_mean, k) + for k, criterion in sla_comb.items() + ] + + if all(sla_results): + print("SLA criteria are met.") + max_passing = val + val *= 2 + else: + print("SLA criteria are not met.") + min_failing = val + break + + if val >= max_value: + break + + return sla_data, (max_passing, min_failing) + + +def _find_sla_value( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + base_path: Path, + num_runs: int, + dry_run: bool, + sla_variable: SLAVariable, + min_value: int, + max_value: int, +): + sla_data = list[dict[str, object]]() + + left: int = min_value + right: int = max_value + + while True: + val = (left + right) // 2 + print(f"Testing {sla_variable}: {val} req/s") + + iter_data = run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: val}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, val), + num_runs=num_runs, + dry_run=dry_run, + ) + + assert iter_data is not None + sla_data.extend(iter_data) + + iter_data_mean = { + k: sum(float(run_data[k]) for run_data in iter_data) / len(iter_data) # type: ignore + for k in sla_comb + } + + sla_results = [ + criterion.print_and_validate(iter_data_mean, k) + for k, criterion in sla_comb.items() + ] + + if all(sla_results): + print("SLA criteria are met.") + left = val + else: + print("SLA criteria are not met.") + right = val + + if right - left <= 1: + break + + return sla_data, left + + +def search_sla( + server: ServerProcess | None, + bench_cmd: list[str], + *, + serve_comb: ParameterSweepItem, + bench_comb: ParameterSweepItem, + sla_comb: SLASweepItem, + sla_variable: SLAVariable, + sla_inf_value: int = 65536, # The value that represents infinite QPS + base_path: Path, + num_runs: int, + dry_run: bool, +): + print("[SLA START]") + print(f"SLA criteria: {sla_comb.as_text()}") + + sla_data_0 = run_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb | {sla_variable: sla_inf_value}, + iter_path=_get_sla_iter_path(base_path, sla_comb, sla_variable, sla_inf_value), + num_runs=num_runs, + dry_run=dry_run, + ) + if sla_data_0 is None: + assert dry_run + print("Omitting SLA search.") + print("[SLA END]") + return None + + sla_init_value = math.ceil( + sum(_estimate_sla_value(item, sla_variable) for item in sla_data_0) + / len(sla_data_0) + ) + print(f"Initial {sla_variable} to search: {sla_init_value} req/s.") + + sla_data_1, (sla_min, sla_max) = _estimate_sla_bounds( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + sla_variable=sla_variable, + 
init_value=sla_init_value, + max_value=sla_inf_value, + ) + print(f"Range of {sla_variable} to search: [{sla_min}, {sla_max}] req/s.") + + sla_data_2, sla_value = _find_sla_value( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + sla_variable=sla_variable, + min_value=sla_min, + max_value=sla_max, + ) + + sla_data = sla_data_0 + sla_data_1 + sla_data_2 + print(f"Maximum {sla_variable} for SLA: {sla_value} req/s.") + + with _get_sla_iter_path( + base_path, + sla_comb, + sla_variable, + sla_value=None, + ).open("w") as f: + json.dump(sla_data, f, indent=4) + + print("[SLA END]") + + return sla_data + + +def run_slas( + serve_cmd: list[str], + bench_cmd: list[str], + after_bench_cmd: list[str], + *, + show_stdout: bool, + serve_params: ParameterSweep, + bench_params: ParameterSweep, + sla_params: SLASweep, + sla_variable: SLAVariable, + output_dir: Path, + num_runs: int, + dry_run: bool, +): + if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params): + raise ValueError( + f"You should not override `{sla_variable}` in `bench_params` in SLA mode, " + "since it is supposed to be determined automatically." + ) + + all_data = list[dict[str, object]]() + for serve_comb in serve_params: + with ( + run_server( + serve_cmd, + after_bench_cmd, + show_stdout=show_stdout, + serve_overrides=serve_comb, + dry_run=dry_run, + ) + if _sla_needs_server( + serve_comb, + bench_params, + sla_params, + sla_variable, + output_dir, + ) + else contextlib.nullcontext() + ) as server: + for bench_comb in bench_params: + for sla_comb in sla_params: + base_path = _get_sla_base_path(output_dir, serve_comb, bench_comb) + + comb_data = search_sla( + server, + bench_cmd, + serve_comb=serve_comb, + bench_comb=bench_comb, + sla_comb=sla_comb, + sla_variable=sla_variable, + base_path=base_path, + num_runs=num_runs, + dry_run=dry_run, + ) + + if comb_data is not None: + all_data.extend(comb_data) + + if dry_run: + return None + + combined_df = pd.DataFrame.from_records(all_data) + combined_df.to_csv(output_dir / "summary.csv") + + return combined_df + + +@dataclass +class SweepServeSLAArgs(SweepServeArgs): + sla_params: SLASweep + sla_variable: SLAVariable + + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + base_args = super().from_cli_args(args) + + if args.sla_params: + sla_params = SLASweep.read_json(args.sla_params) + else: + sla_params = SLASweep.from_records([]) + + return cls( + **asdict(base_args), + sla_params=sla_params, + sla_variable=args.sla_variable, + ) + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser: + parser = super().add_cli_args(parser) + + parser.add_argument( + "--sla-params", + type=str, + required=True, + help="Path to JSON file containing a list of SLA constraints to satisfy. " + 'Each constraint is expressed in `{"": ""}` format, ' + 'e.g.: `{"p99_e2el_ms": "<=500"}` means that ' + "the E2E latency should be less than 500ms 99%% of the time. 
" + "Setting this option runs this script in SLA mode, which searches for " + "the maximum `sla_variable` that satisfies the constraints for " + "each combination of `serve_params`, `bench_params`, and `sla_params`.", + ) + parser.add_argument( + "--sla-variable", + type=str, + choices=get_args(SLAVariable), + default="request_rate", + help="Whether to tune request rate or maximum concurrency to satisfy " + "the SLA constraints.", + ) + + return parser + + +def run_main(args: SweepServeSLAArgs): + timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = args.output_dir / timestamp + + if args.resume and not output_dir.exists(): + raise ValueError(f"Cannot resume from non-existent directory ({output_dir})") + + try: + return run_slas( + serve_cmd=args.serve_cmd, + bench_cmd=args.bench_cmd, + after_bench_cmd=args.after_bench_cmd, + show_stdout=args.show_stdout, + serve_params=args.serve_params, + bench_params=args.bench_params, + sla_params=args.sla_params, + sla_variable=args.sla_variable, + output_dir=output_dir, + num_runs=args.num_runs, + dry_run=args.dry_run, + ) + except BaseException as exc: + raise RuntimeError( + f"The script was terminated early. Use `--resume {timestamp}` " + f"to continue the script from its last checkpoint." + ) from exc + + +def main(args: argparse.Namespace): + run_main(SweepServeSLAArgs.from_cli_args(args)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Tune a variable to meet SLAs under multiple settings." + ) + SweepServeSLAArgs.add_cli_args(parser) + + main(parser.parse_args()) From 3fa0d4c10d91683813bdd7aa4776834410a292a1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 01:51:06 +0000 Subject: [PATCH 40/48] Update Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 15 +++++++++++---- vllm/benchmarks/sweep/sla_sweep.py | 8 ++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 8cdbe5980e8f..fc8098d67a2c 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -43,13 +43,19 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: raise NotImplementedError +class PlotEqualTo(PlotFilterBase): + @override + def apply(self, df: pd.DataFrame) -> pd.DataFrame: + return df[df[self.var] == self.target] + + class PlotLessThan(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] < self.target] -class PlotLessThanOrEqual(PlotFilterBase): +class PlotLessThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] <= self.target] @@ -61,7 +67,7 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] > self.target] -class PlotGreaterThanOrEqual(PlotFilterBase): +class PlotGreaterThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] >= self.target] @@ -69,8 +75,9 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: # NOTE: The ordering is important! 
Match longer op_keys first PLOT_FILTERS: dict[str, type[PlotFilterBase]] = { - "<=": PlotLessThanOrEqual, - ">=": PlotGreaterThanOrEqual, + "==": PlotEqualTo, + "<=": PlotLessThanOrEqualTo, + ">=": PlotGreaterThanOrEqualTo, "<": PlotLessThan, ">": PlotGreaterThan, } diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py index 6a58b7149a02..a15e165b894f 100644 --- a/vllm/benchmarks/sweep/sla_sweep.py +++ b/vllm/benchmarks/sweep/sla_sweep.py @@ -46,7 +46,7 @@ def format_cond(self, lhs: str) -> str: return f"{lhs}<{self.target:.2f}" -class SLALessThanOrEqual(SLACriterionBase): +class SLALessThanOrEqualTo(SLACriterionBase): @override def validate(self, actual: float) -> bool: return actual <= self.target @@ -66,7 +66,7 @@ def format_cond(self, lhs: str) -> str: return f"{lhs}>{self.target:.2f}" -class SLAGreaterThanOrEqual(SLACriterionBase): +class SLAGreaterThanOrEqualTo(SLACriterionBase): @override def validate(self, actual: float) -> bool: return actual >= self.target @@ -78,8 +78,8 @@ def format_cond(self, lhs: str) -> str: # NOTE: The ordering is important! Match longer op_keys first SLA_CRITERIA: dict[str, type[SLACriterionBase]] = { - "<=": SLALessThanOrEqual, - ">=": SLAGreaterThanOrEqual, + "<=": SLALessThanOrEqualTo, + ">=": SLAGreaterThanOrEqualTo, "<": SLALessThan, ">": SLAGreaterThan, } From a3d10958e2e8c9287608c40e6e22f8833011c5aa Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 02:27:31 +0000 Subject: [PATCH 41/48] Improve error message Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index fc8098d67a2c..ba005723e1da 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -235,6 +235,24 @@ def _plot_fig( f"Cannot find {var_y=!r} in parameter sweep results. " f"Available variables: {df.columns.tolist()}" ) + for k in row_by: + if k not in df.columns: + raise ValueError( + f"Cannot find row_by={k!r} in parameter sweep results. " + f"Available variables: {df.columns.tolist()}" + ) + for k in col_by: + if k not in df.columns: + raise ValueError( + f"Cannot find col_by={k!r} in parameter sweep results. " + f"Available variables: {df.columns.tolist()}" + ) + for k in curve_by: + if k not in df.columns: + raise ValueError( + f"Cannot find curve_by={k!r} in parameter sweep results. " + f"Available variables: {df.columns.tolist()}" + ) df = filter_by.apply(df) df = bin_by.apply(df) From a4adbda5eaf42c99cc3c3aca4cab34fdb3ed635d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 02:51:06 +0000 Subject: [PATCH 42/48] Allow strings Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index ba005723e1da..8fcde751f9d3 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -24,14 +24,14 @@ def parse_str(cls, s: str): for op_key in PLOT_FILTERS: if op_key in s: key, value = s.split(op_key) - return PLOT_FILTERS[op_key](key, float(value.removeprefix(op_key))) + return PLOT_FILTERS[op_key](key, value.removeprefix(op_key)) else: raise ValueError( f"Invalid operator for plot filter '{s}'. 
" f"Valid operators are: {set(PLOT_FILTERS)}", ) - def __init__(self, var: str, target: float) -> None: + def __init__(self, var: str, target: str) -> None: super().__init__() self.var = var @@ -52,25 +52,25 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: class PlotLessThan(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: - return df[df[self.var] < self.target] + return df[df[self.var] < float(self.target)] class PlotLessThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: - return df[df[self.var] <= self.target] + return df[df[self.var] <= float(self.target)] class PlotGreaterThan(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: - return df[df[self.var] > self.target] + return df[df[self.var] > float(self.target)] class PlotGreaterThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: - return df[df[self.var] >= self.target] + return df[df[self.var] >= float(self.target)] # NOTE: The ordering is important! Match longer op_keys first From 6357b84d3d8f77bf620c2ef2349ef6debc1942b1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 02:57:15 +0000 Subject: [PATCH 43/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 8fcde751f9d3..d483a2272dab 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -46,7 +46,12 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: class PlotEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: - return df[df[self.var] == self.target] + try: + target = float(self.target) + except ValueError: + target = self.target + + return df[df[self.var] == target] class PlotLessThan(PlotFilterBase): From f750fc5a4acf6a9718cf70f5ca936683e809f402 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 03:01:42 +0000 Subject: [PATCH 44/48] Fix Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 31 +++++++++++++++++------------- vllm/benchmarks/sweep/sla_sweep.py | 11 +++++++---- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index d483a2272dab..7390440075e6 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -4,6 +4,7 @@ import json from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor +from dataclasses import dataclass from functools import partial from pathlib import Path from types import TracebackType @@ -18,31 +19,33 @@ from .utils import sanitize_filename +@dataclass class PlotFilterBase(ABC): + var: str + target: str + @classmethod def parse_str(cls, s: str): for op_key in PLOT_FILTERS: if op_key in s: key, value = s.split(op_key) - return PLOT_FILTERS[op_key](key, value.removeprefix(op_key)) + return PLOT_FILTERS[op_key]( + key, + value.removeprefix(op_key).strip("'").strip('"'), + ) else: raise ValueError( f"Invalid operator for plot filter '{s}'. 
" f"Valid operators are: {set(PLOT_FILTERS)}", ) - def __init__(self, var: str, target: str) -> None: - super().__init__() - - self.var = var - self.target = target - @abstractmethod def apply(self, df: pd.DataFrame) -> pd.DataFrame: """Applies this filter to a DataFrame.""" raise NotImplementedError +@dataclass class PlotEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: @@ -54,24 +57,28 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] == target] +@dataclass class PlotLessThan(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] < float(self.target)] +@dataclass class PlotLessThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] <= float(self.target)] +@dataclass class PlotGreaterThan(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df[df[self.var] > float(self.target)] +@dataclass class PlotGreaterThanOrEqualTo(PlotFilterBase): @override def apply(self, df: pd.DataFrame) -> pd.DataFrame: @@ -103,7 +110,11 @@ def apply(self, df: pd.DataFrame) -> pd.DataFrame: return df +@dataclass class PlotBinner: + var: str + bin_size: float + @classmethod def parse_str(cls, s: str): for op_key in PLOT_BINNERS: @@ -116,12 +127,6 @@ def parse_str(cls, s: str): f"Valid operators are: {set(PLOT_BINNERS)}", ) - def __init__(self, var: str, bin_size: float) -> None: - super().__init__() - - self.var = var - self.bin_size = bin_size - def apply(self, df: pd.DataFrame) -> pd.DataFrame: """Applies this binner to a DataFrame.""" df = df.copy() diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py index a15e165b894f..ba7d78802e9d 100644 --- a/vllm/benchmarks/sweep/sla_sweep.py +++ b/vllm/benchmarks/sweep/sla_sweep.py @@ -3,15 +3,14 @@ import json import os from abc import ABC, abstractmethod +from dataclasses import dataclass from typing_extensions import override +@dataclass class SLACriterionBase(ABC): - def __init__(self, target: float) -> None: - super().__init__() - - self.target = target + target: float @abstractmethod def validate(self, actual: float) -> bool: @@ -36,6 +35,7 @@ def print_and_validate( return result +@dataclass class SLALessThan(SLACriterionBase): @override def validate(self, actual: float) -> bool: @@ -46,6 +46,7 @@ def format_cond(self, lhs: str) -> str: return f"{lhs}<{self.target:.2f}" +@dataclass class SLALessThanOrEqualTo(SLACriterionBase): @override def validate(self, actual: float) -> bool: @@ -56,6 +57,7 @@ def format_cond(self, lhs: str) -> str: return f"{lhs}<={self.target:.2f}" +@dataclass class SLAGreaterThan(SLACriterionBase): @override def validate(self, actual: float) -> bool: @@ -66,6 +68,7 @@ def format_cond(self, lhs: str) -> str: return f"{lhs}>{self.target:.2f}" +@dataclass class SLAGreaterThanOrEqualTo(SLACriterionBase): @override def validate(self, actual: float) -> bool: From 2d856ff3a83c81b5ec00bca3157eb9e7eb2fd9b7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 03:08:09 +0000 Subject: [PATCH 45/48] Ordering Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/plot.py | 4 ++-- vllm/benchmarks/sweep/sla_sweep.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 7390440075e6..92485c09b416 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -36,7 +36,7 @@ def parse_str(cls, s: str): else: raise 
ValueError( f"Invalid operator for plot filter '{s}'. " - f"Valid operators are: {set(PLOT_FILTERS)}", + f"Valid operators are: {sorted(PLOT_FILTERS)}", ) @abstractmethod @@ -124,7 +124,7 @@ def parse_str(cls, s: str): else: raise ValueError( f"Invalid operator for plot binner '{s}'. " - f"Valid operators are: {set(PLOT_BINNERS)}", + f"Valid operators are: {sorted(PLOT_BINNERS)}", ) def apply(self, df: pd.DataFrame) -> pd.DataFrame: diff --git a/vllm/benchmarks/sweep/sla_sweep.py b/vllm/benchmarks/sweep/sla_sweep.py index ba7d78802e9d..327e3c7c5897 100644 --- a/vllm/benchmarks/sweep/sla_sweep.py +++ b/vllm/benchmarks/sweep/sla_sweep.py @@ -123,7 +123,7 @@ def from_record(cls, record: dict[str, str]): raise ValueError( f"Invalid operator for " f"SLA constraint '{metric_key}={metric_value}'. " - f"Valid operators are: {set(SLA_CRITERIA)}", + f"Valid operators are: {sorted(SLA_CRITERIA)}", ) return cls(sla_criteria) From e6d4c7294c53a686c51163516a04d7a428241227 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 13:27:58 +0000 Subject: [PATCH 46/48] Don't split Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/serve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index e99052247e71..c4d271a0e4d9 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -108,9 +108,9 @@ def _get_comb_base_path( ): parts = list[str]() if serve_comb: - parts.extend(("SERVE-", serve_comb.as_text(sep="-").split("-"))) + parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) if bench_comb: - parts.extend(("BENCH-", bench_comb.as_text(sep="-").split("-"))) + parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) return output_dir / sanitize_filename("-".join(parts)) From 46d9f19ab2ebc864d8a6ff22444214dd640170b0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 21 Oct 2025 13:28:51 +0000 Subject: [PATCH 47/48] Remove unnecessary quotes Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/sweep/utils.py b/vllm/benchmarks/sweep/utils.py index 5a9e7d932b59..49d7867eaf48 100644 --- a/vllm/benchmarks/sweep/utils.py +++ b/vllm/benchmarks/sweep/utils.py @@ -1,4 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project def sanitize_filename(filename: str) -> str: - return filename.replace("/", "_").replace("..", "__") + return filename.replace("/", "_").replace("..", "__").strip("'").strip('"') From ceabbc81fd265f867a212c788f4f9757a737c005 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 22 Oct 2025 03:29:44 +0000 Subject: [PATCH 48/48] Update with benchmark overrides as well Signed-off-by: DarkLight1337 --- vllm/benchmarks/sweep/serve.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index c4d271a0e4d9..6e408dac0b49 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -41,6 +41,19 @@ def run_server( print("[END SERVER]") +def _update_run_data( + run_data: dict[str, object], + serve_overrides: ParameterSweepItem, + bench_overrides: ParameterSweepItem, + run_number: int, +): + run_data["run_number"] = run_number + run_data.update(serve_overrides) + run_data.update(bench_overrides) + + return run_data + + def run_benchmark( server: ServerProcess | None, bench_cmd: list[str], @@ -73,7 +86,12 @@ 
def run_benchmark( with output_path.open("rb") as f: run_data = json.load(f) - return run_data + return _update_run_data( + run_data, + serve_overrides, + bench_overrides, + run_number, + ) if server is None: if not dry_run: @@ -90,8 +108,12 @@ def run_benchmark( with output_path.open("rb") as f: run_data = json.load(f) - run_data["run_number"] = run_number - run_data.update(serve_overrides) + run_data = _update_run_data( + run_data, + serve_overrides, + bench_overrides, + run_number, + ) with output_path.open("w") as f: json.dump(run_data, f, indent=4)
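
As a standalone sketch of the filter behavior after this series (FILTERS and parse_filter below are illustrative stand-ins for PLOT_FILTERS and PlotFilterBase.parse_str, trimmed to two operators):

from dataclasses import dataclass

import pandas as pd


@dataclass
class LessThanOrEqualTo:
    var: str
    target: str

    def apply(self, df: pd.DataFrame) -> pd.DataFrame:
        # Comparison filters coerce the parsed target to float.
        return df[df[self.var] <= float(self.target)]


@dataclass
class LessThan:
    var: str
    target: str

    def apply(self, df: pd.DataFrame) -> pd.DataFrame:
        return df[df[self.var] < float(self.target)]


# Longer operator keys must come first so that "<=" is matched before "<".
FILTERS = {"<=": LessThanOrEqualTo, "<": LessThan}


def parse_filter(s: str):
    for op_key, filter_cls in FILTERS.items():
        if op_key in s:
            key, _, value = s.partition(op_key)
            # Quotes around the target are stripped, mirroring parse_str.
            return filter_cls(key, value.strip("'").strip('"'))
    raise ValueError(
        f"Invalid operator for plot filter {s!r}. "
        f"Valid operators are: {sorted(FILTERS)}"
    )


df = pd.DataFrame({"max_concurrency": [8, 16, 32]})
print(parse_filter("max_concurrency<=16").apply(df))  # keeps rows 8 and 16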
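
The SLA criteria follow the same dataclass pattern; one criterion, reduced to a self-contained sketch (the real classes derive from SLACriterionBase, and the metric name used for lhs here is made up):

from dataclasses import dataclass


@dataclass
class SLALessThanOrEqualTo:
    target: float

    def validate(self, actual: float) -> bool:
        return actual <= self.target

    def format_cond(self, lhs: str) -> str:
        return f"{lhs}<={self.target:.2f}"


crit = SLALessThanOrEqualTo(target=0.5)
print(crit.format_cond("p99_e2el"))  # p99_e2el<=0.50 ("p99_e2el" is illustrative)
print(crit.validate(0.42))  # True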
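
The sanitize_filename change only strips quotes at the ends of the final name; quotes in the middle are left alone:

def sanitize_filename(filename: str) -> str:
    return filename.replace("/", "_").replace("..", "__").strip("'").strip('"')


print(sanitize_filename("'meta-llama/Llama-2-7b-chat-hf'"))
# meta-llama_Llama-2-7b-chat-hf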
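
And the net effect of _update_run_data on a run's saved JSON, with made-up values (ParameterSweepItem is assumed to behave like a mapping of parameter names to values here):

run_data = {"request_throughput": 123.4}  # as loaded from the run's JSON
serve_overrides = {"max_num_seqs": 256}
bench_overrides = {"random_input_len": 1024}

run_data["run_number"] = 0
run_data.update(serve_overrides)
run_data.update(bench_overrides)  # previously only serve overrides were merged

print(run_data)
# {'request_throughput': 123.4, 'run_number': 0,
#  'max_num_seqs': 256, 'random_input_len': 1024}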