@@ -9,7 +9,7 @@
     ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>

 On the client side, run:
-    python benchmarks/benchmark_serving.py \
+    python benchmarks/benchmark_serving_guided.py \
         --backend <backend> \
         --model <your_model> \
         --dataset json \
@@ -31,7 +31,7 @@
 import time
 import warnings
 from dataclasses import dataclass
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import AsyncGenerator, Dict, List, Optional, Tuple

 import datasets
 import numpy as np
@@ -264,6 +264,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: List[str],
     selected_percentiles: List[float],
+    goodput_config_dict: Optional[Dict[str, float]] = None,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     actual_output_lens: List[int] = []
     total_input = 0
@@ -287,10 +288,10 @@ def calculate_metrics(
             total_input += input_requests[i].prompt_len
             tpot = 0
             if output_len > 1:
-                tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
-                                                                 1)
+                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
+                tpot = latency_minus_ttft / (output_len - 1)
             tpots.append(tpot)
-            outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
+            outputs[i].tpot = tpot
             # Note: if output_len <= 1, we regard tpot as 0 for goodput
             all_tpots.append(tpot)
             itls += outputs[i].itl
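With this change, outputs[i].tpot becomes a purely per-request figure instead of the running mean over all requests seen so far. As a worked example (the numbers are illustrative, not from the benchmark): a request with 2.0 s total latency, a 0.5 s TTFT, and 31 output tokens gives tpot = (2.0 - 0.5) / (31 - 1) = 0.05 s per output token, and that per-request value is what gets stored.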
@@ -300,6 +301,28 @@ def calculate_metrics(
         else:
             actual_output_lens.append(0)

+    if goodput_config_dict:
+        valid_metrics = []
+        slo_values = []
+
+        if "ttft" in goodput_config_dict:
+            valid_metrics.append(ttfts)
+            slo_values.append(goodput_config_dict["ttft"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "tpot" in goodput_config_dict:
+            valid_metrics.append(all_tpots)
+            slo_values.append(goodput_config_dict["tpot"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "e2el" in goodput_config_dict:
+            valid_metrics.append(e2els)
+            slo_values.append(goodput_config_dict["e2el"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+
+        for req_metric in zip(*valid_metrics):
+            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
+            if is_good_req:
+                good_completed += 1
+
     if completed == 0:
         warnings.warn(
             "All requests failed. This is likely due to a misconfiguration "
@@ -356,6 +379,7 @@ async def benchmark(
     max_concurrency: Optional[int],
     guided_decoding_ratio: float,
     guided_decoding_backend: str,
+    goodput_config_dict: Optional[Dict[str, float]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -483,6 +507,7 @@ async def limited_request_func(request_func_input, pbar):
         tokenizer=tokenizer,
         selected_percentile_metrics=selected_percentile_metrics,
         selected_percentiles=selected_percentiles,
+        goodput_config_dict=goodput_config_dict,
     )

     print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@@ -494,6 +519,9 @@ async def limited_request_func(request_func_input, pbar):
                                  metrics.total_output))
     print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
                                     metrics.request_throughput))
+    if goodput_config_dict:
+        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
+                                        metrics.request_goodput))
     print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
                                     metrics.output_throughput))
     print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
@@ -617,6 +645,40 @@ def _eval_correctness(expected, actual):
             100) if len(not_none_scores) > 0 else None


+def parse_goodput(slo_pairs):
+    goodput_config_dict = {}
+    try:
+        for slo_pair in slo_pairs:
+            slo_name, slo_val = slo_pair.split(":")
+            goodput_config_dict[slo_name] = float(slo_val)
+    except ValueError as err:
+        raise argparse.ArgumentTypeError(
+            "Invalid format found for service level objectives. "
+            "Specify service level objectives for goodput as \"KEY:VALUE\" "
+            "pairs, where the key is a metric name, and the value is a "
+            "number in milliseconds.") from err
+    return goodput_config_dict
+
+
+def check_goodput_args(args):
+    goodput_config_dict = {}
+    VALID_NAMES = ["ttft", "tpot", "e2el"]
+    if args.goodput:
+        goodput_config_dict = parse_goodput(args.goodput)
+        for slo_name, slo_val in goodput_config_dict.items():
+            if slo_name not in VALID_NAMES:
+                raise ValueError(
+                    f"Invalid metric name found, {slo_name}: {slo_val}. "
+                    "The service level objective name should be one of "
+                    f"{str(VALID_NAMES)}. ")
+            if slo_val < 0:
+                raise ValueError(
+                    f"Invalid value found, {slo_name}: {slo_val}. "
+                    "The service level objective value should be "
+                    "non-negative.")
+    return goodput_config_dict
+
+
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
@@ -661,6 +723,8 @@ def main(args: argparse.Namespace):

     input_requests = sample_requests(tokenizer, args)

+    goodput_config_dict = check_goodput_args(args)
+
     benchmark_result, ret = asyncio.run(
         benchmark(
             backend=backend,
@@ -681,6 +745,7 @@ def main(args: argparse.Namespace):
             max_concurrency=args.max_concurrency,
             guided_decoding_ratio=args.guided_decoding_ratio,
             guided_decoding_backend=args.guided_decoding_backend,
+            goodput_config_dict=goodput_config_dict,
         ))

     # Save config and results to json
@@ -865,6 +930,18 @@ def main(args: argparse.Namespace):
         "Default value is \"99\". "
         "Use \"--percentile-metrics\" to select metrics.",
     )
+    parser.add_argument(
+        "--goodput",
+        nargs="+",
+        required=False,
+        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+        "pairs, where the key is a metric name, and the value is in "
+        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+        "separated by spaces. Allowed request level metric names are "
+        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
+
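As a hypothetical invocation, appending --goodput ttft:200 tpot:50 e2el:2000 to the benchmark_serving_guided.py command shown at the top of this diff would count a request toward goodput only if its TTFT stays within 200 ms, its TPOT within 50 ms per token, and its end-to-end latency (e2el) within 2 s; the thresholds here are illustrative, not recommended values.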
     parser.add_argument("--no-guided-decoding",
                         action='store_true',
                         default=False,