Skip to content

Commit 3275ab2

Browse files
tedzhouhk and krishung5
authored and committed
feat: standalone profiling script for a given endpoint (#2386)
1 parent 602eccf commit 3275ab2

File tree

5 files changed

+326
-121
lines changed

5 files changed

+326
-121
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Standalone CLI to profile a running endpoint's prefill or decode performance.

Delegates the actual measurement to ``profile_prefill`` / ``profile_decode``
and writes all results under ``--work_dir`` (created if missing).
"""

import argparse
import logging
import os

from utils.profile_prefill import profile_prefill

# NOTE(review): import roots are inconsistent — one helper is imported as
# `utils.profile_prefill`, the other as `benchmarks.profiler.utils.profile_decode`.
# Confirm which package root is on sys.path and unify the two imports.
from benchmarks.profiler.utils.profile_decode import profile_decode

# Configure a dedicated console handler so the script produces timestamped
# INFO logs even when no root-logger configuration exists.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


def _build_parser() -> argparse.ArgumentParser:
    """Build and return the CLI argument parser for this script."""
    parser = argparse.ArgumentParser(
        description="profile a given endpoint's performance for prefill or decode"
    )
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=["prefill", "decode"],
        help="mode to profile",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        required=True,
        help="model name",
    )
    parser.add_argument(
        "--url",
        type=str,
        required=True,
        help="base url of the endpoint",
    )
    parser.add_argument(
        "--num_gpus",
        type=int,
        required=True,
        help="number of gpus",
    )
    parser.add_argument(
        "--max_kv_tokens",
        type=int,
        required=False,
        default=0,
        help="max kv tokens of the endpoint (only used for decode)",
    )
    parser.add_argument(
        "--work_dir",
        type=str,
        default="endpoint_profiling_results/",
        help="work directory to save the results",
    )
    parser.add_argument(
        "--max_context_length",
        type=int,
        default=16384,
        help="max context length of the endpoint",
    )
    parser.add_argument(
        "--interpolation_granularity",
        type=int,
        default=8,
        help="interpolation granularity for the results",
    )
    return parser


def main() -> None:
    """Parse CLI arguments and dispatch to the selected profiling routine."""
    parser = _build_parser()
    args = parser.parse_args()

    os.makedirs(args.work_dir, exist_ok=True)
    if args.mode == "prefill":
        profile_prefill(
            args.work_dir,
            args.model_name,
            args.url,
            args.num_gpus,
            args.max_context_length,
            args.interpolation_granularity,
        )
    elif args.mode == "decode":
        # Validate via parser.error (usage message + exit code 2) instead of
        # `assert`, which is silently stripped when Python runs with -O.
        if args.max_kv_tokens <= 0:
            parser.error("--max_kv_tokens must be provided (> 0) for decode")
        profile_decode(
            args.work_dir,
            args.model_name,
            args.url,
            args.num_gpus,
            args.max_kv_tokens,
            args.max_context_length,
            args.interpolation_granularity,
        )
    else:  # defensive: unreachable, argparse `choices` already restricts --mode
        raise ValueError(f"Invalid mode: {args.mode}")


if __name__ == "__main__":
    main()

benchmarks/profiler/profile_sla.py

Lines changed: 22 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,16 @@
2828
cleanup_remaining_deployments,
2929
)
3030
from utils.genai_perf import benchmark_decode, benchmark_prefill
31-
from utils.plot import (
32-
plot_decode_3d_surface,
33-
plot_decode_performance,
34-
plot_prefill_interpolation,
35-
plot_prefill_performance,
36-
)
31+
from utils.plot import plot_decode_performance, plot_prefill_performance
3732
from utils.profile_cache import (
3833
check_decode_results_exist,
3934
check_prefill_results_exist,
4035
load_existing_decode_results,
4136
load_existing_prefill_results,
4237
)
38+
from utils.profile_prefill import profile_prefill
39+
40+
from benchmarks.profiler.utils.profile_decode import profile_decode
4341

4442
logger = logging.getLogger(__name__)
4543
logger.setLevel(logging.INFO)
@@ -373,9 +371,6 @@ async def run_profile(args):
373371

374372
# interpolate ISL - TTFT with best prefill TP
375373
best_prefill_tp = prefill_tp_size[selected_prefill_idx]
376-
prefill_isl = []
377-
prefill_ttft = []
378-
prefill_thpt_per_gpu = []
379374
logger.info(
380375
f"Profiling prefill under best TP {best_prefill_tp} with different ISL..."
381376
)
@@ -420,58 +415,22 @@ async def run_profile(args):
420415
)
421416

422417
base_url = client.get_service_url()
423-
for isl in range(
424-
100,
418+
419+
profile_prefill(
420+
work_dir,
421+
model_name,
422+
base_url,
423+
best_prefill_tp,
425424
args.max_context_length,
426-
(args.max_context_length - 100) // args.prefill_interpolation_granularity,
427-
):
428-
# run genai-perf
429-
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
430-
gap_result = benchmark_prefill(
431-
isl, genai_perf_artifact_dir, model_name, base_url=base_url
432-
)
433-
if gap_result is not None:
434-
ttft = gap_result["time_to_first_token"]["avg"]
435-
prefill_isl.append(isl)
436-
prefill_ttft.append(ttft)
437-
prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000)
425+
args.prefill_interpolation_granularity,
426+
)
438427

439428
print("Cleaning up deployment...")
440429
await client.delete_deployment()
441430
deployment_clients.remove(client)
442431
print("Deployment deleted")
443432

444-
# Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c)
445-
if len(prefill_isl) > 2:
446-
logger.info("Interpolating prefill TTFT and throughput vs ISL...")
447-
448-
# Convert to numpy arrays for easier manipulation
449-
prefill_isl_np = np.array(prefill_isl)
450-
prefill_ttft_np = np.array(prefill_ttft)
451-
prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu)
452-
453-
save_path = f"{work_dir}/raw_data.npz"
454-
np.savez(
455-
save_path,
456-
prefill_isl=prefill_isl_np,
457-
prefill_ttft=prefill_ttft_np,
458-
prefill_thpt_per_gpu=prefill_thpt_per_gpu_np,
459-
)
460-
461-
# Call the plotting function
462-
plot_prefill_interpolation(
463-
prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir
464-
)
465-
else:
466-
logger.warning(
467-
"Not enough data points to perform interpolation (need at least 3 points)"
468-
)
469-
470433
# interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode TP
471-
x_kv_usage = []
472-
y_context_length = []
473-
z_itl = []
474-
z_thpt_per_gpu = []
475434
best_decode_tp = decode_tp_size[selected_decode_idx]
476435
logger.info(f"Profiling decode with TP size {best_decode_tp}...")
477436
decode_config = config_modifier.set_config_tp_size(
@@ -508,64 +467,23 @@ async def run_profile(args):
508467
f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log"
509468
)
510469

511-
osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement
512470
base_url = client.get_service_url()
513-
for isl in range(
514-
100,
515-
args.max_context_length - osl,
516-
(args.max_context_length - osl) // args.decode_interpolation_granularity,
517-
):
518-
max_concurrency = max_kv_tokens // (isl + osl)
519-
sweep_num_request = list(
520-
range(
521-
1,
522-
max_concurrency,
523-
max_concurrency // args.decode_interpolation_granularity,
524-
)
525-
)
526-
for num_request in sweep_num_request:
527-
genai_perf_artifact_dir = (
528-
f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
529-
)
530-
gap_result = benchmark_decode(
531-
isl,
532-
osl,
533-
num_request,
534-
genai_perf_artifact_dir,
535-
model_name,
536-
base_url=base_url,
537-
)
538-
if gap_result is not None:
539-
itl = gap_result["inter_token_latency"]["avg"]
540-
x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens)
541-
y_context_length.append(isl + osl / 2)
542-
z_itl.append(itl)
543-
z_thpt_per_gpu.append(
544-
gap_result["output_token_throughput"]["avg"] / best_decode_tp
545-
)
471+
472+
profile_decode(
473+
work_dir,
474+
model_name,
475+
base_url,
476+
best_decode_tp,
477+
max_kv_tokens,
478+
args.max_context_length,
479+
args.decode_interpolation_granularity,
480+
)
546481

547482
print("Cleaning up deployment...")
548483
await client.delete_deployment()
549484
deployment_clients.remove(client)
550485
print("Deployment deleted")
551486

552-
# Save the data points to a .npz file
553-
save_path = f"{work_dir}/raw_data.npz"
554-
np.savez(
555-
save_path,
556-
x_kv_usage=np.array(x_kv_usage),
557-
y_context_length=np.array(y_context_length),
558-
z_itl=np.array(z_itl),
559-
z_thpt_per_gpu=np.array(z_thpt_per_gpu),
560-
max_kv_tokens=np.array([max_kv_tokens]),
561-
)
562-
logger.info(f"Saved data points to {save_path}")
563-
564-
# Plot 3D surface
565-
plot_decode_3d_surface(
566-
x_kv_usage, y_context_length, z_itl, best_decode_tp, work_dir
567-
)
568-
569487
except Exception as e:
570488
logger.error(f"Profile job failed with error: {e}")
571489
raise

0 commit comments

Comments
 (0)