pc

tedzhouhk · tedzhouhk · commit 80c8d0a5dd62 · 2025-08-08T19:54:15.000-07:00
diff --git a/benchmarks/profiler/profile_endpoint.py b/benchmarks/profiler/profile_endpoint.py
@@ -4,8 +4,8 @@
 import argparse
 import logging
 
-from utils.profile_prefill import profile_prefill
 from utils.profile_deocde import profile_decode
+from utils.profile_prefill import profile_prefill
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -72,11 +72,26 @@
         help="interpolation granularity for the results",
     )
     args = parser.parse_args()
-    
+
     if args.mode == "prefill":
-        profile_prefill(args.work_dir, args.model_name, args.url, args.num_gpus, args.max_context_length, args.interpolation_granularity)
+        profile_prefill(
+            args.work_dir,
+            args.model_name,
+            args.url,
+            args.num_gpus,
+            args.max_context_length,
+            args.interpolation_granularity,
+        )
     elif args.mode == "decode":
         assert args.max_kv_tokens > 0, "max_kv_tokens must be provided for decode"
-        profile_decode(args.work_dir, args.model_name, args.url, args.num_gpus, args.max_kv_tokens, args.max_context_length, args.interpolation_granularity)
+        profile_decode(
+            args.work_dir,
+            args.model_name,
+            args.url,
+            args.num_gpus,
+            args.max_kv_tokens,
+            args.max_context_length,
+            args.interpolation_granularity,
+        )
     else:
-        raise ValueError(f"Invalid mode: {args.mode}")
+        raise ValueError(f"Invalid mode: {args.mode}")
diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py
@@ -28,18 +28,15 @@
     cleanup_remaining_deployments,
 )
 from utils.genai_perf import benchmark_decode, benchmark_prefill
-from utils.plot import (
-    plot_decode_performance,
-    plot_prefill_performance,
-)
+from utils.plot import plot_decode_performance, plot_prefill_performance
 from utils.profile_cache import (
     check_decode_results_exist,
     check_prefill_results_exist,
     load_existing_decode_results,
     load_existing_prefill_results,
 )
-from utils.profile_prefill import profile_prefill
 from utils.profile_deocde import profile_decode
+from utils.profile_prefill import profile_prefill
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -419,8 +416,12 @@ async def run_profile(args):
         base_url = client.get_service_url()
 
         profile_prefill(
-            work_dir, model_name, base_url, best_prefill_tp, 
-            args.max_context_length, args.prefill_interpolation_granularity,
+            work_dir,
+            model_name,
+            base_url,
+            best_prefill_tp,
+            args.max_context_length,
+            args.prefill_interpolation_granularity,
         )
 
         print("Cleaning up deployment...")
@@ -468,8 +469,13 @@ async def run_profile(args):
         base_url = client.get_service_url()
 
         profile_decode(
-            work_dir, model_name, base_url, best_decode_tp, max_kv_tokens,
-            args.max_context_length, args.decode_interpolation_granularity
+            work_dir,
+            model_name,
+            base_url,
+            best_decode_tp,
+            max_kv_tokens,
+            args.max_context_length,
+            args.decode_interpolation_granularity,
         )
 
         print("Cleaning up deployment...")
diff --git a/benchmarks/profiler/utils/plot.py b/benchmarks/profiler/utils/plot.py
@@ -160,7 +160,9 @@ def plot_prefill_interpolation(
     plt.close()
 
 
-def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir):
+def plot_decode_3d_surface(
+    x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir
+):
     """
     Plot 3D surface for decode interpolation with KV usage, context length, and ITL.
 
@@ -175,7 +177,9 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu,
     yi = np.linspace(min(y_context_length), max(y_context_length), 100)
     X, Y = np.meshgrid(xi, yi)
     Z_itl = griddata((x_kv_usage, y_context_length), z_itl, (X, Y), method="cubic")
-    Z_thpt = griddata((x_kv_usage, y_context_length), z_thpt_per_gpu, (X, Y), method="cubic")
+    Z_thpt = griddata(
+        (x_kv_usage, y_context_length), z_thpt_per_gpu, (X, Y), method="cubic"
+    )
 
     # Plot ITL surface
     fig = plt.figure(figsize=(12, 10))
diff --git a/benchmarks/profiler/utils/profile_deocde.py b/benchmarks/profiler/utils/profile_deocde.py
@@ -1,11 +1,11 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import numpy as np
 import logging
 
-from utils.plot import plot_decode_3d_surface
+import numpy as np
 from utils.genai_perf import benchmark_decode
+from utils.plot import plot_decode_3d_surface
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -18,15 +18,23 @@
 logger.addHandler(console_handler)
 
 
-def profile_decode(work_dir, model_name, url, num_gpus, max_kv_tokens, max_context_length, interpolation_granularity):
+def profile_decode(
+    work_dir,
+    model_name,
+    url,
+    num_gpus,
+    max_kv_tokens,
+    max_context_length,
+    interpolation_granularity,
+):
     """interpolate ITL - Active_KV_Cache - Decode_Context_Length"""
     x_kv_usage = []
     y_context_length = []
     z_itl = []
     z_thpt_per_gpu = []
-    
+
     osl = 500  # not too large to reduce ITL variance, not too small to have stable measurement
-    
+
     for isl in range(
         100,
         max_context_length - osl,
@@ -41,9 +49,7 @@ def profile_decode(work_dir, model_name, url, num_gpus, max_kv_tokens, max_conte
             )
         )
         for num_request in sweep_num_request:
-            genai_perf_artifact_dir = (
-                f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
-            )
+            genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
             gap_result = benchmark_decode(
                 isl,
                 osl,
diff --git a/benchmarks/profiler/utils/profile_prefill.py b/benchmarks/profiler/utils/profile_prefill.py
@@ -1,11 +1,11 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import numpy as np
 import logging
 
-from utils.plot import plot_prefill_interpolation
+import numpy as np
 from utils.genai_perf import benchmark_prefill
+from utils.plot import plot_prefill_interpolation
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -18,7 +18,9 @@
 logger.addHandler(console_handler)
 
 
-def profile_prefill(work_dir, model_name, url, num_gpus, max_context_length, interpolation_granularity):
+def profile_prefill(
+    work_dir, model_name, url, num_gpus, max_context_length, interpolation_granularity
+):
     prefill_isl = []
     prefill_ttft = []
     prefill_thpt_per_gpu = []