Skip to content

Commit 80c8d0a

Browse files
committed
pc
1 parent 5a2626b commit 80c8d0a

File tree

5 files changed

+60
-27
lines changed

5 files changed

+60
-27
lines changed

benchmarks/profiler/profile_endpoint.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
import argparse
55
import logging
66

7-
from utils.profile_prefill import profile_prefill
87
from utils.profile_deocde import profile_decode
8+
from utils.profile_prefill import profile_prefill
99

1010
logger = logging.getLogger(__name__)
1111
logger.setLevel(logging.INFO)
@@ -72,11 +72,26 @@
7272
help="interpolation granularity for the results",
7373
)
7474
args = parser.parse_args()
75-
75+
7676
if args.mode == "prefill":
77-
profile_prefill(args.work_dir, args.model_name, args.url, args.num_gpus, args.max_context_length, args.interpolation_granularity)
77+
profile_prefill(
78+
args.work_dir,
79+
args.model_name,
80+
args.url,
81+
args.num_gpus,
82+
args.max_context_length,
83+
args.interpolation_granularity,
84+
)
7885
elif args.mode == "decode":
7986
assert args.max_kv_tokens > 0, "max_kv_tokens must be provided for decode"
80-
profile_decode(args.work_dir, args.model_name, args.url, args.num_gpus, args.max_kv_tokens, args.max_context_length, args.interpolation_granularity)
87+
profile_decode(
88+
args.work_dir,
89+
args.model_name,
90+
args.url,
91+
args.num_gpus,
92+
args.max_kv_tokens,
93+
args.max_context_length,
94+
args.interpolation_granularity,
95+
)
8196
else:
82-
raise ValueError(f"Invalid mode: {args.mode}")
97+
raise ValueError(f"Invalid mode: {args.mode}")

benchmarks/profiler/profile_sla.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,15 @@
2828
cleanup_remaining_deployments,
2929
)
3030
from utils.genai_perf import benchmark_decode, benchmark_prefill
31-
from utils.plot import (
32-
plot_decode_performance,
33-
plot_prefill_performance,
34-
)
31+
from utils.plot import plot_decode_performance, plot_prefill_performance
3532
from utils.profile_cache import (
3633
check_decode_results_exist,
3734
check_prefill_results_exist,
3835
load_existing_decode_results,
3936
load_existing_prefill_results,
4037
)
41-
from utils.profile_prefill import profile_prefill
4238
from utils.profile_deocde import profile_decode
39+
from utils.profile_prefill import profile_prefill
4340

4441
logger = logging.getLogger(__name__)
4542
logger.setLevel(logging.INFO)
@@ -419,8 +416,12 @@ async def run_profile(args):
419416
base_url = client.get_service_url()
420417

421418
profile_prefill(
422-
work_dir, model_name, base_url, best_prefill_tp,
423-
args.max_context_length, args.prefill_interpolation_granularity,
419+
work_dir,
420+
model_name,
421+
base_url,
422+
best_prefill_tp,
423+
args.max_context_length,
424+
args.prefill_interpolation_granularity,
424425
)
425426

426427
print("Cleaning up deployment...")
@@ -468,8 +469,13 @@ async def run_profile(args):
468469
base_url = client.get_service_url()
469470

470471
profile_decode(
471-
work_dir, model_name, base_url, best_decode_tp, max_kv_tokens,
472-
args.max_context_length, args.decode_interpolation_granularity
472+
work_dir,
473+
model_name,
474+
base_url,
475+
best_decode_tp,
476+
max_kv_tokens,
477+
args.max_context_length,
478+
args.decode_interpolation_granularity,
473479
)
474480

475481
print("Cleaning up deployment...")

benchmarks/profiler/utils/plot.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,9 @@ def plot_prefill_interpolation(
160160
plt.close()
161161

162162

163-
def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir):
163+
def plot_decode_3d_surface(
164+
x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir
165+
):
164166
"""
165167
Plot 3D surface for decode interpolation with KV usage, context length, and ITL.
166168
@@ -175,7 +177,9 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu,
175177
yi = np.linspace(min(y_context_length), max(y_context_length), 100)
176178
X, Y = np.meshgrid(xi, yi)
177179
Z_itl = griddata((x_kv_usage, y_context_length), z_itl, (X, Y), method="cubic")
178-
Z_thpt = griddata((x_kv_usage, y_context_length), z_thpt_per_gpu, (X, Y), method="cubic")
180+
Z_thpt = griddata(
181+
(x_kv_usage, y_context_length), z_thpt_per_gpu, (X, Y), method="cubic"
182+
)
179183

180184
# Plot ITL surface
181185
fig = plt.figure(figsize=(12, 10))

benchmarks/profiler/utils/profile_deocde.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4-
import numpy as np
54
import logging
65

7-
from utils.plot import plot_decode_3d_surface
6+
import numpy as np
87
from utils.genai_perf import benchmark_decode
8+
from utils.plot import plot_decode_3d_surface
99

1010
logger = logging.getLogger(__name__)
1111
logger.setLevel(logging.INFO)
@@ -18,15 +18,23 @@
1818
logger.addHandler(console_handler)
1919

2020

21-
def profile_decode(work_dir, model_name, url, num_gpus, max_kv_tokens, max_context_length, interpolation_granularity):
21+
def profile_decode(
22+
work_dir,
23+
model_name,
24+
url,
25+
num_gpus,
26+
max_kv_tokens,
27+
max_context_length,
28+
interpolation_granularity,
29+
):
2230
"""interpolate ITL - Active_KV_Cache - Decode_Context_Length"""
2331
x_kv_usage = []
2432
y_context_length = []
2533
z_itl = []
2634
z_thpt_per_gpu = []
27-
35+
2836
osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement
29-
37+
3038
for isl in range(
3139
100,
3240
max_context_length - osl,
@@ -41,9 +49,7 @@ def profile_decode(work_dir, model_name, url, num_gpus, max_kv_tokens, max_conte
4149
)
4250
)
4351
for num_request in sweep_num_request:
44-
genai_perf_artifact_dir = (
45-
f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
46-
)
52+
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
4753
gap_result = benchmark_decode(
4854
isl,
4955
osl,

benchmarks/profiler/utils/profile_prefill.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4-
import numpy as np
54
import logging
65

7-
from utils.plot import plot_prefill_interpolation
6+
import numpy as np
87
from utils.genai_perf import benchmark_prefill
8+
from utils.plot import plot_prefill_interpolation
99

1010
logger = logging.getLogger(__name__)
1111
logger.setLevel(logging.INFO)
@@ -18,7 +18,9 @@
1818
logger.addHandler(console_handler)
1919

2020

21-
def profile_prefill(work_dir, model_name, url, num_gpus, max_context_length, interpolation_granularity):
21+
def profile_prefill(
22+
work_dir, model_name, url, num_gpus, max_context_length, interpolation_granularity
23+
):
2224
prefill_isl = []
2325
prefill_ttft = []
2426
prefill_thpt_per_gpu = []

0 commit comments

Comments
 (0)