Skip to content

Commit 3275ab2

Browse files
tedzhouhk and krishung5
authored and committed
feat: standalone profiling script for a given endpoint (#2386)
1 parent 602eccf commit 3275ab2

File tree

5 files changed

+326
-121
lines changed

5 files changed

+326
-121
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Standalone CLI to profile a running endpoint's prefill or decode performance.

Delegates the actual measurement to ``profile_prefill`` / ``profile_decode``
and writes all results under ``--work_dir`` (created if missing).
"""

import argparse
import logging
import os

from utils.profile_prefill import profile_prefill

# NOTE(review): import roots are inconsistent — one helper is imported as
# `utils.profile_prefill`, the other as `benchmarks.profiler.utils.profile_decode`.
# Confirm which package root is on sys.path and unify the two imports.
from benchmarks.profiler.utils.profile_decode import profile_decode

# Configure a dedicated console handler so the script produces timestamped
# INFO logs even when no root-logger configuration exists.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


def _build_parser() -> argparse.ArgumentParser:
    """Build and return the CLI argument parser for this script."""
    parser = argparse.ArgumentParser(
        description="profile a given endpoint's performance for prefill or decode"
    )
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=["prefill", "decode"],
        help="mode to profile",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        required=True,
        help="model name",
    )
    parser.add_argument(
        "--url",
        type=str,
        required=True,
        help="base url of the endpoint",
    )
    parser.add_argument(
        "--num_gpus",
        type=int,
        required=True,
        help="number of gpus",
    )
    parser.add_argument(
        "--max_kv_tokens",
        type=int,
        required=False,
        default=0,
        help="max kv tokens of the endpoint (only used for decode)",
    )
    parser.add_argument(
        "--work_dir",
        type=str,
        default="endpoint_profiling_results/",
        help="work directory to save the results",
    )
    parser.add_argument(
        "--max_context_length",
        type=int,
        default=16384,
        help="max context length of the endpoint",
    )
    parser.add_argument(
        "--interpolation_granularity",
        type=int,
        default=8,
        help="interpolation granularity for the results",
    )
    return parser


def main() -> None:
    """Parse CLI arguments and dispatch to the selected profiling routine."""
    parser = _build_parser()
    args = parser.parse_args()

    os.makedirs(args.work_dir, exist_ok=True)
    if args.mode == "prefill":
        profile_prefill(
            args.work_dir,
            args.model_name,
            args.url,
            args.num_gpus,
            args.max_context_length,
            args.interpolation_granularity,
        )
    elif args.mode == "decode":
        # Validate via parser.error (usage message + exit code 2) instead of
        # `assert`, which is silently stripped when Python runs with -O.
        if args.max_kv_tokens <= 0:
            parser.error("--max_kv_tokens must be provided (> 0) for decode")
        profile_decode(
            args.work_dir,
            args.model_name,
            args.url,
            args.num_gpus,
            args.max_kv_tokens,
            args.max_context_length,
            args.interpolation_granularity,
        )
    else:  # defensive: unreachable, argparse `choices` already restricts --mode
        raise ValueError(f"Invalid mode: {args.mode}")


if __name__ == "__main__":
    main()

benchmarks/profiler/profile_sla.py

Lines changed: 22 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,16 @@
2828
cleanup_remaining_deployments,
2929
)
3030
from utils.genai_perf import benchmark_decode, benchmark_prefill
31-
from utils.plot import (
32-
plot_decode_3d_surface,
33-
plot_decode_performance,
34-
plot_prefill_interpolation,
35-
plot_prefill_performance,
36-
)
31+
from utils.plot import plot_decode_performance, plot_prefill_performance
3732
from utils.profile_cache import (
3833
check_decode_results_exist,
3934
check_prefill_results_exist,
4035
load_existing_decode_results,
4136
load_existing_prefill_results,
4237
)
38+
from utils.profile_prefill import profile_prefill
39+
40+
from benchmarks.profiler.utils.profile_decode import profile_decode
4341

4442
logger = logging.getLogger(__name__)
4543
logger.setLevel(logging.INFO)
@@ -373,9 +371,6 @@ async def run_profile(args):
373371

374372
# interpolate ISL - TTFT with best prefill TP
375373
best_prefill_tp = prefill_tp_size[selected_prefill_idx]
376-
prefill_isl = []
377-
prefill_ttft = []
378-
prefill_thpt_per_gpu = []
379374
logger.info(
380375
f"Profiling prefill under best TP {best_prefill_tp} with different ISL..."
381376
)
@@ -420,58 +415,22 @@ async def run_profile(args):
420415
)
421416

422417
base_url = client.get_service_url()
423-
for isl in range(
424-
100,
418+
419+
profile_prefill(
420+
work_dir,
421+
model_name,
422+
base_url,
423+
best_prefill_tp,
425424
args.max_context_length,
426-
(args.max_context_length - 100) // args.prefill_interpolation_granularity,
427-
):
428-
# run genai-perf
429-
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
430-
gap_result = benchmark_prefill(
431-
isl, genai_perf_artifact_dir, model_name, base_url=base_url
432-
)
433-
if gap_result is not None:
434-
ttft = gap_result["time_to_first_token"]["avg"]
435-
prefill_isl.append(isl)
436-
prefill_ttft.append(ttft)
437-
prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000)
425+
args.prefill_interpolation_granularity,
426+
)
438427

439428
print("Cleaning up deployment...")
440429
await client.delete_deployment()
441430
deployment_clients.remove(client)
442431
print("Deployment deleted")
443432

444-
# Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c)
445-
if len(prefill_isl) > 2:
446-
logger.info("Interpolating prefill TTFT and throughput vs ISL...")
447-
448-
# Convert to numpy arrays for easier manipulation
449-
prefill_isl_np = np.array(prefill_isl)
450-
prefill_ttft_np = np.array(prefill_ttft)
451-
prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu)
452-
453-
save_path = f"{work_dir}/raw_data.npz"
454-
np.savez(
455-
save_path,
456-
prefill_isl=prefill_isl_np,
457-
prefill_ttft=prefill_ttft_np,
458-
prefill_thpt_per_gpu=prefill_thpt_per_gpu_np,
459-
)
460-
461-
# Call the plotting function
462-
plot_prefill_interpolation(
463-
prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir
464-
)
465-
else:
466-
logger.warning(
467-
"Not enough data points to perform interpolation (need at least 3 points)"
468-
)
469-
470433
# interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode TP
471-
x_kv_usage = []
472-
y_context_length = []
473-
z_itl = []
474-
z_thpt_per_gpu = []
475434
best_decode_tp = decode_tp_size[selected_decode_idx]
476435
logger.info(f"Profiling decode with TP size {best_decode_tp}...")
477436
decode_config = config_modifier.set_config_tp_size(
@@ -508,64 +467,23 @@ async def run_profile(args):
508467
f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log"
509468
)
510469

511-
osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement
512470
base_url = client.get_service_url()
513-
for isl in range(
514-
100,
515-
args.max_context_length - osl,
516-
(args.max_context_length - osl) // args.decode_interpolation_granularity,
517-
):
518-
max_concurrency = max_kv_tokens // (isl + osl)
519-
sweep_num_request = list(
520-
range(
521-
1,
522-
max_concurrency,
523-
max_concurrency // args.decode_interpolation_granularity,
524-
)
525-
)
526-
for num_request in sweep_num_request:
527-
genai_perf_artifact_dir = (
528-
f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
529-
)
530-
gap_result = benchmark_decode(
531-
isl,
532-
osl,
533-
num_request,
534-
genai_perf_artifact_dir,
535-
model_name,
536-
base_url=base_url,
537-
)
538-
if gap_result is not None:
539-
itl = gap_result["inter_token_latency"]["avg"]
540-
x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens)
541-
y_context_length.append(isl + osl / 2)
542-
z_itl.append(itl)
543-
z_thpt_per_gpu.append(
544-
gap_result["output_token_throughput"]["avg"] / best_decode_tp
545-
)
471+
472+
profile_decode(
473+
work_dir,
474+
model_name,
475+
base_url,
476+
best_decode_tp,
477+
max_kv_tokens,
478+
args.max_context_length,
479+
args.decode_interpolation_granularity,
480+
)
546481

547482
print("Cleaning up deployment...")
548483
await client.delete_deployment()
549484
deployment_clients.remove(client)
550485
print("Deployment deleted")
551486

552-
# Save the data points to a .npz file
553-
save_path = f"{work_dir}/raw_data.npz"
554-
np.savez(
555-
save_path,
556-
x_kv_usage=np.array(x_kv_usage),
557-
y_context_length=np.array(y_context_length),
558-
z_itl=np.array(z_itl),
559-
z_thpt_per_gpu=np.array(z_thpt_per_gpu),
560-
max_kv_tokens=np.array([max_kv_tokens]),
561-
)
562-
logger.info(f"Saved data points to {save_path}")
563-
564-
# Plot 3D surface
565-
plot_decode_3d_surface(
566-
x_kv_usage, y_context_length, z_itl, best_decode_tp, work_dir
567-
)
568-
569487
except Exception as e:
570488
logger.error(f"Profile job failed with error: {e}")
571489
raise

0 commit comments

Comments
 (0)