|
28 | 28 | cleanup_remaining_deployments, |
29 | 29 | ) |
30 | 30 | from utils.genai_perf import benchmark_decode, benchmark_prefill |
31 | | -from utils.plot import ( |
32 | | - plot_decode_3d_surface, |
33 | | - plot_decode_performance, |
34 | | - plot_prefill_interpolation, |
35 | | - plot_prefill_performance, |
36 | | -) |
| 31 | +from utils.plot import plot_decode_performance, plot_prefill_performance |
37 | 32 | from utils.profile_cache import ( |
38 | 33 | check_decode_results_exist, |
39 | 34 | check_prefill_results_exist, |
40 | 35 | load_existing_decode_results, |
41 | 36 | load_existing_prefill_results, |
42 | 37 | ) |
| 38 | +from utils.profile_prefill import profile_prefill |
| 39 | + |
| 40 | +from benchmarks.profiler.utils.profile_decode import profile_decode |
43 | 41 |
|
44 | 42 | logger = logging.getLogger(__name__) |
45 | 43 | logger.setLevel(logging.INFO) |
@@ -373,9 +371,6 @@ async def run_profile(args): |
373 | 371 |
|
374 | 372 | # interpolate ISL - TTFT with best prefill TP |
375 | 373 | best_prefill_tp = prefill_tp_size[selected_prefill_idx] |
376 | | - prefill_isl = [] |
377 | | - prefill_ttft = [] |
378 | | - prefill_thpt_per_gpu = [] |
379 | 374 | logger.info( |
380 | 375 | f"Profiling prefill under best TP {best_prefill_tp} with different ISL..." |
381 | 376 | ) |
@@ -420,58 +415,22 @@ async def run_profile(args): |
420 | 415 | ) |
421 | 416 |
|
422 | 417 | base_url = client.get_service_url() |
423 | | - for isl in range( |
424 | | - 100, |
| 418 | + |
| 419 | + profile_prefill( |
| 420 | + work_dir, |
| 421 | + model_name, |
| 422 | + base_url, |
| 423 | + best_prefill_tp, |
425 | 424 | args.max_context_length, |
426 | | - (args.max_context_length - 100) // args.prefill_interpolation_granularity, |
427 | | - ): |
428 | | - # run genai-perf |
429 | | - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" |
430 | | - gap_result = benchmark_prefill( |
431 | | - isl, genai_perf_artifact_dir, model_name, base_url=base_url |
432 | | - ) |
433 | | - if gap_result is not None: |
434 | | - ttft = gap_result["time_to_first_token"]["avg"] |
435 | | - prefill_isl.append(isl) |
436 | | - prefill_ttft.append(ttft) |
437 | | - prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) |
| 425 | + args.prefill_interpolation_granularity, |
| 426 | + ) |
438 | 427 |
|
439 | 428 | print("Cleaning up deployment...") |
440 | 429 | await client.delete_deployment() |
441 | 430 | deployment_clients.remove(client) |
442 | 431 | print("Deployment deleted") |
443 | 432 |
|
444 | | - # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c) |
445 | | - if len(prefill_isl) > 2: |
446 | | - logger.info("Interpolating prefill TTFT and throughput vs ISL...") |
447 | | - |
448 | | - # Convert to numpy arrays for easier manipulation |
449 | | - prefill_isl_np = np.array(prefill_isl) |
450 | | - prefill_ttft_np = np.array(prefill_ttft) |
451 | | - prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu) |
452 | | - |
453 | | - save_path = f"{work_dir}/raw_data.npz" |
454 | | - np.savez( |
455 | | - save_path, |
456 | | - prefill_isl=prefill_isl_np, |
457 | | - prefill_ttft=prefill_ttft_np, |
458 | | - prefill_thpt_per_gpu=prefill_thpt_per_gpu_np, |
459 | | - ) |
460 | | - |
461 | | - # Call the plotting function |
462 | | - plot_prefill_interpolation( |
463 | | - prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir |
464 | | - ) |
465 | | - else: |
466 | | - logger.warning( |
467 | | - "Not enough data points to perform interpolation (need at least 3 points)" |
468 | | - ) |
469 | | - |
470 | 433 | # interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode TP |
471 | | - x_kv_usage = [] |
472 | | - y_context_length = [] |
473 | | - z_itl = [] |
474 | | - z_thpt_per_gpu = [] |
475 | 434 | best_decode_tp = decode_tp_size[selected_decode_idx] |
476 | 435 | logger.info(f"Profiling decode with TP size {best_decode_tp}...") |
477 | 436 | decode_config = config_modifier.set_config_tp_size( |
@@ -508,64 +467,23 @@ async def run_profile(args): |
508 | 467 | f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" |
509 | 468 | ) |
510 | 469 |
|
511 | | - osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement |
512 | 470 | base_url = client.get_service_url() |
513 | | - for isl in range( |
514 | | - 100, |
515 | | - args.max_context_length - osl, |
516 | | - (args.max_context_length - osl) // args.decode_interpolation_granularity, |
517 | | - ): |
518 | | - max_concurrency = max_kv_tokens // (isl + osl) |
519 | | - sweep_num_request = list( |
520 | | - range( |
521 | | - 1, |
522 | | - max_concurrency, |
523 | | - max_concurrency // args.decode_interpolation_granularity, |
524 | | - ) |
525 | | - ) |
526 | | - for num_request in sweep_num_request: |
527 | | - genai_perf_artifact_dir = ( |
528 | | - f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" |
529 | | - ) |
530 | | - gap_result = benchmark_decode( |
531 | | - isl, |
532 | | - osl, |
533 | | - num_request, |
534 | | - genai_perf_artifact_dir, |
535 | | - model_name, |
536 | | - base_url=base_url, |
537 | | - ) |
538 | | - if gap_result is not None: |
539 | | - itl = gap_result["inter_token_latency"]["avg"] |
540 | | - x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) |
541 | | - y_context_length.append(isl + osl / 2) |
542 | | - z_itl.append(itl) |
543 | | - z_thpt_per_gpu.append( |
544 | | - gap_result["output_token_throughput"]["avg"] / best_decode_tp |
545 | | - ) |
| 471 | + |
| 472 | + profile_decode( |
| 473 | + work_dir, |
| 474 | + model_name, |
| 475 | + base_url, |
| 476 | + best_decode_tp, |
| 477 | + max_kv_tokens, |
| 478 | + args.max_context_length, |
| 479 | + args.decode_interpolation_granularity, |
| 480 | + ) |
546 | 481 |
|
547 | 482 | print("Cleaning up deployment...") |
548 | 483 | await client.delete_deployment() |
549 | 484 | deployment_clients.remove(client) |
550 | 485 | print("Deployment deleted") |
551 | 486 |
|
552 | | - # Save the data points to a .npz file |
553 | | - save_path = f"{work_dir}/raw_data.npz" |
554 | | - np.savez( |
555 | | - save_path, |
556 | | - x_kv_usage=np.array(x_kv_usage), |
557 | | - y_context_length=np.array(y_context_length), |
558 | | - z_itl=np.array(z_itl), |
559 | | - z_thpt_per_gpu=np.array(z_thpt_per_gpu), |
560 | | - max_kv_tokens=np.array([max_kv_tokens]), |
561 | | - ) |
562 | | - logger.info(f"Saved data points to {save_path}") |
563 | | - |
564 | | - # Plot 3D surface |
565 | | - plot_decode_3d_surface( |
566 | | - x_kv_usage, y_context_length, z_itl, best_decode_tp, work_dir |
567 | | - ) |
568 | | - |
569 | 487 | except Exception as e: |
570 | 488 | logger.error(f"Profile job failed with error: {e}") |
571 | 489 | raise |
|
0 commit comments