@@ -20,6 +20,8 @@
     BenchmarkResult,
     clean_caches,
     create_model_and_input,
+    generate_memory_profile,
+    generate_model_profile,
     model_inference_time_in_ms,
     string_to_config,
 )
@@ -29,70 +31,92 @@

 def run(config: BenchmarkConfig) -> BenchmarkResult:
     """Run inference benchmarks"""
-    clean_caches()  # Clean caches
-
-    # Create output directory if it doesn't exist
-    Path(config.output_dir).mkdir(parents=True, exist_ok=True)
-
-    base_model, input_data = create_model_and_input(
-        config.model_type,
-        config.m,
-        config.k,
-        config.n,
-        high_precision_dtype=config.high_precision_dtype,
-        device=config.device,
-    )
-
-    # Use quantize_ to apply each quantization function to the model
-    m_copy = deepcopy(base_model).eval().to(config.device)
-    ao_base_config = string_to_config(
-        config.quantization,
-        config.sparsity,
-        high_precision_dtype=config.high_precision_dtype,
-    )
-
-    # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA)
-    is_cuda = config.device == "cuda" and torch.cuda.is_available()
-
-    if config.sparsity is not None and (
-        config.quantization is None or "baseline" in config.quantization
-    ):
-        if is_cuda:
-            print(f"Applying {config.sparsity} sparsity to model")
-            sparsify_(m_copy, ao_base_config)
+    try:
+        clean_caches()  # Clean caches
+
+        # Create output directory if it doesn't exist
+        Path(config.output_dir).mkdir(parents=True, exist_ok=True)
+
+        base_model, input_data = create_model_and_input(
+            config.model_type,
+            config.m,
+            config.k,
+            config.n,
+            high_precision_dtype=config.high_precision_dtype,
+            device=config.device,
+        )
+
+        # Use quantize_ to apply each quantization function to the model
+        m_copy = deepcopy(base_model).eval().to(config.device)
+        ao_base_config = string_to_config(
+            config.quantization,
+            config.sparsity,
+            high_precision_dtype=config.high_precision_dtype,
+        )
+
+        # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA)
+        is_cuda = config.device == "cuda" and torch.cuda.is_available()
+
+        if config.sparsity is not None and (
+            config.quantization is None or "baseline" in config.quantization
+        ):
+            if is_cuda:
+                print(f"Applying {config.sparsity} sparsity to model")
+                sparsify_(m_copy, ao_base_config)
+            else:
+                print(
+                    f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}"
+                )
+        elif config.sparsity is None and (
+            config.quantization is None or "baseline" in config.quantization
+        ):
+            pass  # No quantization or sparsity specified, do nothing
         else:
-            print(
-                f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}"
+            print("Quantizing model....")
+            quantize_(m_copy, ao_base_config)
+
+        if config.use_torch_compile:
+            print("Compiling model....")
+            m_copy = torch.compile(
+                m_copy, mode=config.torch_compile_mode, fullgraph=True
             )
-    elif config.sparsity is None and (
-        config.quantization is None or "baseline" in config.quantization
-    ):
-        pass  # No quantization or sparsity specified, do nothing
-    else:
-        print("Quantizing model....")
-        quantize_(m_copy, ao_base_config)
-
-    if config.use_torch_compile:
-        print("Compiling model....")
-        m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True)
-
-    # Run benchmarks
-    result = BenchmarkResult(config=config)
-
-    # Benchmark time to run an inference call for quantized model
-    result.model_inference_time_in_ms = model_inference_time_in_ms(
-        model=m_copy, input_data=input_data
-    )
-
-    # TODO: Benchmark time using profiler
-    # Profile dtype model evaluation
-    # prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype)
-    # prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json")  # Save profiling details
-
-    # TODO: Benchmark gemm time using cuda graph
-    # gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs)
-
-    # TODO: Benchmark op with cuda graph
-    # time = benchmark_op_with_cuda_graph(op, args)
-
-    return result
+
+        # Run benchmarks
+        result = BenchmarkResult(config=config)
+        # Store result in model for memory profiling
+        m_copy._benchmark_result = result
+
+        # Benchmark time to run an inference call for quantized model
+        result.model_inference_time_in_ms = model_inference_time_in_ms(
+            model=m_copy, input_data=input_data
+        )
+
+        # Run profiler if enabled
+        if config.enable_profiler:
+            print("Running profiler...")
+            try:
+                result.profiler_json_path, result.perfetto_url = generate_model_profile(
+                    m_copy, input_data, config.profiler_file_name
+                )
+            except Exception as e:
+                print(f"Error running profiler: {e}")
+
+        # Run memory profiler if enabled
+        if config.enable_memory_profile:
+            print("Running memory profiler...")
+            try:
+                result.memory_profile_path, result.memory_stats = (
+                    generate_memory_profile(
+                        m_copy, input_data, config.memory_profile_file_name
+                    )
+                )
+            except Exception as e:
+                print(f"Error running memory profiler: {e}")
+
+        return result
+    except Exception as e:
+        print(f"Error in benchmark run: {e}")
+        import traceback
+
+        print(traceback.format_exc())
+        return None
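
Note: the bodies of generate_model_profile and generate_memory_profile live in the benchmarks utils module and are not part of this hunk. As a rough sketch only, and not the torchao implementation, helpers with the return shapes this diff unpacks (a trace path plus an optional Perfetto URL, and a stats-file path plus a stats dict) could be built on torch.profiler and the torch.cuda memory counters along these lines; the function names and return conventions below are assumptions taken from how the diff uses them.

# Hypothetical sketch, not the torchao helpers: one way to implement profiling
# utilities that return the shapes unpacked in the diff above.
import json
import os

import torch
from torch.profiler import ProfilerActivity, profile


def sketch_model_profile(model, input_data, output_path):
    """Trace one inference call and export a Chrome trace; returns (path, perfetto_url)."""
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)
    with torch.no_grad(), profile(activities=activities) as prof:
        model(input_data)
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    prof.export_chrome_trace(output_path)
    # Uploading the trace to Perfetto and returning a URL would happen elsewhere.
    return output_path, None


def sketch_memory_profile(model, input_data, output_path):
    """Record peak CUDA memory for one inference call; returns (path, stats_dict)."""
    stats = {}
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    with torch.no_grad():
        model(input_data)
    if torch.cuda.is_available():
        stats["max_memory_allocated_bytes"] = torch.cuda.max_memory_allocated()
        stats["max_memory_reserved_bytes"] = torch.cuda.max_memory_reserved()
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(stats, f, indent=2)
    return output_path, stats

Either sketch could stand in for the real helpers inside the try/except blocks above when experimenting locally; the actual utilities in the repository may collect richer data.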