Memory profiler for cuda #1993

Closed · wants to merge 1 commit
156 changes: 89 additions & 67 deletions benchmarks/microbenchmarks/benchmark_inference.py
@@ -12,89 +12,111 @@

from copy import deepcopy
from pathlib import Path
import os
import subprocess
import uuid

import torch
from torch.profiler import profile, record_function, ProfilerActivity

from benchmarks.microbenchmarks.utils import (
    BenchmarkConfig,
    BenchmarkResult,
    clean_caches,
    create_model_and_input,
    generate_memory_profile,
    generate_model_profile,
    model_inference_time_in_ms,
    string_to_config,
)
from torchao.quantization import quantize_
from torchao.sparsity.sparse_api import sparsify_


def run(config: BenchmarkConfig) -> BenchmarkResult:
    """Run inference benchmarks"""
    try:
        clean_caches()  # Clean caches

        # Create output directory if it doesn't exist
        Path(config.output_dir).mkdir(parents=True, exist_ok=True)

        base_model, input_data = create_model_and_input(
            config.model_type,
            config.m,
            config.k,
            config.n,
            high_precision_dtype=config.high_precision_dtype,
            device=config.device,
        )

        # Use quantize_ to apply each quantization function to the model
        m_copy = deepcopy(base_model).eval().to(config.device)
        ao_base_config = string_to_config(
            config.quantization,
            config.sparsity,
            high_precision_dtype=config.high_precision_dtype,
        )

        # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA)
        is_cuda = config.device == "cuda" and torch.cuda.is_available()

        if config.sparsity is not None and (
            config.quantization is None or "baseline" in config.quantization
        ):
            if is_cuda:
                print(f"Applying {config.sparsity} sparsity to model")
                sparsify_(m_copy, ao_base_config)
            else:
                print(
                    f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}"
                )
        elif config.sparsity is None and (
            config.quantization is None or "baseline" in config.quantization
        ):
            pass  # No quantization or sparsity specified, do nothing
        else:
            print("Quantizing model....")
            quantize_(m_copy, ao_base_config)

        if config.use_torch_compile:
            print("Compiling model....")
            m_copy = torch.compile(
                m_copy, mode=config.torch_compile_mode, fullgraph=True
            )

        # Run benchmarks
        result = BenchmarkResult(config=config)
        # Store result in model for memory profiling
        m_copy._benchmark_result = result

        # Benchmark time to run an inference call for quantized model
        result.model_inference_time_in_ms = model_inference_time_in_ms(
            model=m_copy, input_data=input_data
        )

        # Run profiler if enabled
        if config.enable_profiler:
            print("Running profiler...")
            try:
                result.profiler_json_path, result.perfetto_url = generate_model_profile(
                    m_copy, input_data, config.profiler_file_name
                )
            except Exception as e:
                print(f"Error running profiler: {e}")

        # Run memory profiler if enabled
        if config.enable_memory_profile:
            print("Running memory profiler...")
            try:
                result.memory_profile_path, result.memory_stats = (
                    generate_memory_profile(
                        m_copy, input_data, config.memory_profile_file_name
                    )
                )
            except Exception as e:
                print(f"Error running memory profiler: {e}")

        return result
    except Exception as e:
        print(f"Error in benchmark run: {e}")
        import traceback

        print(traceback.format_exc())
        return None
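
Note: `generate_memory_profile` is imported from `benchmarks/microbenchmarks/utils.py`, which is not included in this diff. As a rough illustration of a CUDA memory helper that matches the call site above, here is a minimal sketch assuming a snapshot-based implementation; the function body and the returned statistics are illustrative assumptions, not the PR's actual code.

from typing import Any, Dict, Tuple

import torch


def generate_memory_profile(
    model: torch.nn.Module,
    input_data: torch.Tensor,
    file_name: str,
) -> Tuple[str, Dict[str, Any]]:
    """Sketch: capture a CUDA allocator snapshot for one inference call."""
    torch.cuda.reset_peak_memory_stats()
    # Record allocator history so the snapshot contains per-allocation traces
    torch.cuda.memory._record_memory_history(max_entries=100_000)
    try:
        with torch.no_grad():
            model(input_data)
        torch.cuda.synchronize()
        # Dump the snapshot to disk (a pickle file)
        torch.cuda.memory._dump_snapshot(file_name)
    finally:
        # Stop recording so later benchmark runs are not slowed down
        torch.cuda.memory._record_memory_history(enabled=None)

    memory_stats = {
        "peak_memory_allocated_mb": torch.cuda.max_memory_allocated() / 1024**2,
        "peak_memory_reserved_mb": torch.cuda.max_memory_reserved() / 1024**2,
    }
    return file_name, memory_stats

The dumped snapshot can be opened at https://pytorch.org/memory_viz to inspect allocation timelines and peak-memory breakdowns.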
4 changes: 3 additions & 1 deletion benchmarks/microbenchmarks/benchmark_runner.py
@@ -164,9 +164,11 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None
f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}"
)
result = run_inference(config) # Pass the config object directly
if result is not None: # Only add successful results
results.append(result)
except Exception as e:
import traceback

print(f"Error running benchmark {config.name} with error: {e}")
print(traceback.format_exc())
continue
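
For completeness, the companion `generate_model_profile` helper called in benchmark_inference.py above is also defined in `utils.py`, outside this diff, and returns a trace path plus an optional Perfetto link. A minimal sketch built on the `torch.profiler` imports added in this PR follows; the body and the None placeholder for the Perfetto URL are assumptions.

from typing import Optional, Tuple

import torch
from torch.profiler import ProfilerActivity, profile


def generate_model_profile(
    model: torch.nn.Module,
    input_data: torch.Tensor,
    file_name: str,
) -> Tuple[str, Optional[str]]:
    """Sketch: trace one inference call with torch.profiler."""
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    with torch.no_grad(), profile(
        activities=activities, record_shapes=True, profile_memory=True
    ) as prof:
        model(input_data)

    # Chrome-trace JSON; viewable in chrome://tracing or https://ui.perfetto.dev
    prof.export_chrome_trace(file_name)
    perfetto_url = None  # uploading or sharing the trace is out of scope for this sketch
    return file_name, perfetto_url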
3 changes: 2 additions & 1 deletion benchmarks/microbenchmarks/test/benchmark_config.yml
@@ -36,7 +36,8 @@ model_params:
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "linear"
enable_profiler: true # Enable profiling for this model
enable_memory_profile: true # Enable memory profiling for this model

# - name: "cpu_fp32_linear"
# matrix_shapes: