import os
from copy import deepcopy
from pathlib import Path
+ from typing import Dict, Tuple

import torch

    create_model_and_input_data,
)

+ # -----------------------------------------------------------------------------
+ # Baseline caching
+ #
+ # ``_BASELINE_CACHE`` maps a key built by ``_make_cache_key(config)``, i.e.
+ # (model_type, m, k, n, high_precision_dtype, device, torch_compile_mode),
+ # to a tuple ``(eager_baseline_time, compile_baseline_time)``.
+ # E.g.: (linear, 1024, 1024, 1024, torch.bfloat16, cuda, default) -> (95.00, 56.00)
+ # The cache is internal to this module and should not be accessed directly.
+ # Caching the baseline inference times avoids re-measuring the baseline for
+ # every quantization/sparsity variant of the same configuration, which keeps
+ # overall benchmarking time down when computing speedup metrics.
+ # -----------------------------------------------------------------------------
+
+ _BASELINE_CACHE: Dict[Tuple, Tuple[float, float]] = {}
+
+
+ def _make_cache_key(config: BenchmarkConfig) -> Tuple:
54+ """Create a key for caching based on benchmark configuration.
55+
56+ Parameters that affect baseline performance are included:
57+
58+ * model type (e.g. ``linear`` or ``transformer_block``)
59+ * shape dimensions (m, k, n)
60+ * high precision dtype (bf16, fp16, etc.)
61+ * device (cuda, cpu, mps)
62+ * compile settings (whether compile is enabled and compile mode)
+
+     Sparsity and quantization settings are deliberately excluded
+     because the baseline (non-quantized, non-sparse) performance is
+     independent of those attributes.
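+
+     For illustration, a key has the form (values here are just an example)::
+
+         ("linear", 1024, 1024, 1024, torch.bfloat16, "cuda", "default")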
67+ """
68+ return (
69+ config .model_type ,
70+ config .m ,
71+ config .k ,
72+ config .n ,
73+ config .high_precision_dtype ,
74+ config .device ,
75+ config .torch_compile_mode ,
76+ )
77+
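+ # A minimal sketch of the intended lookup pattern (``eager_time_ms`` and
+ # ``compile_time_ms`` stand in for measured timings; ``run`` below does the
+ # real measurement and caching):
+ #
+ #     key = _make_cache_key(config)
+ #     if key not in _BASELINE_CACHE:
+ #         _BASELINE_CACHE[key] = (eager_time_ms, compile_time_ms)
+ #     eager_baseline, compile_baseline = _BASELINE_CACHE[key]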

def run(config: BenchmarkConfig) -> BenchmarkResult:
-     """Run inference benchmarks"""
80+ """
81+ Run inference benchmarks.
82+
83+ The function first checks if a baseline for the given configuration
84+ already exists in the internal cache. If not, it measures the baseline
85+ inference time and stores the result. When the baseline is cached,
86+ the function reuses the cached baselines to calculate speedup metrics.
87+
88+ Args:
89+ config (BenchmarkConfig): Benchmark configuration.
90+
91+ Returns:
92+ BenchmarkResult: Result of the benchmark.
93+ """
    try:
        clean_caches()  # Clean caches

        # Create output directory if it doesn't exist
        Path(config.output_dir).mkdir(parents=True, exist_ok=True)

+         # Prepare result container
+         result = BenchmarkResult(config=config)
+
+         # Create model and input data
        base_model, input_data = create_model_and_input_data(
            config.model_type,
            config.m,
@@ -51,28 +109,47 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
            high_precision_dtype=config.high_precision_dtype,
            device=config.device,
        )
-         # Copy base model for quantizing
-         m_copy = deepcopy(base_model)

-         # Run benchmarks
-         result = BenchmarkResult(config=config)
+         # Generate a cache key for the current configuration
+         cache_key = _make_cache_key(config)

-         # Store result in model for memory profiling
-         base_model._benchmark_result = result
-
-         # Run baseline benchmarking
-         base_model = base_model.eval().to(config.device)
-         if config.use_torch_compile:
-             print("Compiling baseline model....")
-             base_model = torch.compile(
-                 base_model, mode=config.torch_compile_mode, fullgraph=True
+         # Check if the baseline for this configuration has been computed
+         if cache_key not in _BASELINE_CACHE:
+             # Copy the model, switch it to eval mode and move it to the device
+             m_copy = deepcopy(base_model)
+             m_copy = m_copy.eval().to(config.device)
+             print("Benchmarking eager baseline inference.....")
+             eager_baseline_time = model_inference_time_in_ms(
+                 model=m_copy, input_data=input_data
            )
-         # Benchmark time to run an inference call for baseline model
-         print("Benchmarking baseline inference.....")
-         result.baseline_inference_time_in_ms = model_inference_time_in_ms(
-             model=base_model, input_data=input_data
-         )

+             print("Benchmarking compiled baseline inference.....")
+             m_copy = torch.compile(
+                 m_copy, mode=config.torch_compile_mode, fullgraph=True
+             )
+             compile_baseline_time = model_inference_time_in_ms(
+                 model=m_copy, input_data=input_data
+             )
+
+             # Cache the eager and compiled baseline times for this configuration
+             _BASELINE_CACHE[cache_key] = (eager_baseline_time, compile_baseline_time)
+
+             result.baseline_model_eager_inference_time_in_ms = eager_baseline_time
+             result.baseline_model_compiled_inference_time_in_ms = compile_baseline_time
+         else:
+             # Retrieve cached values
+             cached_eager_time, cached_compile_time = _BASELINE_CACHE[cache_key]
+             result.baseline_model_eager_inference_time_in_ms = cached_eager_time
+             result.baseline_model_compiled_inference_time_in_ms = cached_compile_time
+
+         # At this point, ``base_model`` is an uncompiled model ready for quantization,
+         # and ``input_data`` is the corresponding input tensor. The eager and compiled
+         # baseline times have been stored on ``result``.
+
+         # Copy base model for quantizing/sparsifying
+         m_copy = deepcopy(base_model)
+
+         # Determine quantization/sparsity configuration
        ao_base_config = string_to_config(
            config.quantization,
            config.sparsity,
@@ -101,24 +178,39 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
        m_copy = m_copy.eval().to(config.device)
        quantize_(m_copy, ao_base_config)

-         if config.use_torch_compile:
-             print("Compiling quantized model....")
-             m_copy = torch.compile(
-                 m_copy, mode=config.torch_compile_mode, fullgraph=True
-             )
-
        # Store result in model for memory profiling
        m_copy._benchmark_result = result

-         # Benchmark time to run an inference call for quantized model
-         print("Benchmarking quantized model.....")
-         result.model_inference_time_in_ms = model_inference_time_in_ms(
+         # Measure inference time for the eager quantized model
+         print("Benchmarking eager quantized model.....")
+         result.quantized_model_eager_inference_time_in_ms = model_inference_time_in_ms(
            model=m_copy, input_data=input_data
        )

-         # Calculate speedup w.r.t. baseline
-         result.speedup = round(
-             result.baseline_inference_time_in_ms / result.model_inference_time_in_ms, 2
+         # Measure inference time for compiled quantized model
+         print("Benchmarking compiled quantized model.....")
+         m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True)
+         result.quantized_model_compiled_inference_time_in_ms = (
+             model_inference_time_in_ms(model=m_copy, input_data=input_data)
+         )
+
+         # Compute eager speedup relative to baseline
+         result.eager_speedup_on_baseline = round(
+             result.baseline_model_eager_inference_time_in_ms
+             / result.quantized_model_eager_inference_time_in_ms,
+             ndigits=2,
+         )
+         # Compute compile speedup relative to baseline
+         result.compile_speedup_on_baseline = round(
+             result.baseline_model_compiled_inference_time_in_ms
+             / result.quantized_model_compiled_inference_time_in_ms,
+             ndigits=2,
+         )
+         # Compute compile speedup for quantized model relative to eager quantized model
+         result.compile_speedup_on_eager = round(
+             result.quantized_model_eager_inference_time_in_ms
+             / result.quantized_model_compiled_inference_time_in_ms,
+             ndigits=2,
        )
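+         # Worked example with hypothetical timings: an eager baseline of 95.0 ms,
+         # an eager quantized time of 60.0 ms, a compiled baseline of 56.0 ms and a
+         # compiled quantized time of 40.0 ms give
+         # eager_speedup_on_baseline == round(95.0 / 60.0, 2) == 1.58,
+         # compile_speedup_on_baseline == round(56.0 / 40.0, 2) == 1.4 and
+         # compile_speedup_on_eager == round(60.0 / 40.0, 2) == 1.5.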

        # Run profiler if enabled
@@ -165,9 +257,9 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
                        result.memory_profile_path
                    )
            except ValueError as e:
-                 if "not enough values to unpack" in e:
+                 if "not enough values to unpack" in str(e):
                    print(
170- "Failed due to existing bugs, re- run the code to generate memory profile. Please raise an issue if it persists."
262+ "Failed due to existing bugs, re‑ run the code to generate memory profile. Please raise an issue if it persists."
                    )
            except Exception as e:
                print(f"Error running memory profiler: {e}")