Memory profiler for cuda #1993

Closed · wants to merge 1 commit
156 changes: 89 additions & 67 deletions benchmarks/microbenchmarks/benchmark_inference.py
@@ -12,89 +12,111 @@

from copy import deepcopy
from pathlib import Path
import os
import subprocess
import uuid

import torch
from torch.profiler import profile, record_function, ProfilerActivity

from benchmarks.microbenchmarks.utils import (
    BenchmarkConfig,
    BenchmarkResult,
    clean_caches,
    create_model_and_input,
    generate_memory_profile,
    generate_model_profile,
    model_inference_time_in_ms,
    string_to_config,
)
from torchao.quantization import quantize_
from torchao.sparsity.sparse_api import sparsify_


def run(config: BenchmarkConfig) -> BenchmarkResult:
    """Run inference benchmarks"""
    try:
        clean_caches()  # Clean caches

        # Create output directory if it doesn't exist
        Path(config.output_dir).mkdir(parents=True, exist_ok=True)

        base_model, input_data = create_model_and_input(
            config.model_type,
            config.m,
            config.k,
            config.n,
            high_precision_dtype=config.high_precision_dtype,
            device=config.device,
        )

        # Use quantize_ to apply each quantization function to the model
        m_copy = deepcopy(base_model).eval().to(config.device)
        ao_base_config = string_to_config(
            config.quantization,
            config.sparsity,
            high_precision_dtype=config.high_precision_dtype,
        )

        # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA)
        is_cuda = config.device == "cuda" and torch.cuda.is_available()

        if config.sparsity is not None and (
            config.quantization is None or "baseline" in config.quantization
        ):
            if is_cuda:
                print(f"Applying {config.sparsity} sparsity to model")
                sparsify_(m_copy, ao_base_config)
            else:
                print(
                    f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}"
                )
        elif config.sparsity is None and (
            config.quantization is None or "baseline" in config.quantization
        ):
            pass  # No quantization or sparsity specified, do nothing
        else:
            print("Quantizing model....")
            quantize_(m_copy, ao_base_config)

        if config.use_torch_compile:
            print("Compiling model....")
            m_copy = torch.compile(
                m_copy, mode=config.torch_compile_mode, fullgraph=True
            )

        # Run benchmarks
        result = BenchmarkResult(config=config)
        # Store result in model for memory profiling
        m_copy._benchmark_result = result

        # Benchmark time to run an inference call for quantized model
        result.model_inference_time_in_ms = model_inference_time_in_ms(
            model=m_copy, input_data=input_data
        )

        # Run profiler if enabled
        if config.enable_profiler:
            print("Running profiler...")
            try:
                result.profiler_json_path, result.perfetto_url = generate_model_profile(
                    m_copy, input_data, config.profiler_file_name
                )
            except Exception as e:
                print(f"Error running profiler: {e}")

        # Run memory profiler if enabled
        if config.enable_memory_profile:
            print("Running memory profiler...")
            try:
                result.memory_profile_path, result.memory_stats = (
                    generate_memory_profile(
                        m_copy, input_data, config.memory_profile_file_name
                    )
                )
            except Exception as e:
                print(f"Error running memory profiler: {e}")

        return result
    except Exception as e:
        print(f"Error in benchmark run: {e}")
        import traceback

        print(traceback.format_exc())
        return None
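
Note: `generate_memory_profile` is imported from `benchmarks/microbenchmarks/utils.py`, which is not included in this diff. As a rough illustration of a CUDA memory helper that matches the call site above, here is a minimal sketch assuming a snapshot-based implementation; the function body and the returned statistics are illustrative assumptions, not the PR's actual code.

from typing import Any, Dict, Tuple

import torch


def generate_memory_profile(
    model: torch.nn.Module,
    input_data: torch.Tensor,
    file_name: str,
) -> Tuple[str, Dict[str, Any]]:
    """Sketch: capture a CUDA allocator snapshot for one inference call."""
    torch.cuda.reset_peak_memory_stats()
    # Record allocator history so the snapshot contains per-allocation traces
    torch.cuda.memory._record_memory_history(max_entries=100_000)
    try:
        with torch.no_grad():
            model(input_data)
        torch.cuda.synchronize()
        # Dump the snapshot to disk (a pickle file)
        torch.cuda.memory._dump_snapshot(file_name)
    finally:
        # Stop recording so later benchmark runs are not slowed down
        torch.cuda.memory._record_memory_history(enabled=None)

    memory_stats = {
        "peak_memory_allocated_mb": torch.cuda.max_memory_allocated() / 1024**2,
        "peak_memory_reserved_mb": torch.cuda.max_memory_reserved() / 1024**2,
    }
    return file_name, memory_stats

The dumped snapshot can be opened at https://pytorch.org/memory_viz to inspect allocation timelines and peak-memory breakdowns.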
4 changes: 3 additions & 1 deletion benchmarks/microbenchmarks/benchmark_runner.py
@@ -164,9 +164,11 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None
f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}"
)
result = run_inference(config) # Pass the config object directly
if result is not None: # Only add successful results
results.append(result)
except Exception as e:
import traceback

print(f"Error running benchmark {config.name} with error: {e}")
print(traceback.format_exc())
continue
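
For completeness, the companion `generate_model_profile` helper called in benchmark_inference.py above is also defined in `utils.py`, outside this diff, and returns a trace path plus an optional Perfetto link. A minimal sketch built on the `torch.profiler` imports added in this PR follows; the body and the None placeholder for the Perfetto URL are assumptions.

from typing import Optional, Tuple

import torch
from torch.profiler import ProfilerActivity, profile


def generate_model_profile(
    model: torch.nn.Module,
    input_data: torch.Tensor,
    file_name: str,
) -> Tuple[str, Optional[str]]:
    """Sketch: trace one inference call with torch.profiler."""
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    with torch.no_grad(), profile(
        activities=activities, record_shapes=True, profile_memory=True
    ) as prof:
        model(input_data)

    # Chrome-trace JSON; viewable in chrome://tracing or https://ui.perfetto.dev
    prof.export_chrome_trace(file_name)
    perfetto_url = None  # uploading or sharing the trace is out of scope for this sketch
    return file_name, perfetto_url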
3 changes: 2 additions & 1 deletion benchmarks/microbenchmarks/test/benchmark_config.yml
@@ -36,7 +36,8 @@ model_params:
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "linear"
enable_profiler: true # Enable profiling for this model
enable_memory_profile: true # Enable memory profiling for this model

# - name: "cpu_fp32_linear"
# matrix_shapes: