Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEAT] Perf Profiler Update #690

Merged
merged 33 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
67ff543
add device_spec
jeromeku Jul 7, 2024
dfc7f8c
add performance counter
jeromeku Jul 7, 2024
0ac59f7
add more perf counter tools
jeromeku Jul 7, 2024
bf77aa8
add performance counter manager test
jeromeku Jul 7, 2024
e570b00
add mbu and mfu test
jeromeku Jul 7, 2024
8e766b6
refactor performance manager device spec
jeromeku Jul 7, 2024
def8685
add perf stats
jeromeku Jul 8, 2024
cd02f4f
start perf counter manager test refactor
jeromeku Jul 8, 2024
ce21131
add stat print str
jeromeku Jul 8, 2024
92c4f58
refactor performance counter with perf stats
jeromeku Jul 8, 2024
016404b
more perf stats tests
jeromeku Jul 8, 2024
deae11f
add perf stat print formatting tests
jeromeku Jul 8, 2024
9cf1200
fix device spec formatting
jeromeku Jul 8, 2024
19a6a70
finish perf counter manager refactor
jeromeku Jul 8, 2024
22b1cab
add serialization test
jeromeku Jul 9, 2024
cc3c73c
refactor stats tests
jeromeku Jul 9, 2024
0d2885a
refactor remaining tests
jeromeku Jul 9, 2024
7fc4c1e
clean up tests
jeromeku Jul 9, 2024
0363e1b
clean up device_spec tests
jeromeku Jul 9, 2024
06f0b08
add latency
jeromeku Jul 9, 2024
95c1c28
add latency tests
jeromeku Jul 9, 2024
09208c1
fix formatting
jeromeku Jul 10, 2024
4de130a
remove unused methods
jeromeku Jul 10, 2024
2afa14f
add documentation
jeromeku Jul 10, 2024
ff2d193
more docs
jeromeku Jul 10, 2024
ea0f2b6
formatting
jeromeku Jul 10, 2024
996538f
clean up warnings
jeromeku Jul 10, 2024
eb797c0
rename duration -> latency
jeromeku Jul 10, 2024
7b578ac
add gpt-fast example
jeromeku Jul 10, 2024
842215d
linting and formatting
jeromeku Jul 10, 2024
4968ddf
update profiler tutorial readme
jeromeku Jul 10, 2024
2b0b86f
move total_model_params to utils
jeromeku Jul 27, 2024
b39443a
remove tutorials/profiler
jeromeku Jul 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions test/profiler/test_device_spec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pytest

cuda_driver = pytest.importorskip(
"triton.runtime.driver", reason="requires triton cuda driver module"
)
import itertools

import torch
from utils import patch_device

from torchao.profiler.device_spec import (
_AVAILABLE_GPU_SPECS,
CUDADeviceSpec,
get_chip_name,
)

# -------------------- Device Spec Tests ------------------- #
# Lower-cased marketing names as reported by torch.cuda.get_device_name;
# get_chip_name is expected to map these to spec-table keys.
DEVICE_NAMES = ["h100 sxm", "a100", "nvidia geforce rtx 4090"]
DTYPES = [torch.float32, torch.bfloat16, torch.float16]
USE_TENSORCORES = [True, False]
# Cartesian product: every (device, dtype, tensorcore) combination is a test case.
DEVICE_CONFIGS = itertools.product(DEVICE_NAMES, DTYPES, USE_TENSORCORES)


@pytest.mark.parametrize(
    "device_name, dtype, use_tensorcores", DEVICE_CONFIGS, ids=lambda x: str(x)
)
def test_device_spec(device_name, dtype, use_tensorcores):
    """Check CUDADeviceSpec auto-fills flops/bandwidth from the spec table.

    Verifies that flops_per_s, flops_by_dtype, and roofline_balancepoint agree
    with the _AVAILABLE_GPU_SPECS entry for the patched device, and that the
    spec object rejects invalid mutation.
    """
    with patch_device(device_name):
        spec = CUDADeviceSpec(dtype=dtype, use_tensorcores=use_tensorcores)
        # fp32 with tensorcores enabled is looked up under the "tfloat32" key
        key = "tfloat32" if (dtype == torch.float32 and use_tensorcores) else dtype
        expected_flops = _AVAILABLE_GPU_SPECS[get_chip_name(device_name)][key]
        assert spec.flops_per_s == expected_flops
        assert spec.flops_by_dtype[key] == expected_flops
        assert spec.roofline_balancepoint == expected_flops / spec.bandwidth

        # flops_per_s may not be unset once populated
        with pytest.raises(AssertionError):
            spec.flops_per_s = None
            print(spec.roofline_balancepoint)
        # Prevent setting attributes not in named fields to guard against user error
        with pytest.raises(AttributeError):
            spec.FLOPs = None


def test_empty_device_spec():
    """Check CUDADeviceSpec construction on unknown vs. known devices.

    An unrecognized device must fail unless all critical fields are supplied
    explicitly; a recognized device auto-fills everything except dtype.
    """
    unknown_name = "fake device"
    with patch_device(unknown_name):
        # No spec-table entry -> auto-fill impossible -> construction fails
        with pytest.raises(AssertionError):
            _ = CUDADeviceSpec()

        # Ok to instantiate as long as fields are filled
        _ = CUDADeviceSpec(
            name=unknown_name,
            flops_per_s=1.0,
            bandwidth=1.0,
            dtype=torch.float32,
            use_tensorcores=True,
        )

    known_name = DEVICE_NAMES[0]
    with patch_device(known_name):
        # All critical fields will be auto-filled except for dtype (and vram, but vram is not used for downstream calcs atm)
        _ = CUDADeviceSpec(dtype=torch.float32)

        # No dtype specified
        with pytest.raises(AssertionError):
            _ = CUDADeviceSpec()
Loading
Loading