Skip to content

Commit

Permalink
Abstraction layer over NVIDIA and AMD GPUs (#46)
Browse files Browse the repository at this point in the history
Co-authored-by: Jae-Won Chung <jwnchung@umich.edu>
  • Loading branch information
parthraut and jaywonchung authored Apr 1, 2024
1 parent ddb3be7 commit 9c6b3b0
Show file tree
Hide file tree
Showing 17 changed files with 1,066 additions and 224 deletions.
10 changes: 5 additions & 5 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ theme:
plugins:
- search
- autorefs
- social:
cards_dir: assets/img/social
cards_layout_options:
background_color: "#f7e96d"
color: "#231f20"
# - social:
# cards_dir: assets/img/social
# cards_layout_options:
# background_color: "#f7e96d"
# color: "#231f20"
- gen-files:
scripts:
- docs/gen_ref_pages.py
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ pydocstyle.convention = "google"
"zeus/optimizer/perseus/common.py" = ["N805"]
"zeus/optimizer/perseus/server/router.py" = ["B008"]
"zeus/util/pydantic_v1.py" = ["F403"]
"zeus/device/gpu.py" = ["N802", "N803"]

[tool.pytest.ini_options]
addopts = "--numprocesses auto"
2 changes: 1 addition & 1 deletion tests/optimizer/test_power_limit_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def test_power_limit_optimizer(
tmp_path: Path,
):
# Mock PyNVML.
pynvml_mock = mocker.patch("zeus.optimizer.power_limit.pynvml", autospec=True)
pynvml_mock = mocker.patch("zeus.device.gpu.pynvml", autospec=True)
pynvml_mock.nvmlDeviceGetHandleByIndex.side_effect = lambda i: f"handle{i}"
pynvml_mock.nvmlDeviceGetPowerManagementLimitConstraints.side_effect = \
lambda _: (min(replay_log.power_limits) * 1000, max(replay_log.power_limits) * 1000)
Expand Down
19 changes: 12 additions & 7 deletions tests/test_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from zeus.monitor import Measurement, ZeusMonitor
from zeus.util.testing import ReplayZeusMonitor
import zeus.device.gpu

if TYPE_CHECKING:
from pathlib import Path
Expand All @@ -37,19 +38,23 @@
pynvml.NVML_DEVICE_ARCH_AMPERE,
]

@pytest.fixture(scope="function", autouse=True)
def reset_gpus() -> None:
    """Set `zeus.device.gpu._gpus` back to None before each test runs.

    The fixture is autouse and function-scoped, so every test in this
    module gets it without having to request it explicitly.
    """
    zeus.device.gpu._gpus = None


@pytest.fixture
def pynvml_mock(mocker: MockerFixture):
"""Mock the entire pynvml module."""
mock = mocker.patch("zeus.monitor.energy.pynvml", autospec=True)
# Mock the pynvml import in the gpu module.
mock = mocker.patch("zeus.device.gpu.pynvml", autospec=True)

# Except for the arch constants.
mock.NVML_DEVICE_ARCH_PASCAL = pynvml.NVML_DEVICE_ARCH_PASCAL
mock.NVML_DEVICE_ARCH_VOLTA = pynvml.NVML_DEVICE_ARCH_VOLTA
mock.NVML_DEVICE_ARCH_AMPERE = pynvml.NVML_DEVICE_ARCH_AMPERE

mocker.patch("zeus.util.env.pynvml", mock)

return mock


Expand Down Expand Up @@ -108,10 +113,9 @@ def mock_gpus(request, mocker: MockerFixture, pynvml_mock: MagicMock) -> tuple[t

def mock_pynvml(nvml_indices: list[int], archs: list[int]) -> None:
assert len(nvml_indices) == len(archs)
index_to_handle = {i: f"handle{i}" for i in nvml_indices}
handle_to_arch = {f"handle{i}": arch for i, arch in zip(nvml_indices, archs)}
pynvml_mock.nvmlDeviceGetCount.return_value = NUM_GPUS
pynvml_mock.nvmlDeviceGetHandleByIndex.side_effect = lambda index: index_to_handle[index]
pynvml_mock.nvmlDeviceGetHandleByIndex.side_effect = lambda index: f"handle{index}" # GPU Monitoring object grabs all handles visible to system on initialization.
pynvml_mock.nvmlDeviceGetArchitecture.side_effect = lambda handle: handle_to_arch[handle]

if cuda_visible_devices is None: # All GPUs are visible to PyTorch.
Expand Down Expand Up @@ -155,8 +159,6 @@ def test_monitor(pynvml_mock, mock_gpus, mocker: MockerFixture, tmp_path: Path):
is_old_torch = {index: arch < pynvml.NVML_DEVICE_ARCH_VOLTA for index, arch in zip(torch_gpu_indices, gpu_archs)}
old_gpu_torch_indices = [index for index, is_old in is_old_torch.items() if is_old]

mocker.patch("zeus.monitor.energy.atexit.register")

class MockPowerMonitor:
def __init__(self, gpu_indices: list[int] | None, update_period: float | None) -> None:
assert gpu_indices == old_gpu_torch_indices
Expand All @@ -177,6 +179,9 @@ def get_energy(self, start: float, end: float) -> dict[int, float]:

log_file = tmp_path / "log.csv"

# `zeus.device.gpu.nvml_is_available` is a function; patch it so that it always returns True during testing.
mocker.patch("zeus.device.gpu.nvml_is_available", return_value=True)

########################################
# Test ZeusMonitor initialization.
########################################
Expand Down
1 change: 1 addition & 0 deletions zeus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
- [`analyze`][zeus.analyze]: Functions for analyzing log files.
- [`job`][zeus.job]: Job specification.
- [`simulate`][zeus.simulate]: Machinery for trace-driven Zeus.
- [`device`][zeus.device]: Abstraction of compute devices.
- [`policy`][zeus.policy]: Collection of optimization policies.
- [`util`][zeus.util]: Utility functions and classes.
"""
Expand Down
11 changes: 3 additions & 8 deletions zeus/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@

from __future__ import annotations

import pynvml

from zeus.callback import Callback
from zeus.monitor import ZeusMonitor
from zeus.util.metric import zeus_cost
from zeus.util.logging import get_logger
from zeus.device import get_gpus


class EarlyStopController(Callback):
Expand Down Expand Up @@ -80,17 +79,13 @@ def __init__(
self.logger = get_logger(type(self).__name__)

# Cache each GPU's maximum power limit if it's needed.
self.gpu_handles = {}
self.max_power = {}
if self.cost_threshold is not None:
assert self.monitor is not None
pynvml.nvmlInit()
gpus = get_gpus()
for gpu_index in self.monitor.gpu_indices:
device = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
self.gpu_handles[gpu_index] = device
self.max_power[gpu_index] = (
pynvml.nvmlDeviceGetPowerManagementLimitConstraints(device)[1]
// 1000
gpus.getPowerManagementLimitConstraints(gpu_index)[1] // 1000
)

# States.
Expand Down
80 changes: 80 additions & 0 deletions zeus/device/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""GPU device module for Zeus. Abstraction of GPU devices.
The main function of this module is [`get_gpus`][zeus.device.gpu.get_gpus], which returns a GPU Manager object specific to the platform.
To instantiate a GPU Manager object, you can do the following:
```python
from zeus.device import get_gpus
gpus = get_gpus() # Returns NVIDIAGPUs() or AMDGPUs() depending on the platform.
```
There exists a 1:1 mapping between specific library functions and methods implemented in the GPU Manager object.
For example, for NVIDIA systems, if you wanted to do:
```python
handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
constraints = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(handle)
```
You can now do:
```python
gpus = get_gpus() # returns an NVIDIAGPUs object
constraints = gpus.getPowerManagementLimitConstraints(gpu_index)
```
Class hierarchy:
- [`GPUs`][zeus.device.gpu.GPUs]: Abstract class for GPU managers.
- [`NVIDIAGPUs`][zeus.device.gpu.NVIDIAGPUs]: GPU manager for NVIDIA GPUs; initializes NVIDIAGPU objects.
- [`AMDGPUs`][zeus.device.gpu.AMDGPUs]: GPU manager for AMD GPUs; initializes AMDGPU objects.
- [`GPU`][zeus.device.gpu.GPU]: Abstract class for GPU objects.
- [`NVIDIAGPU`][zeus.device.gpu.NVIDIAGPU]: GPU object for NVIDIA GPUs.
- [`AMDGPU`][zeus.device.gpu.AMDGPU]: GPU object for AMD GPUs.
The following exceptions are defined in this module:
- [`ZeusGPUInitError`][zeus.device.gpu.ZeusGPUInitError]: Base class for initialization errors.
- [`ZeusGPUInvalidArgError`][zeus.device.gpu.ZeusGPUInvalidArgError]: Error for invalid arguments.
- [`ZeusGPUNotSupportedError`][zeus.device.gpu.ZeusGPUNotSupportedError]: Error for unsupported GPUs.
- [`ZeusGPUNoPermissionError`][zeus.device.gpu.ZeusGPUNoPermissionError]: Error for permission issues.
- [`ZeusGPUAlreadyInitializedError`][zeus.device.gpu.ZeusGPUAlreadyInitializedError]: Error for reinitialization.
- [`ZeusGPUNotFoundError`][zeus.device.gpu.ZeusGPUNotFoundError]: Error for missing GPUs.
- [`ZeusGPUInsufficientSizeError`][zeus.device.gpu.ZeusGPUInsufficientSizeError]: Error for insufficient buffer size.
- [`ZeusGPUInsufficientPowerError`][zeus.device.gpu.ZeusGPUInsufficientPowerError]: Error for insufficient power.
- [`ZeusGPUDriverNotLoadedError`][zeus.device.gpu.ZeusGPUDriverNotLoadedError]: Error for driver issues.
- [`ZeusGPUTimeoutError`][zeus.device.gpu.ZeusGPUTimeoutError]: Error for timeout issues.
- [`ZeusGPUIRQError`][zeus.device.gpu.ZeusGPUIRQError]: Error for IRQ issues.
- [`ZeusGPULibraryNotFoundError`][zeus.device.gpu.ZeusGPULibraryNotFoundError]: Error for missing libraries.
- [`ZeusGPUFunctionNotFoundError`][zeus.device.gpu.ZeusGPUFunctionNotFoundError]: Error for missing functions.
- [`ZeusGPUCorruptedInfoROMError`][zeus.device.gpu.ZeusGPUCorruptedInfoROMError]: Error for corrupted info ROM.
- [`ZeusGPULostError`][zeus.device.gpu.ZeusGPULostError]: Error for lost GPUs.
- [`ZeusGPUResetRequiredError`][zeus.device.gpu.ZeusGPUResetRequiredError]: Error for GPUs requiring reset.
- [`ZeusGPUOperatingSystemError`][zeus.device.gpu.ZeusGPUOperatingSystemError]: Error for OS issues.
- [`ZeusGPULibRMVersionMismatchError`][zeus.device.gpu.ZeusGPULibRMVersionMismatchError]: Error for library version mismatch.
- [`ZeusGPUMemoryError`][zeus.device.gpu.ZeusGPUMemoryError]: Error for memory issues.
- [`ZeusGPUUnknownError`][zeus.device.gpu.ZeusGPUUnknownError]: Error for unknown issues.
"""
from zeus.device.gpu import (
get_gpus,
ZeusGPUInitError,
ZeusGPUInvalidArgError,
ZeusGPUNotSupportedError,
ZeusGPUNoPermissionError,
ZeusGPUAlreadyInitializedError,
ZeusGPUNotFoundError,
ZeusGPUInsufficientSizeError,
ZeusGPUInsufficientPowerError,
ZeusGPUDriverNotLoadedError,
ZeusGPUTimeoutError,
ZeusGPUIRQError,
ZeusGPULibraryNotFoundError,
ZeusGPUFunctionNotFoundError,
ZeusGPUCorruptedInfoROMError,
ZeusGPULostError,
ZeusGPUResetRequiredError,
ZeusGPUOperatingSystemError,
ZeusGPULibRMVersionMismatchError,
ZeusGPUMemoryError,
ZeusGPUUnknownError,
)
10 changes: 10 additions & 0 deletions zeus/device/exception.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Base Zeus GPU Exception Class."""
from zeus.exception import ZeusBaseError


class ZeusBaseGPUError(ZeusBaseError):
    """Root exception type for GPU-related Zeus errors.

    Derives from `ZeusBaseError`.
    """

    def __init__(self, message: str) -> None:
        """Forward `message` to the `ZeusBaseError` constructor.

        Args:
            message: Text describing the error.
        """
        super().__init__(message)
Loading

0 comments on commit 9c6b3b0

Please sign in to comment.