Skip to content

Commit

Permalink
Merge pull request #95 from Snailed/fix/dram
Browse files Browse the repository at this point in the history
Improved logging of RAPL permission issues
  • Loading branch information
Snailed authored Feb 6, 2025
2 parents f7c5848 + e537a96 commit f819b5d
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 39 deletions.
24 changes: 14 additions & 10 deletions carbontracker/components/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
)
from carbontracker.components.handler import Handler
from typing import Iterable, List, Union, Type, Sized
from carbontracker.loggerutil import Logger
import os

COMPONENTS = [
{
Expand Down Expand Up @@ -43,7 +45,7 @@ def handlers_by_name(name) -> List[Type[Handler]]:


class Component:
def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool):
def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool, logger: Logger):
self.name = name
if name not in component_names():
raise exceptions.ComponentNameError(
Expand All @@ -54,6 +56,7 @@ def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool):
)
self.power_usages: List[List[float]] = []
self.cur_epoch: int = -1 # Sentry
self.logger = logger

@property
def handler(self) -> Handler:
Expand Down Expand Up @@ -97,18 +100,19 @@ def collect_power_usage(self, epoch: int):
self.power_usages.append([])
try:
self.power_usages[-1] += self.handler.power_usage()
except exceptions.IntelRaplPermissionError:
except exceptions.IntelRaplPermissionError as e:
energy_paths = " and ".join(e.file_names)
commands = ["sudo chmod +r " + energy_path for energy_path in e.file_names]
# Only raise error if no measurements have been collected.
if not self.power_usages[-1]:
print(
"No sudo access to read Intel's RAPL measurements from the energy_uj file."
"\nSee issue: https://github.com/lfwa/carbontracker/issues/40"
)
self.logger.err_critical(
r"Could not read CPU/DRAM energy consumption due to lack of read-permissions.\n\tPlease run the following command(s): \n\t\t" + r"\n\t\t".join(commands)
)
# Append zero measurement to avoid further errors.
self.power_usages.append([0])
except exceptions.GPUPowerUsageRetrievalError:
if not self.power_usages[-1]:
print(
self.logger.err_critical(
"GPU model does not support retrieval of power usages in NVML."
"\nSee issue: https://github.com/lfwa/carbontracker/issues/36"
)
Expand Down Expand Up @@ -154,16 +158,16 @@ def shutdown(self):


def create_components(
    components: str, pids: Iterable[int], devices_by_pid: bool, logger: Logger
) -> List[Component]:
    """Build one ``Component`` per requested component name.

    Args:
        components: Either ``"all"`` or a comma-separated list of component
            names. Spaces are removed and the text is lower-cased before it
            is interpreted.
        pids: Forwarded unchanged to every ``Component``.
        devices_by_pid: Forwarded unchanged to every ``Component``.
        logger: Logger handed to every ``Component``; it is used there to
            report failures while collecting power usage.

    Returns:
        A list with one ``Component`` per name, in the order of
        ``component_names()`` for ``"all"`` or in the order given otherwise.

    Raises:
        exceptions.ComponentNameError: Raised by ``Component`` when a name is
            not one of ``component_names()``.
    """
    # Normalise into a new local instead of rebinding the parameter.
    requested = components.strip().replace(" ", "").lower()
    names = component_names() if requested == "all" else requested.split(",")
    return [
        Component(name=comp_name, pids=pids, devices_by_pid=devices_by_pid, logger=logger)
        for comp_name in names
    ]
6 changes: 4 additions & 2 deletions carbontracker/components/cpu/intel.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@ def _read_energy(self, path: str) -> int:

def _get_measurements(self):
measurements = []
permission_errors = []
for package in self._rapl_devices:
try:
power_usage = self._read_energy(os.path.join(RAPL_DIR, package))
measurements.append(power_usage)
# If there is no sudo access, we cannot read the energy_uj file.
# Permission denied error is raised.
except PermissionError:
raise exceptions.IntelRaplPermissionError()
permission_errors += [os.path.join(RAPL_DIR, package, "energy_uj")]

except FileNotFoundError:
# check cpu/gpu/dram
Expand All @@ -79,7 +80,8 @@ def _get_measurements(self):
)

measurements.append(total_power_usage)

if permission_errors:
raise exceptions.IntelRaplPermissionError(permission_errors)
return measurements

def _convert_rapl_name(self, package, name, pattern) -> Union[None, str]:
Expand Down
5 changes: 4 additions & 1 deletion carbontracker/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List

class NoComponentsAvailableError(Exception):
def __init__(
self,
Expand All @@ -23,7 +25,8 @@ def __init__(self, expected_unit, received_unit, message):
class IntelRaplPermissionError(Exception):
    """Raised when an Intel RAPL permission error occurs.

    Attributes:
        file_names: Paths of the files that could not be read.
    """

    def __init__(self, file_names: List[str]):
        # Copy so the exception owns its list and handlers can iterate it
        # more than once regardless of what iterable was passed in.
        self.file_names = list(file_names)
        # Keep ``args`` equal to the constructor arguments so the exception
        # can be re-created from ``args`` (e.g. when copied or pickled).
        super().__init__(self.file_names)

    def __str__(self) -> str:
        # Without this, keyword construction yields an empty message.
        return "No read permission for: " + ", ".join(map(str, self.file_names))


class GPUPowerUsageRetrievalError(Exception):
Expand Down
2 changes: 1 addition & 1 deletion carbontracker/tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def __init__(
self.tracker = CarbonTrackerThread(
delete=self._delete,
components=component.create_components(
components=components, pids=pids, devices_by_pid=devices_by_pid
components=components, pids=pids, devices_by_pid=devices_by_pid, logger=self.logger
),
logger=self.logger,
ignore_errors=ignore_errors,
Expand Down
4 changes: 2 additions & 2 deletions tests/components/test_intel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_available(self, mock_listdir, mock_exists):
mock_exists.return_value = True
mock_listdir.return_value = ["some_directory"]

component = Component(name='cpu', pids=[], devices_by_pid={})
component = Component(name='cpu', pids=[], devices_by_pid={}, logger=None)
self.assertTrue(component.available())

@patch("os.path.exists")
Expand All @@ -35,7 +35,7 @@ def test_available_false(self, mock_available, mock_listdir, mock_exists):
mock_exists.return_value = False
mock_listdir.return_value = []

cpu = Component(name='cpu', pids=[], devices_by_pid={})
cpu = Component(name='cpu', pids=[], devices_by_pid={}, logger=None)
self.assertFalse(cpu.available())

@patch("time.sleep")
Expand Down
44 changes: 22 additions & 22 deletions tests/test_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,23 @@ class TestComponent(unittest.TestCase):
def test_init_valid_component(
self, mock_handlers_by_name, mock_error_by_name, mock_component_names
):
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
self.assertEqual(component.name, "gpu")
self.assertEqual(component._handler, mock_handlers_by_name()[0]())

def test_init_invalid_component(self):
    """An unknown component name is rejected by the constructor."""
    with self.assertRaises(exceptions.ComponentNameError):
        Component(name="unknown", pids=[], devices_by_pid=False, logger=None)

def test_devices(self):
    """devices() returns the device list reported by the handler."""
    handler_mock = MagicMock(devices=MagicMock(return_value=["Test GPU"]))
    component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
    component._handler = handler_mock
    self.assertEqual(component.devices(), ["Test GPU"])

def test_available_true(self):
    """available() is true when the handler reports availability."""
    handler_mock = MagicMock(available=MagicMock(return_value=True))
    component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
    component._handler = handler_mock
    self.assertTrue(component.available())

Expand All @@ -52,33 +52,33 @@ def test_available_true(self):
return_value=False,
)
def test_available_false(self, mock_apple_gpu_available, mock_nvidia_gpu_available):
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
self.assertFalse(component.available())

def test_collect_power_usage_no_measurement(self):
    """A RAPL permission error is logged and a zero measurement is recorded."""
    permission_error = exceptions.IntelRaplPermissionError(
        file_names=["file1", "file2"]
    )
    handler_mock = MagicMock(power_usage=MagicMock(side_effect=permission_error))
    logger_mock = MagicMock(err_critical=MagicMock())
    component = Component(
        name="cpu", pids=[], devices_by_pid=False, logger=logger_mock
    )
    component._handler = handler_mock
    component.collect_power_usage(epoch=1)
    self.assertEqual(component.power_usages, [[], [0]])
    # No measurement was collected, so the failure must have been reported.
    logger_mock.err_critical.assert_called_once()

def test_collect_power_usage_with_measurement(self):
    """A successful handler reading is stored for the epoch."""
    handler_mock = MagicMock(power_usage=MagicMock(return_value=[1000]))
    component = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
    component._handler = handler_mock
    component.collect_power_usage(epoch=1)
    self.assertEqual(component.power_usages, [[1000]])

def test_collect_power_usage_with_measurement_but_no_epoch(self):
    """Nothing is recorded when collecting for epoch 0."""
    power_collector = Component(
        name="cpu", pids=[], devices_by_pid=False, logger=None
    )
    power_collector._handler = MagicMock(power_usage=MagicMock(return_value=[1000]))
    power_collector.collect_power_usage(epoch=0)
    # unittest assertion instead of a bare assert, so it survives ``python -O``
    # and reports the actual length on failure.
    self.assertEqual(len(power_collector.power_usages), 0)

def test_collect_power_usage_with_previous_measurement(self):
power_collector = Component(name="cpu", pids=[], devices_by_pid=False)
power_collector = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
power_collector._handler = MagicMock(power_usage=MagicMock(return_value=[1000]))
power_collector.collect_power_usage(epoch=1)
power_collector.collect_power_usage(epoch=3)
Expand All @@ -88,13 +88,13 @@ def test_collect_power_usage_GPUPowerUsageRetrievalError(self):
handler_mock = MagicMock(
power_usage=MagicMock(side_effect=exceptions.GPUPowerUsageRetrievalError)
)
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=MagicMock(err_critical=MagicMock()))
component._handler = handler_mock
component.collect_power_usage(epoch=1)
self.assertEqual(component.power_usages, [[], [0]])

def test_energy_usage(self):
component = Component(name="cpu", pids=[], devices_by_pid=False)
component = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
component.power_usages = [[1000], [2000], [3000]]
epoch_times = [1, 2, 3]
energy_usages = component.energy_usage(epoch_times)
Expand All @@ -104,14 +104,14 @@ def test_energy_usage(self):
self.assertTrue(np.all(np.array(energy_usages) > 0))

def test_energy_usage_no_measurements(self):
    """An epoch without power measurements yields zero energy usage."""
    component = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
    component.power_usages = [[]]
    epoch_times = [1]
    energy_usages = component.energy_usage(epoch_times)
    self.assertEqual(energy_usages, [0])

def test_energy_usage_with_power_from_later_epoch(self):
component = Component(name="cpu", pids=[], devices_by_pid=False)
component = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
component.power_usages = [[1000], [2000], [3000]]
epoch_times = [1, 2, 3, 4]
energy_usages = component.energy_usage(epoch_times)
Expand All @@ -121,7 +121,7 @@ def test_energy_usage_with_power_from_later_epoch(self):
)

def test_energy_usage_no_power(self):
component = Component(name="cpu", pids=[], devices_by_pid=False)
component = Component(name="cpu", pids=[], devices_by_pid=False, logger=None)
component.power_usages = [[], [], [], [], []]
epoch_times = [1, 2, 3, 4, 5]
energy_usages = component.energy_usage(epoch_times)
Expand All @@ -132,7 +132,7 @@ def test_energy_usage_no_power(self):

def test_init(self):
handler_mock = MagicMock()
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
component._handler = handler_mock
component.init()
handler_mock.init.assert_called_once()
Expand All @@ -144,15 +144,15 @@ def test_init(self):

def test_shutdown(self):
    """shutdown() calls the handler's shutdown exactly once."""
    handler_mock = MagicMock()
    component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
    component._handler = handler_mock
    component.shutdown()
    handler_mock.shutdown.assert_called_once()

def test_create_components(self):
gpu = create_components("gpu", pids=[], devices_by_pid=False)
cpu = create_components("cpu", pids=[], devices_by_pid=False)
all_components = create_components("all", pids=[], devices_by_pid=False)
gpu = create_components("gpu", pids=[], devices_by_pid=False, logger=None)
cpu = create_components("cpu", pids=[], devices_by_pid=False, logger=None)
all_components = create_components("all", pids=[], devices_by_pid=False, logger=None)
self.assertEqual(len(gpu), 1)
self.assertEqual(len(cpu), 1)
self.assertEqual(len(all_components), 2)
Expand All @@ -166,12 +166,12 @@ def test_error_by_name(self):
)

def test_handler_property_with_handler_set(self):
    """The handler property returns the handler that was set."""
    component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
    component._handler = "test"
    self.assertEqual(component.handler, "test")

def test_handler_property_without_handler(self):
component = Component(name="gpu", pids=[], devices_by_pid=False)
component = Component(name="gpu", pids=[], devices_by_pid=False, logger=None)
component._handler = None
with self.assertRaises(exceptions.GPUError):
component.handler()
Expand Down
2 changes: 1 addition & 1 deletion tests/test_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_unit_error(self):

def test_intel_rapl_permission_error(self):
    """The exception can be raised and carries the unreadable file names."""
    with self.assertRaises(exceptions.IntelRaplPermissionError) as raised:
        raise exceptions.IntelRaplPermissionError(file_names=["file1", "file2"])
    # Handlers rely on file_names to build the suggested chmod commands.
    self.assertEqual(list(raised.exception.file_names), ["file1", "file2"])

def test_gpu_power_usage_retrieval_error(self):
with self.assertRaises(exceptions.GPUPowerUsageRetrievalError):
Expand Down

0 comments on commit f819b5d

Please sign in to comment.