Skip to content

Commit

Permalink
improved error message for RAPL permission issues
Browse files Browse the repository at this point in the history
  • Loading branch information
Snailed committed Feb 6, 2025
1 parent 1bc0854 commit 335efe3
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 14 deletions.
24 changes: 14 additions & 10 deletions carbontracker/components/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
)
from carbontracker.components.handler import Handler
from typing import Iterable, List, Union, Type, Sized
from carbontracker.loggerutil import Logger
import os

COMPONENTS = [
{
Expand Down Expand Up @@ -43,7 +45,7 @@ def handlers_by_name(name) -> List[Type[Handler]]:


class Component:
def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool):
def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool, logger: Logger):
self.name = name
if name not in component_names():
raise exceptions.ComponentNameError(
Expand All @@ -54,6 +56,7 @@ def __init__(self, name: str, pids: Iterable[int], devices_by_pid: bool):
)
self.power_usages: List[List[float]] = []
self.cur_epoch: int = -1 # Sentry
self.logger = logger

@property
def handler(self) -> Handler:
Expand Down Expand Up @@ -97,18 +100,19 @@ def collect_power_usage(self, epoch: int):
self.power_usages.append([])
try:
self.power_usages[-1] += self.handler.power_usage()
except exceptions.IntelRaplPermissionError:
except exceptions.IntelRaplPermissionError as e:
energy_paths = " and ".join(e.file_names)
commands = ["sudo chmod +r " + energy_path for energy_path in e.file_names]
# Only raise error if no measurements have been collected.
if not self.power_usages[-1]:
print(
"No sudo access to read Intel's RAPL measurements from the energy_uj file."
"\nSee issue: https://github.com/lfwa/carbontracker/issues/40"
)
self.logger.err_critical(
f"Could not read CPU/DRAM energy consumption due to lack of read-permissions.\n\tPlease run the following command(s): \n\t\t{"\n\t\t".join(commands)}"
)
# Append zero measurement to avoid further errors.
self.power_usages.append([0])
except exceptions.GPUPowerUsageRetrievalError:
if not self.power_usages[-1]:
print(
self.logger.err_critical(
"GPU model does not support retrieval of power usages in NVML."
"\nSee issue: https://github.com/lfwa/carbontracker/issues/36"
)
Expand Down Expand Up @@ -154,16 +158,16 @@ def shutdown(self):


def create_components(
    components: str, pids: Iterable[int], devices_by_pid: bool, logger: Logger
) -> List[Component]:
    """Build one ``Component`` per requested component name.

    Args:
        components: Comma-separated component names, or ``"all"`` to select
            every name returned by ``component_names()``. Surrounding
            whitespace, embedded spaces and letter case are ignored.
        pids: Process ids forwarded unchanged to every ``Component``.
        devices_by_pid: Flag forwarded unchanged to every ``Component``.
        logger: Logger handed to every ``Component`` so that measurement
            problems are reported through it.

    Returns:
        The created components, in the order the names were resolved.

    Raises:
        exceptions.ComponentNameError: Raised by ``Component`` when a
            requested name is not one of ``component_names()``.
    """
    # Normalise once so "All", " cpu , gpu " etc. are all accepted.
    requested = components.strip().replace(" ", "").lower()
    names = component_names() if requested == "all" else requested.split(",")
    return [
        Component(
            name=comp_name,
            pids=pids,
            devices_by_pid=devices_by_pid,
            logger=logger,
        )
        for comp_name in names
    ]
6 changes: 4 additions & 2 deletions carbontracker/components/cpu/intel.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@ def _read_energy(self, path: str) -> int:

def _get_measurements(self):
measurements = []
permission_errors = []
for package in self._rapl_devices:
try:
power_usage = self._read_energy(os.path.join(RAPL_DIR, package))
measurements.append(power_usage)
# If there is no sudo access, we cannot read the energy_uj file.
# Permission denied error is raised.
except PermissionError:
raise exceptions.IntelRaplPermissionError()
permission_errors += [os.path.join(RAPL_DIR, package, "energy_uj")]

except FileNotFoundError:
# check cpu/gpu/dram
Expand All @@ -79,7 +80,8 @@ def _get_measurements(self):
)

measurements.append(total_power_usage)

if permission_errors:
raise exceptions.IntelRaplPermissionError(permission_errors)
return measurements

def _convert_rapl_name(self, package, name, pattern) -> Union[None, str]:
Expand Down
3 changes: 2 additions & 1 deletion carbontracker/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def __init__(self, expected_unit, received_unit, message):
class IntelRaplPermissionError(Exception):
    """Raised when an Intel RAPL permission error occurs.

    Attributes:
        file_names: Paths of the files that could not be read. Empty when
            the error was raised without naming any file.
    """

    def __init__(self, file_names: "list[str] | None" = None):
        # Optional so that raising the error without arguments keeps working.
        # Copy the list so later changes by the caller do not alter the error.
        self.file_names = list(file_names) if file_names is not None else []
        # Keep ``args`` equal to ``(file_names,)`` so the exception can be
        # re-created from its args (e.g. when copied or pickled).
        super().__init__(self.file_names)

    def __str__(self) -> str:
        """Return a readable message listing the unreadable files, if any."""
        message = "No read permission for Intel RAPL energy file(s)"
        if self.file_names:
            message += ": " + ", ".join(self.file_names)
        return message


class GPUPowerUsageRetrievalError(Exception):
Expand Down
2 changes: 1 addition & 1 deletion carbontracker/tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def __init__(
self.tracker = CarbonTrackerThread(
delete=self._delete,
components=component.create_components(
components=components, pids=pids, devices_by_pid=devices_by_pid
components=components, pids=pids, devices_by_pid=devices_by_pid, logger=self.logger
),
logger=self.logger,
ignore_errors=ignore_errors,
Expand Down

0 comments on commit 335efe3

Please sign in to comment.