Skip to content

Commit

Permalink
cgroup rewrite
Browse files Browse the repository at this point in the history
  • Loading branch information
nagworld9 committed Dec 3, 2024
1 parent 3e2a70d commit fa8216a
Show file tree
Hide file tree
Showing 21 changed files with 586 additions and 522 deletions.
36 changes: 36 additions & 0 deletions azurelinuxagent/common/osutil/systemd.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,39 @@ def get_unit_property(unit_name, property_name):
raise ValueError("Can't find property {0} of {1}".format(property_name, unit_name))
return match.group('value')


def set_unit_property_run_time(unit_name, property_name, value):
"""
Set a property of a unit at runtime
Note: --runtime settings only apply until the next reboot
"""
try:
# Ex: systemctl set-property foobar.service CPUWeight=200 --runtime
shellutil.run_command(["systemctl", "set-property", unit_name, "{0}={1}".format(property_name, value), "--runtime"])
except shellutil.CommandError as e:
raise ValueError("Can't set property {0} of {1}: {2}".format(property_name, unit_name, e))


def set_unit_properties_run_time(unit_name, properties):
"""
Set multiple properties of a unit at runtime
Note: --runtime settings only apply until the next reboot
"""
try:
# Ex: systemctl set-property foobar.service CPUWeight=200 MemoryMax=2G IPAccounting=yes --runtime
shellutil.run_command(["systemctl", "set-property", unit_name] + properties + ["--runtime"])
except shellutil.CommandError as e:
raise ValueError("Can't set properties {0} of {1}: {2}".format(properties, unit_name, e))


def is_unit_loaded(unit_name):
"""
Determine if a unit is loaded
"""
try:
value = get_unit_property(unit_name, "LoadState")
return value.lower() == "loaded"
except shellutil.CommandError:
return False
7 changes: 2 additions & 5 deletions azurelinuxagent/ga/cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class CGroupUtil(object):
Cgroup utility methods which are independent of systemd cgroup api.
"""
@staticmethod
def cgroups_supported():
def distro_supported():
distro_info = get_distro()
distro_name = distro_info[0]
try:
Expand Down Expand Up @@ -410,6 +410,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh
for controller in cgroup.get_controllers():
if isinstance(controller, _CpuController):
cpu_controller = controller
log_cgroup_info('{0}'.format(controller))
CGroupsTelemetry.track_cgroup_controller(controller)

except IOError as e:
Expand Down Expand Up @@ -650,8 +651,6 @@ def get_controllers(self, expected_relative_path=None):
controller = MemoryControllerV1(self._cgroup_name, controller_path)

if controller is not None:
msg = "{0} controller for cgroup: {1}".format(supported_controller_name, controller)
log_cgroup_info(msg)
controllers.append(controller)

return controllers
Expand Down Expand Up @@ -729,8 +728,6 @@ def get_controllers(self, expected_relative_path=None):
controller = MemoryControllerV2(self._cgroup_name, self._cgroup_path)

if controller is not None:
msg = "{0} controller for cgroup: {1}".format(supported_controller_name, controller)
log_cgroup_info(msg)
controllers.append(controller)

return controllers
Expand Down
556 changes: 288 additions & 268 deletions azurelinuxagent/ga/cgroupconfigurator.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion azurelinuxagent/ga/cgroupcontroller.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def is_active(self):
' Internal error: {1}'.format(self.path, ustr(e)))
return False

def get_tracked_metrics(self, **_):
def get_tracked_metrics(self):
"""
Retrieves the current value of the metrics tracked for this controller/cgroup and returns them as an array.
"""
Expand Down
14 changes: 2 additions & 12 deletions azurelinuxagent/ga/cgroupstelemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,8 @@ class CGroupsTelemetry(object):
"""
"""
_tracked = {}
_track_throttled_time = False
_rlock = threading.RLock()

@staticmethod
def set_track_throttled_time(value):
CGroupsTelemetry._track_throttled_time = value

@staticmethod
def get_track_throttled_time():
return CGroupsTelemetry._track_throttled_time

@staticmethod
def track_cgroup_controller(cgroup_controller):
"""
Expand Down Expand Up @@ -79,7 +70,7 @@ def poll_all_tracked():
with CGroupsTelemetry._rlock:
for controller in CGroupsTelemetry._tracked.values():
try:
metrics.extend(controller.get_tracked_metrics(track_throttled_time=CGroupsTelemetry._track_throttled_time))
metrics.extend(controller.get_tracked_metrics())
except Exception as e:
# There can be scenarios when the CGroup has been deleted by the time we are fetching the values
# from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
Expand All @@ -98,5 +89,4 @@ def poll_all_tracked():
@staticmethod
def reset():
with CGroupsTelemetry._rlock:
CGroupsTelemetry._tracked.clear() # emptying the dictionary
CGroupsTelemetry._track_throttled_time = False
CGroupsTelemetry._tracked.clear() # emptying the dictionary
2 changes: 1 addition & 1 deletion azurelinuxagent/ga/collect_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ def get_max_recorded_metrics(self):
def _poll_resource_usage(self):
metrics = []
for controller in self.controllers:
metrics.extend(controller.get_tracked_metrics(track_throttled_time=True))
metrics.extend(controller.get_tracked_metrics())

for metric in metrics:
current_max = self.max_recorded_metrics.get(metric.counter)
Expand Down
9 changes: 4 additions & 5 deletions azurelinuxagent/ga/cpucontroller.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,18 +108,17 @@ def get_cpu_throttled_time(self, read_previous_throttled_time=True):
"""
raise NotImplementedError()

def get_tracked_metrics(self, **kwargs):
def get_tracked_metrics(self):
# Note: If the current cpu usage is less than the previous usage (metric is negative), then an empty array will
# be returned and the agent won't track the metrics.
tracked = []
cpu_usage = self.get_cpu_usage()
if cpu_usage >= float(0):
tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage))

if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']:
throttled_time = self.get_cpu_throttled_time()
if cpu_usage >= float(0) and throttled_time >= float(0):
tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time))
throttled_time = self.get_cpu_throttled_time()
if cpu_usage >= float(0) and throttled_time >= float(0):
tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time))

return tracked

Expand Down
14 changes: 7 additions & 7 deletions azurelinuxagent/ga/exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1337,11 +1337,10 @@ def set_extension_resource_limits(self):
# setup the resource limits for extension operations and it's services.
man = self.load_manifest()
resource_limits = man.get_resource_limits()
if not CGroupConfigurator.get_instance().is_extension_resource_limits_setup_completed(extension_name,
cpu_quota=resource_limits.get_extension_slice_cpu_quota()):
CGroupConfigurator.get_instance().setup_extension_slice(
extension_name=extension_name, cpu_quota=resource_limits.get_extension_slice_cpu_quota())
CGroupConfigurator.get_instance().set_extension_services_cpu_memory_quota(resource_limits.get_service_list())

CGroupConfigurator.get_instance().setup_extension_slice(
extension_name=extension_name, cpu_quota=resource_limits.get_extension_slice_cpu_quota())
CGroupConfigurator.get_instance().set_extension_services_cpu_memory_quota(resource_limits.get_service_list())

def create_status_file_if_not_exist(self, extension, status, code, operation, message):
_, status_path = self.get_status_file_path(extension)
Expand Down Expand Up @@ -1476,7 +1475,7 @@ def uninstall(self, extension=None):
resource_limits = man.get_resource_limits()
CGroupConfigurator.get_instance().stop_tracking_extension_services_cgroups(
resource_limits.get_service_list())
CGroupConfigurator.get_instance().remove_extension_services_drop_in_files(
CGroupConfigurator.get_instance().reset_extension_services_quota(
resource_limits.get_service_list())

uninstall_cmd = man.get_uninstall_command()
Expand All @@ -1502,8 +1501,9 @@ def on_rmtree_error(_, __, exc_info):

shutil.rmtree(base_dir, onerror=on_rmtree_error)

CGroupConfigurator.get_instance().stop_tracking_extension_cgroups(self.get_full_name())
self.logger.info("Remove the extension slice: {0}".format(self.get_full_name()))
CGroupConfigurator.get_instance().remove_extension_slice(
CGroupConfigurator.get_instance().reset_extension_quota(
extension_name=self.get_full_name())

except IOError as e:
Expand Down
2 changes: 1 addition & 1 deletion tests/ga/test_cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_cgroups_should_be_supported_only_on_ubuntu16_centos7dot4_redhat7dot4_an

for (distro, supported) in test_cases:
with patch("azurelinuxagent.ga.cgroupapi.get_distro", return_value=distro):
self.assertEqual(CGroupUtil.cgroups_supported(), supported, "cgroups_supported() failed on {0}".format(distro))
self.assertEqual(CGroupUtil.distro_supported(), supported, "cgroups_supported() failed on {0}".format(distro))


class SystemdCgroupsApiTestCase(AgentTestCase):
Expand Down
Loading

0 comments on commit fa8216a

Please sign in to comment.