Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
nagworld9 committed Dec 1, 2024
1 parent aab7e4f commit 5388684
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 110 deletions.
82 changes: 19 additions & 63 deletions azurelinuxagent/ga/cgroupconfigurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,9 +460,9 @@ def disable(self, reason, disable_cgroups):
self._reset_cpu_quota(systemd.get_agent_unit_name())
extension_services = self.get_extension_services_list()
for extension in extension_services:
log_cgroup_info("Resetting extension : {0} and it's services: {1} CPUQuota".format(extension, extension_services[extension]), send_event=False)
self._reset_extension_cpu_quota(extension_name=extension)
self._reset_extension_services_cpu_quota(extension_services[extension])
log_cgroup_info("Resetting extension : {0} and it's services: {1} Quota".format(extension, extension_services[extension]), send_event=False)
self.reset_extension_quota(extension_name=extension)
self.reset_extension_services_quota(extension_services[extension])
CGroupsTelemetry.reset()
self._agent_cgroups_enabled = False
self._extensions_cgroups_enabled = False
Expand Down Expand Up @@ -577,7 +577,7 @@ def _check_processes_in_agent_cgroup(self, check_before_enable=False):
Raises a CGroupsException only when current unexpected process seen last time.
Note: This check added as conservative approach before cgroups feature stable. Now this producing noise due to race issues, we identify extra process before systemd move to new cgroup or process about to die.
So changing the behavior to report only when we see the unexpected process last time. Later we will remove this check if no issues reported.
So changing the behavior to report only when we see the same unexpected process last time. Later we will remove this check if no issues reported.
"""
current_unexpected = {}
agent_cgroup_proc_names = []
Expand Down Expand Up @@ -615,6 +615,7 @@ def _check_processes_in_agent_cgroup(self, check_before_enable=False):
# If so, consider it as valid process in agent cgroup.
if current == 0 and not (self._is_process_descendant_of_the_agent(process) or self._is_zombie_process(process)):
current_unexpected[process] = self._format_process(process)
# old behavior when check before enable cgroups
if check_before_enable:
report = [current_unexpected[process] for process in current_unexpected]
else:
Expand Down Expand Up @@ -785,7 +786,7 @@ def stop_tracking_unit_cgroups(self, unit_name):
controllers = cgroup.get_controllers()

for controller in controllers:
CGroupsTelemetry.stop_tracking(controller)
CGroupsTelemetry.stop_tracking(controller)

except Exception as exception:
log_cgroup_info("Failed to stop tracking resource usage for the extension service: {0}".format(ustr(exception)), send_event=False)
Expand Down Expand Up @@ -834,18 +835,8 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh
process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout, stderr=stderr, preexec_fn=os.setsid) # pylint: disable=W1509
return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, stderr=stderr, error_code=error_code)

def _reset_extension_cpu_quota(self, extension_name):
"""
Removes any CPUQuota on the extension
NOTE: This resets the quota on the extension's slice; any local overrides on the VM will take precedence
over this setting.
"""
if self.enabled():
self._reset_cpu_quota(CGroupUtil.get_extension_slice_name(extension_name))

@staticmethod
def get_unit_properties_requiring_update(unit_name, cpu_quota=""):
def _get_unit_properties_requiring_update(unit_name, cpu_quota=""):
"""
Check if the cgroups setup is completed for the unit and return the properties that need an update.
"""
Expand Down Expand Up @@ -879,16 +870,16 @@ def setup_extension_slice(self, extension_name, cpu_quota):
extension_slice_path = os.path.join(unit_file_install_path, extension_slice)
CGroupConfigurator._Impl._cleanup_unit_file(extension_slice_path)

# clean up the old-old slice(includes version in the name) from the disk
old_extension_slice_path = os.path.join(unit_file_install_path,
CGroupUtil.get_extension_slice_name(extension_name,
old_slice=True))
# clean up the old slice(includes version in the name) from the disk
if os.path.exists(old_extension_slice_path):
CGroupConfigurator._Impl._cleanup_unit_file(old_extension_slice_path)

cpu_quota = "{0}%".format(
cpu_quota) if cpu_quota is not None else "" # setting an empty value resets to the default (infinity)
properties_to_update = self.get_unit_properties_requiring_update(extension_slice, cpu_quota)
properties_to_update = self._get_unit_properties_requiring_update(extension_slice, cpu_quota)

if len(properties_to_update) > 0:
if cpu_quota == "":
Expand All @@ -903,19 +894,15 @@ def setup_extension_slice(self, extension_name, cpu_quota):
log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}".format(extension_slice,
ustr(exception)))

def remove_extension_slice(self, extension_name):
def reset_extension_quota(self, extension_name):
"""
This method ensures that the extension slice gets removed from /lib/systemd/system if it exist
Lastly stop the unit. This would ensure the cleanup the /sys/fs/cgroup controller paths
Removes any CPUQuota on the extension
NOTE: This resets the quota on the extension's slice; any local overrides on the VM will take precedence
over this setting.
"""
if self.enabled():
unit_file_install_path = systemd.get_unit_file_install_path()
extension_slice_name = CGroupUtil.get_extension_slice_name(extension_name)
extension_slice_path = os.path.join(unit_file_install_path, extension_slice_name)
if os.path.exists(extension_slice_path):
self.stop_tracking_extension_cgroups(extension_name)
CGroupConfigurator._Impl._cleanup_unit_file(extension_slice_path)
self._reset_extension_cpu_quota(extension_name)
self._reset_cpu_quota(CGroupUtil.get_extension_slice_name(extension_name))

def set_extension_services_cpu_memory_quota(self, services_list):
"""
Expand Down Expand Up @@ -944,7 +931,7 @@ def set_extension_services_cpu_memory_quota(self, services_list):

cpu_quota = service.get('cpuQuotaPercentage')
cpu_quota = "{0}%".format(cpu_quota) if cpu_quota is not None else "" # setting an empty value resets to the default (infinity)
properties_to_update = self.get_unit_properties_requiring_update(service_name, cpu_quota)
properties_to_update = self._get_unit_properties_requiring_update(service_name, cpu_quota)
if systemd.is_unit_loaded(service_name) and len(properties_to_update) > 0:
if cpu_quota != "":
log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}".format(service_name, cpu_quota))
Expand All @@ -956,7 +943,7 @@ def set_extension_services_cpu_memory_quota(self, services_list):
except Exception as exception:
log_cgroup_warning("Failed to set the quotas for {0}: {1}".format(service_name, ustr(exception)))

def _reset_extension_services_cpu_quota(self, services_list):
def reset_extension_services_quota(self, services_list):
"""
Removes any CPUQuota on the extension service
Expand All @@ -968,41 +955,10 @@ def _reset_extension_services_cpu_quota(self, services_list):
try:
for service in services_list:
service_name = service.get('name', None)
unit_file_path = systemd.get_unit_file_install_path()
if service_name is not None and unit_file_path is not None:
drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
self._cleanup_unit_file(drop_in_file_cpu_quota)
if service_name is not None and systemd.is_unit_loaded(service_name):
self._reset_cpu_quota(service_name)
except Exception as exception:
log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}'.format(service_name, ustr(exception)))

def remove_extension_services_drop_in_files(self, services_list):
"""
Remove the dropin files from service .d folder for the given service and reset the quotas
"""
if services_list is not None:
for service in services_list:
service_name = service.get('name', None)
unit_file_path = systemd.get_unit_file_install_path()
if service_name is not None and unit_file_path is not None:
files_to_cleanup = []
drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name))
drop_in_file_cpu_accounting = os.path.join(drop_in_path,
_DROP_IN_FILE_CPU_ACCOUNTING)
files_to_cleanup.append(drop_in_file_cpu_accounting)
drop_in_file_memory_accounting = os.path.join(drop_in_path,
_DROP_IN_FILE_MEMORY_ACCOUNTING)
files_to_cleanup.append(drop_in_file_memory_accounting)
cpu_quota = service.get('cpuQuotaPercentage', None)
if cpu_quota is not None:
drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA)
files_to_cleanup.append(drop_in_file_cpu_quota)

CGroupConfigurator._Impl._cleanup_all_files(files_to_cleanup)
log_cgroup_info("Drop in files removed for {0}".format(service_name))

self._reset_extension_services_cpu_quota(services_list)
log_cgroup_warning('Failed to reset for {0} : {1}'.format(service_name, ustr(exception)))

def stop_tracking_extension_services_cgroups(self, services_list):
"""
Expand Down
5 changes: 3 additions & 2 deletions azurelinuxagent/ga/exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1475,7 +1475,7 @@ def uninstall(self, extension=None):
resource_limits = man.get_resource_limits()
CGroupConfigurator.get_instance().stop_tracking_extension_services_cgroups(
resource_limits.get_service_list())
CGroupConfigurator.get_instance().remove_extension_services_drop_in_files(
CGroupConfigurator.get_instance().reset_extension_services_quota(
resource_limits.get_service_list())

uninstall_cmd = man.get_uninstall_command()
Expand All @@ -1501,8 +1501,9 @@ def on_rmtree_error(_, __, exc_info):

shutil.rmtree(base_dir, onerror=on_rmtree_error)

CGroupConfigurator.get_instance().stop_tracking_extension_cgroups(self.get_full_name())
self.logger.info("Remove the extension slice: {0}".format(self.get_full_name()))
CGroupConfigurator.get_instance().remove_extension_slice(
CGroupConfigurator.get_instance().reset_extension_quota(
extension_name=self.get_full_name())

except IOError as e:
Expand Down
54 changes: 17 additions & 37 deletions tests/ga/test_cgroupconfigurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,31 +250,17 @@ def test_setup_extension_slice_should_create_unit_files(self):
self.assertFalse(os.path.exists(extension_slice_unit_file), "{0} was not created".format(extension_slice_unit_file))


def test_remove_extension_slice_should_remove_unit_files(self):
with self._get_cgroup_configurator() as configurator:
with patch("os.path.exists") as mock_path:
mock_path.return_value = True
# get the paths to the mocked files
extension_slice_unit_file = configurator.mocks.get_mapped_path(UnitFilePaths.extensionslice)

CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/' \
'azure-vmextensions-Microsoft.CPlat.Extension.slice'] = \
CpuControllerV1('Microsoft.CPlat.Extension',
'/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/azure-vmextensions-Microsoft.CPlat.Extension.slice')

def test_reset_extension_quota(self):
command_mocks = [MockCommand(r"^systemctl show (.+) --property CPUQuotaPerSecUSec",
'''CPUQuotaPerSecUSec=5ms
''')]
with self._get_cgroup_configurator(mock_commands=command_mocks) as configurator:
extension_name = "Microsoft.CPlat.Extension"
configurator.remove_extension_slice(extension_name=extension_name)
configurator.reset_extension_quota(extension_name=extension_name)

tracked = CGroupsTelemetry._tracked

self.assertFalse(
any(cg for cg in tracked.values() if cg.name == 'Microsoft.CPlat.Extension' and 'cpu' in cg.path),
"The extension's CPU is being tracked")
self.assertFalse(
any(cg for cg in tracked.values() if cg.name == 'Microsoft.CPlat.Extension' and 'memory' in cg.path),
"The extension's Memory is being tracked")

self.assertFalse(os.path.exists(extension_slice_unit_file), "{0} should not be present".format(extension_slice_unit_file))
command = 'systemctl set-property azure-vmextensions-{0}.slice CPUQuota= --runtime'.format(
extension_name)
self.assertIn(command, configurator.mocks.commands_call_list, "The command to reset the CPU quota was not called")

def test_enable_should_raise_cgroups_exception_when_cgroups_are_not_supported(self):
with self._get_cgroup_configurator(enable=False) as configurator:
Expand Down Expand Up @@ -686,26 +672,20 @@ def test_it_should_stop_tracking_extension_services_cgroups(self):
any(cg for cg in tracked.values() if cg.name == 'extension.service' and 'memory' in cg.path),
"The extension service's Memory is being tracked")

def test_it_should_remove_extension_services_drop_in_files(self):
def test_it_should_reset_extension_services_quota(self):
command_mocks = [MockCommand(r"^systemctl show extension\.service --property CPUQuotaPerSecUSec",
'''CPUQuotaPerSecUSec=5ms
''')]
service_list = [
{
"name": "extension.service",
"cpuQuotaPercentage": 5
}
]
with self._get_cgroup_configurator() as configurator:
extension_service_cpu_accounting = configurator.mocks.get_mapped_path(
UnitFilePaths.extension_service_cpu_accounting)
extension_service_cpu_quota = configurator.mocks.get_mapped_path(UnitFilePaths.extension_service_cpu_quota)
extension_service_memory_accounting = configurator.mocks.get_mapped_path(
UnitFilePaths.extension_service_memory_accounting)
configurator.remove_extension_services_drop_in_files(service_list)
self.assertFalse(os.path.exists(extension_service_cpu_accounting),
"{0} should not have been created".format(extension_service_cpu_accounting))
self.assertFalse(os.path.exists(extension_service_cpu_quota),
"{0} should not have been created".format(extension_service_cpu_quota))
self.assertFalse(os.path.exists(extension_service_memory_accounting),
"{0} should not have been created".format(extension_service_memory_accounting))
with self._get_cgroup_configurator(mock_commands=command_mocks) as configurator:
configurator.reset_extension_services_quota(service_list)
cmd = 'systemctl set-property extension.service CPUQuota= --runtime'
self.assertIn(cmd, configurator.mocks.commands_call_list, "The command to set the reset CPU quota was not called")

def test_it_should_start_tracking_unit_cgroups(self):

Expand Down
8 changes: 7 additions & 1 deletion tests_e2e/tests/agent_cgroups/agent_cgroups_process_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from typing import List, Dict, Any

from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION
from tests_e2e.tests.lib.agent_test import AgentVmTest
from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext
from tests_e2e.tests.lib.logging import log
Expand Down Expand Up @@ -68,7 +69,12 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
# [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 2957] dd\x00if=/dev/zero\x00of=/dev/null\x00 ']
# 2024-04-01T19:17:04.995276Z WARNING ExtHandler ExtHandler [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 3285] /usr/bin/python3\x00/var/lib/waagent/Microsoft.Azure.Monitor.AzureM', '[PID: 3286] /usr/bin/python3\x00/var/lib/waagent/Microsoft.Azure.Monitor.AzureM']
{'message': r"The agent's cgroup includes unexpected processes"},
{'message': r"Found unexpected processes in the agent cgroup before agent enable cgroups"}
{'message': r"Found unexpected processes in the agent cgroup before agent enable cgroups"},

# Ubuntu 16 has an issue representing no quota as infinity, instead it outputs weird values. https://github.com/systemd/systemd/issues/5965, so ignoring in ubuntu 16
# 2024-11-26T00:07:38.716162Z INFO ExtHandler ExtHandler [CGW] Error parsing current CPUQuotaPerSecUSec: could not convert string to float: '584542y 2w 2d 20h 1min 49.549568'
{'message': r"Error parsing current CPUQuotaPerSecUSec: could not convert string to float", 'if': lambda r: DISTRO_NAME == 'ubuntu' and DISTRO_VERSION == '16.04'}

]
return ignore_rules

Expand Down
Loading

0 comments on commit 5388684

Please sign in to comment.