Skip to content

Commit

Permalink
agent and ext cgroups scenario (#2866)
Browse files Browse the repository at this point in the history
* agent-cgroups scenario

* address comments

* address comments

* fix-pylint

* pylint warn

* address comments

* improved logging"

* improved ext cgroups scenario

* new changes

* pylint fix

* updated

* address comments

* pylint warn

* address comment

* merge conflicts
  • Loading branch information
nagworld9 authored Aug 22, 2023
1 parent 14f6124 commit a2977b8
Show file tree
Hide file tree
Showing 12 changed files with 710 additions and 2 deletions.
2 changes: 1 addition & 1 deletion tests_e2e/orchestrator/runbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ variable:
#
# The test suites to execute
- name: test_suites
value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_update, agent_status, multi_config_ext"
value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_update, agent_status, multi_config_ext, agent_cgroups, ext_cgroups"
- name: cloud
value: "AzureCloud"
is_case_visible: true
Expand Down
7 changes: 7 additions & 0 deletions tests_e2e/test_suites/agent_cgroups.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#
# This test suite verifies that the agent is running in the expected cgroups and that the agent is tracking those cgroups to poll resource metrics.
#
name: "AgentCgroups"
tests:
- "agent_cgroups/agent_cgroups.py"
images: "cgroups-endorsed"
10 changes: 10 additions & 0 deletions tests_e2e/test_suites/ext_cgroups.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#
# This test suite installs a few extensions and
# verifies that those extensions are running in the expected cgroups and that the agent is tracking those cgroups to poll resource metrics.
#
name: "ExtCgroups"
tests:
- "ext_cgroups/ext_cgroups.py"
images: "cgroups-endorsed"
# The DCR test extension installs a sample service, so this test suite uses it to test service cgroups; however, the extension is published only in the southcentralus region of the public cloud.
locations: "AzureCloud:southcentralus"
6 changes: 6 additions & 0 deletions tests_e2e/test_suites/images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ image-sets:
- "rhel_90_arm64"
- "ubuntu_2204_arm64"

# As of today, the agent supports and enables the resource governance feature only on the following distros
cgroups-endorsed:
- "ubuntu_1604"
- "ubuntu_1804"
- "ubuntu_2004"

#
# An image can be specified by a string giving its urn, as in
#
Expand Down
40 changes: 40 additions & 0 deletions tests_e2e/tests/agent_cgroups/agent_cgroups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from tests_e2e.tests.lib.agent_test import AgentTest
from tests_e2e.tests.lib.agent_test_context import AgentTestContext
from tests_e2e.tests.lib.logging import log


class AgentCgroups(AgentTest):
    """
    End-to-end test checking that the agent runs in the cgroups it is expected to be in.

    The checks themselves live in the agent_cgroups-check_cgroups_agent.py script, which
    is launched through self._run_remote_test().
    """

    def __init__(self, context: AgentTestContext):
        super().__init__(context)
        # SSH client obtained from the test context
        ssh_client = self._context.create_ssh_client()
        self._ssh_client = ssh_client

    def run(self):
        remote_script = "agent_cgroups-check_cgroups_agent.py"

        log.info("=====Validating agent cgroups=====")
        self._run_remote_test(remote_script)
        log.info("Successfully Verified that agent present in correct cgroups")


# Allows this test to be executed directly as a script
if __name__ == "__main__":
    AgentCgroups.run_from_command_line()
43 changes: 43 additions & 0 deletions tests_e2e/tests/ext_cgroups/ext_cgroups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from tests_e2e.tests.ext_cgroups.install_extensions import InstallExtensions
from tests_e2e.tests.lib.agent_test import AgentTest
from tests_e2e.tests.lib.agent_test_context import AgentTestContext
from tests_e2e.tests.lib.logging import log


class ExtCgroups(AgentTest):
    """
    End-to-end test checking that installed extensions are assigned to the correct cgroups.

    It first installs the extensions (InstallExtensions) and then launches the
    ext_cgroups-check_cgroups_extensions.py script, with sudo, through self._run_remote_test().
    """

    def __init__(self, context: AgentTestContext):
        super().__init__(context)
        # SSH client obtained from the test context
        ssh_client = self._context.create_ssh_client()
        self._ssh_client = ssh_client

    def run(self):
        remote_script = "ext_cgroups-check_cgroups_extensions.py"

        # Step 1: install the extensions whose cgroups will be validated
        log.info("=====Installing extensions to validate ext cgroups scenario")
        installer = InstallExtensions(self._context)
        installer.run()

        # Step 2: validate the cgroups of those extensions
        log.info("=====Executing remote script check_cgroups_extensions.py to validate extension cgroups")
        self._run_remote_test(remote_script, use_sudo=True)
        log.info("Successfully verified that extensions present in correct cgroup")


# Allows this test to be executed directly as a script
if __name__ == "__main__":
    ExtCgroups.run_from_command_line()
112 changes: 112 additions & 0 deletions tests_e2e/tests/ext_cgroups/install_extensions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/env python3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from datetime import datetime, timedelta
from pathlib import Path

from tests_e2e.tests.lib.agent_test_context import AgentTestContext
from tests_e2e.tests.lib.identifiers import VmExtensionIds
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.virtual_machine_extension_client import VirtualMachineExtensionClient


class InstallExtensions:
    """
    Installs several extensions on the test VM so that the cgroups of those extensions can be verified afterwards.
    """

    def __init__(self, context: AgentTestContext):
        self._context = context
        self._ssh_client = self._context.create_ssh_client()

    def run(self):
        """
        Updates the agent configuration and then installs each extension, in a fixed order.
        """
        self._prepare_agent()
        installers = (
            self._install_gatest_extension,  # GATest extension: used to test service cgroups
            self._install_ama,               # Azure Monitor Agent: used to test long running process cgroup
            self._install_vmaccess,          # VM Access extension: used as a sample extension
            self._install_cse,               # CSE extension: used to test extension cgroup
        )
        for install in installers:
            install()

    @staticmethod
    def _enable_and_verify(extension, **enable_args):
        """
        Logs the extension being installed, enables it with the given arguments and asserts on its instance view.
        """
        log.info("Installing %s", extension)
        extension.enable(**enable_args)
        extension.assert_instance_view()

    def _prepare_agent(self):
        """
        Sets Debug.CgroupMonitorExpiryTime in the agent configuration to a date two days from now (UTC).
        """
        log.info("=====Executing update-waagent-conf remote script to update monitoring deadline flag for tracking azuremonitoragent service")
        expiry_time = (datetime.utcnow() + timedelta(days=2)).strftime("%Y-%m-%d")
        # The agent needs the extension info and its services info in the handlermanifest.xml to monitor and limit the resource usage.
        # As part of pilot testing, the agent hardcoded the azuremonitoragent service name to monitor it for some time in production
        # without needing a manifest update from the extension side, so that the extension owners could get a sense of the resource
        # usage of their extension. That was done for a few months and the service is no longer monitored in production.
        # This test moves the expiry time of that config flag to a future date so that the test agent starts tracking the cgroups used by the service.
        command = f"update-waagent-conf Debug.CgroupMonitorExpiryTime={expiry_time}"
        log.info(self._ssh_client.run_command(command, use_sudo=True))
        log.info("Updated agent cgroups config(CgroupMonitorExpiryTime)")

    def _install_ama(self):
        extension = VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, resource_name="AMAAgent")
        self._enable_and_verify(extension)

    def _install_vmaccess(self):
        # The public key is read from the file next to the private key, with a ".pub" suffix
        public_key = Path(self._context.private_key_file).with_suffix(".pub").read_text()
        # Invoke the extension
        extension = VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.VmAccess, resource_name="VmAccess")
        self._enable_and_verify(
            extension,
            protected_settings={
                'username': self._context.username,
                'ssh_key': public_key,
                'reset_ssh': 'false'
            }
        )

    def _install_gatest_extension(self):
        extension = VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.GATestExtension, resource_name="GATestExt")
        self._enable_and_verify(extension)

    def _install_cse(self):
        # The custom script copies the cgroups assigned to it at runtime to /var/lib/waagent/tmp/custom_script_check.
        script_contents = """
mkdir /var/lib/waagent/tmp
cp /proc/$$/cgroup /var/lib/waagent/tmp/custom_script_check
"""
        extension = VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, resource_name="CustomScript")
        self._enable_and_verify(
            extension,
            protected_settings={
                'commandToExecute': f"echo \'{script_contents}\' | bash"
            }
        )

149 changes: 149 additions & 0 deletions tests_e2e/tests/lib/cgroup_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import os
import re

from assertpy import assert_that, fail

from azurelinuxagent.common.osutil import systemd
from azurelinuxagent.common.utils import shellutil
from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION
from tests_e2e.tests.lib.agent_log import AgentLog
from tests_e2e.tests.lib.logging import log

# Root of the cgroup file system; its existence is used to decide whether the distro supports cgroups
BASE_CGROUP = '/sys/fs/cgroup'
# NOTE(review): presumably the name under which the agent's own cgroup is tracked - confirm against the callers
AGENT_CGROUP_NAME = 'WALinuxAgent'
# Name of the agent's systemd unit, as returned by systemd.get_agent_unit_name(); resolved once, when this module is imported
AGENT_SERVICE_NAME = systemd.get_agent_unit_name()
# NOTE(review): presumably the cgroup controllers expected for the agent and for extensions - confirm against the callers
AGENT_CONTROLLERS = ['cpu', 'memory']
EXT_CONTROLLERS = ['cpu', 'memory']

# Matches text of the form "Started tracking cgroup <name> [<path>]"; the bracketed value is captured in the 'path' group
CGROUP_TRACKED_PATTERN = re.compile(r'Started tracking cgroup ([^\s]+)\s+\[(?P<path>[^\s]+)\]')

# NOTE(review): presumably the full names of the extensions installed by the ext_cgroups test and the services they run - confirm
GATESTEXT_FULL_NAME = "Microsoft.Azure.Extensions.Edp.GATestExtGo"
GATESTEXT_SERVICE = "gatestext.service"
AZUREMONITOREXT_FULL_NAME = "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent"
AZUREMONITORAGENT_SERVICE = "azuremonitoragent.service"
MDSD_SERVICE = "mdsd.service"


def verify_if_distro_supports_cgroup():
    """
    Checks that the agent is running in a distro that supports cgroups, i.e. that the base
    cgroup file system (BASE_CGROUP) exists.

    Raises an assertion error, whose message names the missing file system and the distro,
    when the file system is not found.
    """
    log.info("===== Checking if distro supports cgroups")

    base_cgroup_fs_exists = os.path.exists(BASE_CGROUP)

    # described_as() must be called before the assertion itself: the assertion raises as soon as it
    # is evaluated, so a description chained after it would never make it into the error message.
    assert_that(base_cgroup_fs_exists).described_as("Cgroup file system:{0} not found in Distro {1}-{2}".format(BASE_CGROUP, DISTRO_NAME, DISTRO_VERSION)).is_true()

    log.info('Distro %s-%s supports cgroups\n', DISTRO_NAME, DISTRO_VERSION)


def print_cgroups():
    """
    Logs the cgroup entries found in the output of the 'mount' command.
    """
    log.info("====== Currently mounted cgroups ======")
    # The output of 'mount' is similar to
    #     sysfs on /sys type sysfs (rw,nosuid,nodev,noexec,relatime,seclabel)
    #     proc on /proc type proc (rw,nosuid,nodev,noexec,relatime)
    #     devtmpfs on /dev type devtmpfs (rw,nosuid,seclabel,size=1842988k,nr_inodes=460747,mode=755)
    #     cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,seclabel,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd)
    #     cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,seclabel,pids)
    #     cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,seclabel,memory)
    #     cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,seclabel,blkio)
    #     cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,seclabel,hugetlb)
    # Only the lines containing 'type cgroup' are logged.
    mount_lines = shellutil.run_command(['mount']).splitlines()
    for cgroup_mount in (line for line in mount_lines if 'type cgroup' in line):
        log.info('\t%s', cgroup_mount)


def print_service_status():
    """
    Logs the output of 'systemctl status' for the agent's unit, one log entry per output line.
    """
    log.info("====== Agent Service status ======")
    unit_name = systemd.get_agent_unit_name()
    status_lines = shellutil.run_command(["systemctl", "status", unit_name]).splitlines()
    for status_line in status_lines:
        log.info("\t%s", status_line)


def get_agent_cgroup_mount_path():
    """
    Returns the path built from '/', 'azure.slice' and the agent's service name (AGENT_SERVICE_NAME).
    """
    path_components = ('/', 'azure.slice', AGENT_SERVICE_NAME)
    return os.path.join(*path_components)


def get_extension_cgroup_mount_path(extension_name):
    """
    Returns the path of the slice for the given extension, i.e.
    /azure.slice/azure-vmextensions.slice/azure-vmextensions-<extension_name>.slice
    """
    extension_slice = "".join(("azure-vmextensions-", extension_name, ".slice"))
    return os.path.join('/', 'azure.slice/azure-vmextensions.slice', extension_slice)


def get_unit_cgroup_mount_path(unit_name):
    """
    Returns the value of the ControlGroup property that 'systemctl show' reports for the given unit,
    or None if the output does not have the form <property>=<value>.
    """
    output = shellutil.run_command(["systemctl", "show", unit_name, "--property", "ControlGroup"])
    # The output is similar to
    #     systemctl show walinuxagent.service --property ControlGroup
    #     ControlGroup=/azure.slice/walinuxagent.service
    # The regex matches that output and extracts the value at the right of the '='
    parsed = re.match("[^=]+=(?P<value>.+)", output)
    return None if parsed is None else parsed.group('value')


def verify_agent_cgroup_assigned_correctly():
    """
    Checks, using the output of 'systemctl status', that the agent service is active and that the
    expected cgroup path shows up in the status output. Fails if either condition is not met.
    """
    log.info("===== Verifying the daemon and the agent are assigned to the same correct cgroup using systemd")
    service_status = shellutil.run_command(["systemctl", "status", systemd.get_agent_unit_name()])
    log.info("Agent service status output:\n%s", service_status)
    cgroup_mount_path = get_agent_cgroup_mount_path()
    is_active_pattern = re.compile(r".*Active:\s+active.*")

    status_lines = service_status.splitlines()
    # One flag per status line: does the line report the service as active?
    active_flags = [is_active_pattern.match(status_line) is not None for status_line in status_lines]
    is_active = any(active_flags)
    # The cgroup path is looked up only in the lines that did not match the "Active:" pattern
    is_cgroup_assigned = any(
        cgroup_mount_path in status_line
        for status_line, reports_active in zip(status_lines, active_flags)
        if not reports_active
    )

    if not is_active:
        fail('walinuxagent service was not active/running. Service status:{0}'.format(service_status))
    if not is_cgroup_assigned:
        fail('walinuxagent service was not assigned to the expected cgroup:{0}'.format(cgroup_mount_path))

    log.info("Successfully verified the agent cgroup assigned correctly by systemd\n")


def get_agent_cpu_quota():
    """
    Returns the value of the CPUQuotaPerSecUSec property that 'systemctl show' reports for the agent service,
    or None if the output does not have the form <property>=<value>.
    """
    output = shellutil.run_command(["systemctl", "show", AGENT_SERVICE_NAME, "--property", "CPUQuotaPerSecUSec"])
    # The output is similar to
    #     systemctl show walinuxagent --property CPUQuotaPerSecUSec
    #     CPUQuotaPerSecUSec=infinity
    parsed = re.match("[^=]+=(?P<value>.+)", output)
    if parsed is None:
        return None
    return parsed.group('value')


def check_agent_quota_disabled():
    """
    Returns True if the cpu quota reported for the agent service is 'infinity', False otherwise
    """
    return get_agent_cpu_quota() == 'infinity'


def check_cgroup_disabled_with_unknown_process():
    """
    Returns True if the agent log contains a record reporting that resource usage monitoring was
    disabled because the check on cgroups failed and the message mentions UNKNOWN; False otherwise.
    The first matching record is logged.
    """
    disabled_pattern = "Disabling resource usage monitoring. Reason: Check on cgroups failed:.+UNKNOWN"
    for record in AgentLog().read():
        # DOTALL lets the '.+' in the pattern span multiple lines of the message
        if re.search(disabled_pattern, record.message, flags=re.DOTALL) is None:
            continue
        log.info("Found message:\n\t%s", record.text.replace("\n", "\n\t"))
        return True
    return False
2 changes: 2 additions & 0 deletions tests_e2e/tests/lib/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,5 @@ class VmExtensionIds(object):
RunCommandHandler: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.CPlat.Core', ext_type='RunCommandHandlerLinux', version="1.0")
VmAccess: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.OSTCExtensions', ext_type='VMAccessForLinux', version="1.0")
GuestAgentDcrTestExtension: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.Azure.TestExtensions.Edp', ext_type='GuestAgentDcrTest', version='1.0')
AzureMonitorLinuxAgent: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.Azure.Monitor', ext_type='AzureMonitorLinuxAgent', version="1.5")
GATestExtension: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.Azure.Extensions.Edp', ext_type='GATestExtGo', version="1.2")
Loading

0 comments on commit a2977b8

Please sign in to comment.