From 3c32d45177a44ca1fb83baffefc6b23780001575 Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Thu, 1 Feb 2024 09:47:31 -0800 Subject: [PATCH 1/4] Add config parameter to wait for cloud-init (Extensions.WaitForCloudInit) (#3031) * Add config parameter to wait for cloud-init (Extensions.WaitForCloudInit) --------- Co-authored-by: narrieta --- README.md | 24 +++++ azurelinuxagent/common/conf.py | 10 ++ azurelinuxagent/common/event.py | 1 + azurelinuxagent/common/utils/shellutil.py | 39 +++++++- azurelinuxagent/ga/update.py | 20 ++++ tests/common/test_conf.py | 2 + tests/common/utils/test_shell_util.py | 9 +- tests/ga/test_update.py | 61 ++++++++++++- tests/lib/mock_update_handler.py | 4 +- tests/test_agent.py | 2 + .../lib/agent_test_suite_combinator.py | 74 +++++++++++---- tests_e2e/orchestrator/runbook.yml | 2 +- .../test_suites/agent_wait_for_cloud_init.yml | 13 +++ .../disable_agent_provisioning.py | 43 ++------- .../add_cloud_init_script.py | 63 +++++++++++++ .../agent_wait_for_cloud_init.py | 91 +++++++++++++++++++ tests_e2e/tests/lib/update_arm_template.py | 83 +++++++++++++++++ 17 files changed, 476 insertions(+), 65 deletions(-) create mode 100644 tests_e2e/test_suites/agent_wait_for_cloud_init.yml create mode 100755 tests_e2e/tests/agent_wait_for_cloud_init/add_cloud_init_script.py create mode 100755 tests_e2e/tests/agent_wait_for_cloud_init/agent_wait_for_cloud_init.py diff --git a/README.md b/README.md index 3d3a824e1f..6d0296bfcc 100644 --- a/README.md +++ b/README.md @@ -261,6 +261,30 @@ without the agent. In order to do that, the `provisionVMAgent` flag must be set provisioning time, via whichever API is being used. We will provide more details on this on our wiki when it is generally available. +#### __Extensions.WaitForCloudInit__ + +_Type: Boolean_ +_Default: n_ + +Waits for cloud-init to complete (cloud-init status --wait) before executing VM extensions. 
+ +Both cloud-init and VM extensions are common ways to customize a VM during initial deployment. By +default, the agent will start executing extensions while cloud-init may still be in the 'config' +stage and won't wait for the 'final' stage to complete. Cloud-init and extensions may execute operations +that conflict with each other (for example, both of them may try to install packages). Setting this option +to 'y' ensures that VM extensions are executed only after cloud-init has completed all its stages. + +Note that using this option requires creating a custom image with the value of this option set to 'y', in +order to ensure that the wait is performed during the initial deployment of the VM. + +#### __Extensions.WaitForCloudInitTimeout__ + +_Type: Integer_ +_Default: 3600_ + +Timeout in seconds for the Agent to wait on cloud-init. If the timeout elapses, the Agent will continue +executing VM extensions. See Extensions.WaitForCloudInit for more details. + #### __Extensions.GoalStatePeriod__ _Type: Integer_ diff --git a/azurelinuxagent/common/conf.py b/azurelinuxagent/common/conf.py index 57d6c9d280..a13f333576 100644 --- a/azurelinuxagent/common/conf.py +++ b/azurelinuxagent/common/conf.py @@ -117,6 +117,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__): "Logs.Console": True, "Logs.Collect": True, "Extensions.Enabled": True, + "Extensions.WaitForCloudInit": False, "Provisioning.AllowResetSysUser": False, "Provisioning.RegenerateSshHostKeyPair": False, "Provisioning.DeleteRootPassword": False, @@ -170,6 +171,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__): __INTEGER_OPTIONS__ = { "Extensions.GoalStatePeriod": 6, "Extensions.InitialGoalStatePeriod": 6, + "Extensions.WaitForCloudInitTimeout": 3600, "OS.EnableFirewallPeriod": 300, "OS.RemovePersistentNetRulesPeriod": 30, "OS.RootDeviceScsiTimeoutPeriod": 30, @@ -372,6 +374,14 @@ def get_extensions_enabled(conf=__conf__): return conf.get_switch("Extensions.Enabled", True) +def 
get_wait_for_cloud_init(conf=__conf__): + return conf.get_switch("Extensions.WaitForCloudInit", False) + + +def get_wait_for_cloud_init_timeout(conf=__conf__): + return conf.get_int("Extensions.WaitForCloudInitTimeout", 3600) + + def get_goal_state_period(conf=__conf__): return conf.get_int("Extensions.GoalStatePeriod", 6) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index fe313968fe..b010583808 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -75,6 +75,7 @@ class WALAEventOperation: CGroupsCleanUp = "CGroupsCleanUp" CGroupsDisabled = "CGroupsDisabled" CGroupsInfo = "CGroupsInfo" + CloudInit = "CloudInit" CollectEventErrors = "CollectEventErrors" CollectEventUnicodeErrors = "CollectEventUnicodeErrors" ConfigurationChange = "ConfigurationChange" diff --git a/azurelinuxagent/common/utils/shellutil.py b/azurelinuxagent/common/utils/shellutil.py index 50fd4592f1..d2bfd787ed 100644 --- a/azurelinuxagent/common/utils/shellutil.py +++ b/azurelinuxagent/common/utils/shellutil.py @@ -18,9 +18,17 @@ # import os import subprocess +import sys import tempfile import threading +if sys.version_info[0] == 2: + # TimeoutExpired was introduced on Python 3; define a dummy class for Python 2 + class TimeoutExpired(Exception): + pass +else: + from subprocess import TimeoutExpired + import azurelinuxagent.common.logger as logger from azurelinuxagent.common.future import ustr @@ -206,7 +214,7 @@ def __run_command(command_action, command, log_error, encode_output): # W0622: Redefining built-in 'input' -- disabled: the parameter name mimics subprocess.communicate() -def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, log_error=False, encode_input=True, encode_output=True, track_process=True): # pylint:disable=W0622 +def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, log_error=False, encode_input=True, encode_output=True, 
track_process=True, timeout=None): # pylint:disable=W0622 """ Executes the given command and returns its stdout. @@ -227,7 +235,9 @@ def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr= value for these parameters is anything other than the default (subprocess.PIPE)), then the corresponding values returned by this function or the CommandError exception will be empty strings. - Note: This is the preferred method to execute shell commands over `azurelinuxagent.common.utils.shellutil.run` function. + NOTE: The 'timeout' parameter is ignored on Python 2 + + NOTE: This is the preferred method to execute shell commands over `azurelinuxagent.common.utils.shellutil.run` function. """ if input is not None and stdin is not None: raise ValueError("The input and stdin arguments are mutually exclusive") @@ -246,7 +256,30 @@ def command_action(): else: process = subprocess.Popen(command, stdin=popen_stdin, stdout=stdout, stderr=stderr, shell=False) - command_stdout, command_stderr = process.communicate(input=communicate_input) + try: + if sys.version_info[0] == 2: # communicate() doesn't support timeout on Python 2 + command_stdout, command_stderr = process.communicate(input=communicate_input) + else: + command_stdout, command_stderr = process.communicate(input=communicate_input, timeout=timeout) + except TimeoutExpired: + if log_error: + logger.error(u"Command [{0}] timed out", __format_command(command)) + + command_stdout, command_stderr = '', '' + + try: + process.kill() + # try to get any output from the command, but ignore any errors if we can't + try: + command_stdout, command_stderr = process.communicate() + # W0702: No exception type(s) specified (bare-except) + except: # pylint: disable=W0702 + pass + except Exception as exception: + if log_error: + logger.error(u"Can't terminate timed out process: {0}", ustr(exception)) + raise CommandError(command=__format_command(command), return_code=-1, stdout=command_stdout, stderr="command 
timeout\n{0}".format(command_stderr)) + if track_process: _on_command_completed(process.pid) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 88267b75e2..1a0e362407 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -149,6 +149,8 @@ def __init__(self): self._last_check_memory_usage_time = time.time() self._check_memory_usage_last_error_report = datetime.min + self._cloud_init_completed = False # Only used when Extensions.WaitForCloudInit is enabled; note that this variable is always reset on service start. + # VM Size is reported via the heartbeat, default it here. self._vm_size = None @@ -458,6 +460,22 @@ def _initialize_goal_state(self, protocol): logger.info("The current Fabric goal state is older than the most recent FastTrack goal state; will skip it.\nFabric: {0}\nFastTrack: {1}", egs.created_on_timestamp, last_fast_track_timestamp) + def _wait_for_cloud_init(self): + if conf.get_wait_for_cloud_init() and not self._cloud_init_completed: + message = "Waiting for cloud-init to complete..." + logger.info(message) + add_event(op=WALAEventOperation.CloudInit, message=message) + try: + output = shellutil.run_command(["cloud-init", "status", "--wait"], timeout=conf.get_wait_for_cloud_init_timeout()) + message = "cloud-init completed\n{0}".format(output) + logger.info(message) + add_event(op=WALAEventOperation.CloudInit, message=message) + except Exception as e: + message = "An error occurred while waiting for cloud-init; will proceed to execute VM extensions. Extensions that have conflicts with cloud-init may fail.\n{0}".format(ustr(e)) + logger.error(message) + add_event(op=WALAEventOperation.CloudInit, message=message, is_success=False, log_event=False) + self._cloud_init_completed = True # Mark as completed even on error since we will proceed to execute extensions + def _get_vm_size(self, protocol): """ Including VMSize is meant to capture the architecture of the VM (i.e. 
arm64 VMs will @@ -562,6 +580,8 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler, agent_ # check for agent updates agent_update_handler.run(self._goal_state, self._processing_new_extensions_goal_state()) + self._wait_for_cloud_init() + try: if self._processing_new_extensions_goal_state(): if not self._extensions_summary.converged: diff --git a/tests/common/test_conf.py b/tests/common/test_conf.py index 972b289a79..1ae951bf9f 100644 --- a/tests/common/test_conf.py +++ b/tests/common/test_conf.py @@ -27,6 +27,8 @@ class TestConf(AgentTestCase): # -- These values *MUST* match those from data/test_waagent.conf EXPECTED_CONFIGURATION = { "Extensions.Enabled": True, + "Extensions.WaitForCloudInit": False, + "Extensions.WaitForCloudInitTimeout": 3600, "Provisioning.Agent": "auto", "Provisioning.DeleteRootPassword": True, "Provisioning.RegenerateSshHostKeyPair": True, diff --git a/tests/common/utils/test_shell_util.py b/tests/common/utils/test_shell_util.py index 3c6afc60e6..5eb5a83a6d 100644 --- a/tests/common/utils/test_shell_util.py +++ b/tests/common/utils/test_shell_util.py @@ -18,13 +18,14 @@ import os import signal import subprocess +import sys import tempfile import threading import unittest from azurelinuxagent.common.future import ustr import azurelinuxagent.common.utils.shellutil as shellutil -from tests.lib.tools import AgentTestCase, patch +from tests.lib.tools import AgentTestCase, patch, skip_if_predicate_true from tests.lib.miscellaneous_tools import wait_for, format_processes @@ -225,6 +226,12 @@ def test_run_command_should_raise_an_exception_when_it_cannot_execute_the_comman self.__it_should_raise_an_exception_when_it_cannot_execute_the_command( lambda: shellutil.run_command("nonexistent_command")) + @skip_if_predicate_true(lambda: sys.version_info[0] == 2, "Timeouts are not supported on Python 2") + def test_run_command_should_raise_an_exception_when_the_command_times_out(self): + with self.assertRaises(shellutil.CommandError) 
as context: + shellutil.run_command(["sleep", "5"], timeout=1) + self.assertIn("command timeout", context.exception.stderr, "The command did not time out") + def test_run_pipe_should_raise_an_exception_when_it_cannot_execute_the_pipe(self): self.__it_should_raise_an_exception_when_it_cannot_execute_the_command( lambda: shellutil.run_pipe([["ls", "-ld", "."], ["nonexistent_command"], ["wc", "-l"]])) diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index c25585f143..aa39ccb55a 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -38,7 +38,7 @@ ExtHandlerPackage, ExtHandlerPackageList, Extension, VMStatus, ExtHandlerStatus, ExtensionStatus, \ VMAgentUpdateStatuses from azurelinuxagent.common.protocol.util import ProtocolUtil -from azurelinuxagent.common.utils import fileutil, textutil, timeutil +from azurelinuxagent.common.utils import fileutil, textutil, timeutil, shellutil from azurelinuxagent.common.utils.archive import ARCHIVE_DIRECTORY_NAME, AGENT_STATUS_FILE from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.utils.networkutil import FirewallCmdDirectCommands, AddFirewallRules @@ -980,7 +980,6 @@ def match_expected_info(): match_unexpected_errors() # Match on errors first, they can provide more info. 
match_expected_info() - def test_it_should_recreate_handler_env_on_service_startup(self): iterations = 5 @@ -1361,6 +1360,64 @@ def test_it_should_reset_legacy_blacklisted_agents_on_process_start(self): self.assertFalse(agent.is_blacklisted, "Legacy Agent should not be blacklisted") +class TestUpdateWaitForCloudInit(AgentTestCase): + @staticmethod + @contextlib.contextmanager + def create_mock_run_command(delay=None): + def run_command_mock(cmd, *args, **kwargs): + if cmd == ["cloud-init", "status", "--wait"]: + if delay is not None: + original_run_command(['sleep', str(delay)], *args, **kwargs) + return "cloud-init completed" + return original_run_command(cmd, *args, **kwargs) + original_run_command = shellutil.run_command + + with patch("azurelinuxagent.ga.update.shellutil.run_command", side_effect=run_command_mock) as run_command_patch: + yield run_command_patch + + def test_it_should_not_wait_for_cloud_init_by_default(self): + update_handler = UpdateHandler() + with self.create_mock_run_command() as run_command_patch: + update_handler._wait_for_cloud_init() + self.assertTrue(run_command_patch.call_count == 0, "'cloud-init status --wait' should not be called by default") + + def test_it_should_wait_for_cloud_init_when_requested(self): + update_handler = UpdateHandler() + with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init", return_value=True): + with self.create_mock_run_command() as run_command_patch: + update_handler._wait_for_cloud_init() + self.assertEqual(1, run_command_patch.call_count, "'cloud-init status --wait' should have be called once") + + @skip_if_predicate_true(lambda: sys.version_info[0] == 2, "Timeouts are not supported on Python 2") + def test_it_should_enforce_timeout_waiting_for_cloud_init(self): + update_handler = UpdateHandler() + with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init", return_value=True): + with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init_timeout", return_value=1): + with 
self.create_mock_run_command(delay=5): + with patch("azurelinuxagent.ga.update.logger.error") as mock_logger: + update_handler._wait_for_cloud_init() + call_args = [args for args, _ in mock_logger.call_args_list if "An error occurred while waiting for cloud-init" in args[0]] + self.assertTrue( + len(call_args) == 1 and len(call_args[0]) == 1 and "command timeout" in call_args[0][0], + "Expected a timeout waiting for cloud-init. Log calls: {0}".format(mock_logger.call_args_list)) + + def test_update_handler_should_wait_for_cloud_init_after_agent_update_and_before_extension_processing(self): + method_calls = [] + + agent_update_handler = Mock() + agent_update_handler.run = lambda *_, **__: method_calls.append("AgentUpdateHandler.run()") + + exthandlers_handler = Mock() + exthandlers_handler.run = lambda *_, **__: method_calls.append("ExtHandlersHandler.run()") + + with mock_wire_protocol(DATA_FILE) as protocol: + with mock_update_handler(protocol, iterations=1, agent_update_handler=agent_update_handler, exthandlers_handler=exthandlers_handler) as update_handler: + with patch('azurelinuxagent.ga.update.UpdateHandler._wait_for_cloud_init', side_effect=lambda *_, **__: method_calls.append("UpdateHandler._wait_for_cloud_init()")): + update_handler.run() + + self.assertListEqual(["AgentUpdateHandler.run()", "UpdateHandler._wait_for_cloud_init()", "ExtHandlersHandler.run()"], method_calls, "Wait for cloud-init should happen after agent update and before extension processing") + + class UpdateHandlerRunTestCase(AgentTestCase): def _test_run(self, autoupdate_enabled=False, check_daemon_running=False, expected_exit_code=0, emit_restart_event=None): fileutil.write_file(conf.get_agent_pid_file_path(), ustr(42)) diff --git a/tests/lib/mock_update_handler.py b/tests/lib/mock_update_handler.py index f0b311abe2..03d7a44521 100644 --- a/tests/lib/mock_update_handler.py +++ b/tests/lib/mock_update_handler.py @@ -86,9 +86,9 @@ def patch_object(target, attribute): try: with 
patch("azurelinuxagent.ga.exthandlers.get_exthandlers_handler", return_value=exthandlers_handler): - with patch("azurelinuxagent.ga.agent_update_handler.get_agent_update_handler", return_value=agent_update_handler): + with patch("azurelinuxagent.ga.update.get_agent_update_handler", return_value=agent_update_handler): with patch("azurelinuxagent.ga.remoteaccess.get_remote_access_handler", return_value=remote_access_handler): - with patch("azurelinuxagent.common.conf.get_autoupdate_enabled", return_value=autoupdate_enabled): + with patch("azurelinuxagent.ga.update.conf.get_autoupdate_enabled", return_value=autoupdate_enabled): with patch.object(UpdateHandler, "is_running", PropertyMock(side_effect=is_running)): with patch('azurelinuxagent.ga.update.time.sleep', side_effect=lambda _: mock_sleep(0.001)) as sleep: with patch('sys.exit', side_effect=lambda _: 0) as mock_exit: diff --git a/tests/test_agent.py b/tests/test_agent.py index 414faa7266..0da6a2a853 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -53,6 +53,8 @@ Extensions.Enabled = True Extensions.GoalStatePeriod = 6 Extensions.InitialGoalStatePeriod = 6 +Extensions.WaitForCloudInit = False +Extensions.WaitForCloudInitTimeout = 3600 HttpProxy.Host = None HttpProxy.Port = None Lib.Dir = /var/lib/waagent diff --git a/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py b/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py index fbe53a1bdc..4b650e8641 100644 --- a/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py +++ b/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py @@ -159,15 +159,25 @@ def create_environment_list(self) -> List[Dict[str, Any]]: for image in images_info: if image in skip_images_info: continue - # 'image.urn' can actually be the URL to a VHD if the runbook provided it in the 'image' parameter + # 'image.urn' can actually be the URL to a VHD or an image from a gallery if the runbook provided it in the 'image' parameter if self._is_vhd(image.urn): 
marketplace_image = "" vhd = image.urn image_name = urllib.parse.urlparse(vhd).path.split('/')[-1] # take the last fragment of the URL's path (e.g. "RHEL_8_Standard-8.3.202006170423.vhd") + shared_gallery = "" + elif self._is_image_from_gallery(image.urn): + marketplace_image = "" + vhd = "" + image_name = self._get_name_of_image_from_gallery(image.urn) + shared_gallery = image.urn else: marketplace_image = image.urn vhd = "" image_name = self._get_image_name(image.urn) + shared_gallery = "" + + if test_suite_info.executes_on_scale_set and (vhd != "" or shared_gallery != ""): + raise Exception("VHDS and images from galleries are currently not supported on scale sets.") location: str = self._get_location(test_suite_info, image) if location is None: @@ -194,6 +204,7 @@ def create_environment_list(self) -> List[Dict[str, Any]]: env_name=f"{image_name}-{test_suite_info.name}", marketplace_image=marketplace_image, vhd=vhd, + shared_gallery=shared_gallery, location=location, vm_size=vm_size, test_suite_info=test_suite_info) @@ -206,9 +217,6 @@ def create_environment_list(self) -> List[Dict[str, Any]]: env["c_test_suites"].append(test_suite_info) else: if test_suite_info.executes_on_scale_set: - # TODO: Add support for VHDs - if vhd != "": - raise Exception("VHDS are currently not supported on scale sets.") env = self.create_vmss_environment( env_name=env_name, marketplace_image=marketplace_image, @@ -220,18 +228,18 @@ def create_environment_list(self) -> List[Dict[str, Any]]: env_name=env_name, marketplace_image=marketplace_image, vhd=vhd, + shared_gallery=shared_gallery, location=location, vm_size=vm_size, test_suite_info=test_suite_info) shared_environments[env_name] = env - if test_suite_info.template != '': - vm_tags = env.get("vm_tags") - if vm_tags is not None: - if "templates" not in vm_tags: - vm_tags["templates"] = test_suite_info.template - else: - vm_tags["templates"] += "," + test_suite_info.template + if test_suite_info.template != '': + vm_tags = 
env["vm_tags"] + if "templates" not in vm_tags: + vm_tags["templates"] = test_suite_info.template + else: + vm_tags["templates"] += "," + test_suite_info.template environments.extend(shared_environments.values()) @@ -330,7 +338,7 @@ def create_existing_vmss_environment(self) -> Dict[str, Any]: "c_test_suites": loader.test_suites, } - def create_vm_environment(self, env_name: str, marketplace_image: str, vhd: str, location: str, vm_size: str, test_suite_info: TestSuiteInfo) -> Dict[str, Any]: + def create_vm_environment(self, env_name: str, marketplace_image: str, vhd: str, shared_gallery: str, location: str, vm_size: str, test_suite_info: TestSuiteInfo) -> Dict[str, Any]: # # Custom ARM templates (to create the test VMs) require special handling. These templates are processed by the azure_update_arm_template # hook, which does not have access to the runbook variables. Instead, we use a dummy VM tag named "templates" and pass the @@ -339,11 +347,9 @@ def create_vm_environment(self, env_name: str, marketplace_image: str, vhd: str, # share the same test environment. Similarly, we use a dummy VM tag named "allow_ssh" to pass the value of the "allow_ssh" runbook parameter. 
# vm_tags = {} - if test_suite_info.template != '': - vm_tags["templates"] = test_suite_info.template if self.runbook.allow_ssh != '': vm_tags["allow_ssh"] = self.runbook.allow_ssh - return { + environment = { "c_platform": [ { "type": "azure", @@ -366,6 +372,7 @@ def create_vm_environment(self, env_name: str, marketplace_image: str, vhd: str, "azure": { "marketplace": marketplace_image, "vhd": vhd, + "shared_gallery": shared_gallery, "location": location, "vm_size": vm_size } @@ -383,6 +390,18 @@ def create_vm_environment(self, env_name: str, marketplace_image: str, vhd: str, "vm_tags": vm_tags } + if shared_gallery != '': + # Currently all the images in our shared gallery require secure boot + environment['c_platform'][0]['requirement']["features"] = { + "items": [ + { + "type": "Security_Profile", + "security_profile": "secureboot" + } + ] + } + return environment + def create_vmss_environment(self, env_name: str, marketplace_image: str, location: str, vm_size: str, test_suite_info: TestSuiteInfo) -> Dict[str, Any]: return { "c_platform": [ @@ -406,7 +425,8 @@ def create_vmss_environment(self, env_name: str, marketplace_image: str, locatio "c_location": location, "c_image": marketplace_image, "c_is_vhd": False, - "c_vm_size": vm_size + "c_vm_size": vm_size, + "vm_tags": {} } def _get_runbook_images(self, loader: AgentTestLoader) -> List[VmImageInfo]: @@ -420,12 +440,12 @@ def _get_runbook_images(self, loader: AgentTestLoader) -> List[VmImageInfo]: if images is not None: return images - # If it is not image or image set, it must be a URN or VHD - if not self._is_urn(self.runbook.image) and not self._is_vhd(self.runbook.image): - raise Exception(f"The 'image' parameter must be an image, an image set name, a urn, or a vhd: {self.runbook.image}") + # If it is not image or image set, it must be a URN, VHD, or an image from a gallery + if not self._is_urn(self.runbook.image) and not self._is_vhd(self.runbook.image) and not 
self._is_image_from_gallery(self.runbook.image): + raise Exception(f"The 'image' parameter must be an image, image set name, urn, vhd, or an image from a shared gallery: {self.runbook.image}") i = VmImageInfo() - i.urn = self.runbook.image # Note that this could be a URN or the URI for a VHD + i.urn = self.runbook.image # Note that this could be a URN or the URI for a VHD, or an image from a shared gallery i.locations = [] i.vm_sizes = [] @@ -536,6 +556,20 @@ def _is_vhd(vhd: str) -> bool: parsed = urllib.parse.urlparse(vhd) return parsed.scheme == 'https' and parsed.netloc != "" and parsed.path != "" + # Images from a gallery are given as "<gallery>/<image>/<version>". + _IMAGE_FROM_GALLERY = re.compile(r"(?P<gallery>[^/]+)/(?P<image>[^/]+)/(?P<version>[^/]+)") + + @staticmethod + def _is_image_from_gallery(image: str) -> bool: + return AgentTestSuitesCombinator._IMAGE_FROM_GALLERY.match(image) is not None + + @staticmethod + def _get_name_of_image_from_gallery(image: str) -> str: + match = AgentTestSuitesCombinator._IMAGE_FROM_GALLERY.match(image) + if match is None: + raise Exception(f"Invalid image from gallery: {image}") + return match.group('image') + @staticmethod def _report_test_result( suite_name: str, diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index 9181e9189c..ed0b816b12 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -29,7 +29,7 @@ variable: # Test suites to execute # - name: test_suites - value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall, publish_hostname, agent_update" + value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, 
agent_persist_firewall, publish_hostname, agent_update, agent_wait_for_cloud_init" # # Parameters used to create test VMs diff --git a/tests_e2e/test_suites/agent_wait_for_cloud_init.yml b/tests_e2e/test_suites/agent_wait_for_cloud_init.yml new file mode 100644 index 0000000000..727803811e --- /dev/null +++ b/tests_e2e/test_suites/agent_wait_for_cloud_init.yml @@ -0,0 +1,13 @@ +# +# This test verifies that the Agent waits for cloud-init to complete before it starts processing extensions. +# +# NOTE: This test is not fully automated. It requires a custom image where the test Agent has been installed and Extensions.WaitForCloudInit is enabled in waagent.conf. +# To execute it manually, create a custom image and use the 'image' runbook parameter, for example: "-v: image:gallery/wait-cloud-init/1.0.1". +# +name: "AgentWaitForCloudInit" +tests: + - "agent_wait_for_cloud_init/agent_wait_for_cloud_init.py" +template: "agent_wait_for_cloud_init/add_cloud_init_script.py" +install_test_agent: false +# Dummy image, since the parameter is required. The actual image needs to be passed as a parameter to the runbook. +images: "ubuntu_2204" diff --git a/tests_e2e/tests/agent_not_provisioned/disable_agent_provisioning.py b/tests_e2e/tests/agent_not_provisioned/disable_agent_provisioning.py index 6f0a562cd2..af3bc738a5 100755 --- a/tests_e2e/tests/agent_not_provisioned/disable_agent_provisioning.py +++ b/tests_e2e/tests/agent_not_provisioned/disable_agent_provisioning.py @@ -32,18 +32,11 @@ def update(self, template: Dict[str, Any], is_lisa_template: bool) -> None: # # NOTE: LISA's template uses this function to generate the value for osProfile.linuxConfiguration. The function is - # under the 'lisa' namespace. + # under the 'lisa' namespace. We set 'provisionVMAgent' to False. # # "getLinuxConfiguration": { # "parameters": [ - # { - # "name": "keyPath", - # "type": "string" - # }, - # { - # "name": "publicKeyData", - # "type": "string" - # } + # ... 
# ], # "output": { # "type": "object", @@ -62,31 +55,9 @@ def update(self, template: Dict[str, Any], is_lisa_template: bool) -> None: # } # } # - # The code below sets template['functions'][i]['members']['getLinuxConfiguration']['output']['value']['provisionVMAgent'] to True, - # where template['functions'][i] is the 'lisa' namespace. - # - functions = template.get("functions") - if functions is None: - raise Exception('Cannot find "functions" in the LISA template.') - for namespace in functions: - name = namespace.get("namespace") - if name is None: - raise Exception(f'Cannot find "namespace" in the LISA template: {namespace}') - if name == "lisa": - members = namespace.get('members') - if members is None: - raise Exception(f'Cannot find the members of the lisa namespace in the LISA template: {namespace}') - get_linux_configuration = members.get('getLinuxConfiguration') - if get_linux_configuration is None: - raise Exception(f'Cannot find the "getLinuxConfiguration" function the lisa namespace in the LISA template: {namespace}') - output = get_linux_configuration.get('output') - if output is None: - raise Exception(f'Cannot find the "output" of the getLinuxConfiguration function in the LISA template: {get_linux_configuration}') - value = output.get('value') - if value is None: - raise Exception(f"Cannot find the output's value of the getLinuxConfiguration function in the LISA template: {get_linux_configuration}") - value['provisionVMAgent'] = False - break - else: - raise Exception(f'Cannot find the "lisa" namespace in the LISA template: {functions}') + get_linux_configuration = self.get_lisa_function(template, 'getLinuxConfiguration') + output = self.get_function_output(get_linux_configuration) + if output.get('customData') is not None: + raise Exception(f"The getLinuxConfiguration function already has a 'customData'. Won't override it. 
Definition: {get_linux_configuration}") + output['provisionVMAgent'] = False diff --git a/tests_e2e/tests/agent_wait_for_cloud_init/add_cloud_init_script.py b/tests_e2e/tests/agent_wait_for_cloud_init/add_cloud_init_script.py new file mode 100755 index 0000000000..1fbc60adc4 --- /dev/null +++ b/tests_e2e/tests/agent_wait_for_cloud_init/add_cloud_init_script.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import base64 + +from typing import Any, Dict + +from tests_e2e.tests.agent_wait_for_cloud_init.agent_wait_for_cloud_init import AgentWaitForCloudInit +from tests_e2e.tests.lib.update_arm_template import UpdateArmTemplate + + +class AddCloudInitScript(UpdateArmTemplate): + """ + Adds AgentWaitForCloudInit.CloudInitScript to the ARM template as osProfile.customData. + """ + def update(self, template: Dict[str, Any], is_lisa_template: bool) -> None: + if not is_lisa_template: + raise Exception('This test can only customize LISA ARM templates.') + + # + # cloud-init configuration needs to be added in the osProfile.customData property as a base64-encoded string. + # + # LISA uses the getOSProfile function to generate the value for osProfile; add customData to its output, checking that we do not + # override any existing value (the current LISA template does not have any). + # + # "getOSProfile": { + # "parameters": [ + # ... 
+ # ], + # "output": { + # "type": "object", + # "value": { + # "computername": "[parameters('computername')]", + # "adminUsername": "[parameters('admin_username')]", + # "adminPassword": "[if(parameters('has_password'), parameters('admin_password'), json('null'))]", + # "linuxConfiguration": "[if(parameters('has_linux_configuration'), parameters('linux_configuration'), json('null'))]" + # } + # } + # } + # + encoded_script = base64.b64encode(AgentWaitForCloudInit.CloudInitScript.encode('utf-8')).decode('utf-8') + + get_os_profile = self.get_lisa_function(template, 'getOSProfile') + output = self.get_function_output(get_os_profile) + if output.get('customData') is not None: + raise Exception(f"The getOSProfile function already has a 'customData'. Won't override it. Definition: {get_os_profile}") + output['customData'] = encoded_script + diff --git a/tests_e2e/tests/agent_wait_for_cloud_init/agent_wait_for_cloud_init.py b/tests_e2e/tests/agent_wait_for_cloud_init/agent_wait_for_cloud_init.py new file mode 100755 index 0000000000..d9b4ecaef1 --- /dev/null +++ b/tests_e2e/tests/agent_wait_for_cloud_init/agent_wait_for_cloud_init.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import time + +from assertpy import fail + +from tests_e2e.tests.lib.agent_test import AgentVmTest +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.shell import CommandError +from tests_e2e.tests.lib.ssh_client import SshClient + + +class AgentWaitForCloudInit(AgentVmTest): + """ + This test verifies that the Agent waits for cloud-init to complete before it starts processing extensions. + + To do this, it adds 'CloudInitScript' to cloud-init's custom data. The script ensures first that the Agent + is waiting for cloud-init, and then sleeps for a couple of minutes before completing. The script appends + a set of known messages to waagent.log, and the test simply verifies that the messages are present in the + log in the expected order, and that they occur before the Agent reports that it is processing extensions. + """ + CloudInitScript = """#!/usr/bin/env bash + set -euox pipefail + + echo ">>> $(date) cloud-init script begin" >> /var/log/waagent.log + while !
grep 'Waiting for cloud-init to complete' /var/log/waagent.log; do + sleep 15 + done + echo ">>> $(date) The Agent is waiting for cloud-init, will pause for a couple of minutes" >> /var/log/waagent.log + sleep 120 + echo ">>> $(date) cloud-init script end" >> /var/log/waagent.log + """ + + def run(self): + ssh_client: SshClient = self._context.create_ssh_client() + + log.info("Waiting for Agent to start processing extensions") + for _ in range(15): + try: + ssh_client.run_command("grep 'ProcessExtensionsGoalState started' /var/log/waagent.log") + break + except CommandError: + log.info("The Agent has not started to process extensions, will check again after a short delay") + time.sleep(60) + else: + raise Exception("Timeout while waiting for the Agent to start processing extensions") + + log.info("The Agent has started to process extensions") + + output = ssh_client.run_command( + "grep -E '^>>>|" + + "INFO ExtHandler ExtHandler cloud-init completed|" + + "INFO ExtHandler ExtHandler ProcessExtensionsGoalState started' /var/log/waagent.log") + + output = output.rstrip().splitlines() + + expected = [ + 'cloud-init script begin', + 'The Agent is waiting for cloud-init, will pause for a couple of minutes', + 'cloud-init script end', + 'cloud-init completed', + 'ProcessExtensionsGoalState started' + ] + + indent = lambda lines: "\n".join([f" {ln}" for ln in lines]) + if len(output) == len(expected) and all([expected[i] in output[i] for i in range(len(expected))]): + log.info("The Agent waited for cloud-init before processing extensions.\nLog messages:\n%s", indent(output)) + else: + fail(f"The Agent did not wait for cloud-init before processing extensions.\nExpected:\n{indent(expected)}\nActual:\n{indent(output)}") + + +if __name__ == "__main__": + AgentWaitForCloudInit.run_from_command_line() + diff --git a/tests_e2e/tests/lib/update_arm_template.py b/tests_e2e/tests/lib/update_arm_template.py index af69fba048..010178ab9c 100644 --- 
a/tests_e2e/tests/lib/update_arm_template.py +++ b/tests_e2e/tests/lib/update_arm_template.py @@ -55,4 +55,87 @@ def get_resource_by_name(resources: List[Dict[str, Any]], resource_name: str, ty return item raise KeyError(f"Cannot find a resource {resource_name} of type {type_name} in the ARM template") + @staticmethod + def get_lisa_function(template: Dict[str, Any], function_name: str) -> Dict[str, Any]: + """ + Looks for the given function name in the LISA namespace and returns its definition. Raises KeyError if the function is not found. + """ + # + # NOTE: LISA's functions are in the "lisa" namespace, for example: + # + # "functions": [ + # { + # "namespace": "lisa", + # "members": { + # "getOSProfile": { + # "parameters": [ + # { + # "name": "computername", + # "type": "string" + # }, + # etc. + # ], + # "output": { + # "type": "object", + # "value": { + # "computername": "[parameters('computername')]", + # "adminUsername": "[parameters('admin_username')]", + # "adminPassword": "[if(parameters('has_password'), parameters('admin_password'), json('null'))]", + # "linuxConfiguration": "[if(parameters('has_linux_configuration'), parameters('linux_configuration'), json('null'))]" + # } + # } + # }, + # } + # } + # ] + functions = template.get("functions") + if functions is None: + raise Exception('Cannot find "functions" in the LISA template.') + for namespace in functions: + name = namespace.get("namespace") + if name is None: + raise Exception(f'Cannot find "namespace" in the LISA template: {namespace}') + if name == "lisa": + lisa_functions = namespace.get('members') + if lisa_functions is None: + raise Exception(f'Cannot find the members of the lisa namespace in the LISA template: {namespace}') + function_definition = lisa_functions.get(function_name) + if function_definition is None: + raise KeyError(f'Cannot find function {function_name} in the lisa namespace in the LISA template: {namespace}') + return function_definition + raise Exception(f'Cannot find the 
"lisa" namespace in the LISA template: {functions}') + + @staticmethod + def get_function_output(function: Dict[str, Any]) -> Dict[str, Any]: + """ + Returns the "value" property of the output for the given function. + + Sample function: + + { + "parameters": [ + { + "name": "computername", + "type": "string" + }, + etc. + ], + "output": { + "type": "object", + "value": { + "computername": "[parameters('computername')]", + "adminUsername": "[parameters('admin_username')]", + "adminPassword": "[if(parameters('has_password'), parameters('admin_password'), json('null'))]", + "linuxConfiguration": "[if(parameters('has_linux_configuration'), parameters('linux_configuration'), json('null'))]" + } + } + } + """ + output = function.get('output') + if output is None: + raise Exception(f'Cannot find the "output" of the given function: {function}') + value = output.get('value') + if value is None: + raise Exception(f"Cannot find the output's value of the given function: {function}") + return value From cc689f5b8a7c51385e5fa3bd4859500147b5d0cf Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Fri, 2 Feb 2024 11:34:24 -0800 Subject: [PATCH 2/4] Revert changes to publish_hostname in RedhatOSModernUtil (#3032) * Revert changes to publish_hostname in RedhatOSModernUtil * Fix pylint bad-super-call --- azurelinuxagent/common/osutil/redhat.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/azurelinuxagent/common/osutil/redhat.py b/azurelinuxagent/common/osutil/redhat.py index 05a4b659df..2d8ff3d1e5 100644 --- a/azurelinuxagent/common/osutil/redhat.py +++ b/azurelinuxagent/common/osutil/redhat.py @@ -192,3 +192,10 @@ def restart_if(self, ifname, retries=3, wait=5): time.sleep(wait) else: logger.warn("exceeded restart retries") + + def publish_hostname(self, hostname): + # RedhatOSUtil was updated to conditionally run NetworkManager restart in response to a race condition between + # NetworkManager restart and the agent restarting 
the network interface during publish_hostname. Keeping the + # NetworkManager restart in RedhatOSModernUtil because the issue was not reproduced on these versions. + shellutil.run("service NetworkManager restart") + DefaultOSUtil.publish_hostname(self, hostname) From 20f06702462cf56cb6a96e6ec866deccb92f1cd3 Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Fri, 2 Feb 2024 14:36:30 -0800 Subject: [PATCH 3/4] Remove agent_wait_for_cloud_init from automated runs (#3034) Co-authored-by: narrieta --- tests_e2e/orchestrator/runbook.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index ed0b816b12..9181e9189c 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -29,7 +29,7 @@ variable: # Test suites to execute # - name: test_suites - value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall, publish_hostname, agent_update, agent_wait_for_cloud_init" + value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall, publish_hostname, agent_update" # # Parameters used to create test VMs From 4b484b8b9d3fd62eb3dc8ed9e591c66bd8b232e6 Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam <84482346+nagworld9@users.noreply.github.com> Date: Mon, 5 Feb 2024 10:09:38 -0800 Subject: [PATCH 4/4] Adding AutoUpdate.UpdateToLatestVersion new flag support (#3020) * support new flag * address comments * added more info * updated * address comments * resolving comment * updated --- README.md | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff 
--git a/README.md b/README.md index 6d0296bfcc..5a5b126f2f 100644 --- a/README.md +++ b/README.md @@ -297,19 +297,38 @@ _Note_: setting up this parameter to more than a few minutes can make the state the VM be reported as unresponsive/unavailable on the Azure portal. Also, this setting affects how fast the agent starts executing extensions. -#### __AutoUpdate.Enabled__ +#### __AutoUpdate.UpdateToLatestVersion__ -_Type: Boolean_ +_Type: Boolean_ _Default: y_ -Enables auto-update of the Extension Handler. The Extension Handler is responsible +Enables auto-update of the Extension Handler. The Extension Handler is responsible for managing extensions and reporting VM status. The core functionality of the agent -is contained in the Extension Handler, and we encourage users to enable this option +is contained in the Extension Handler, and we encourage users to enable this option in order to maintain an up to date version. + +When this option is enabled, the Agent will install new versions when they become +available. When disabled, the Agent will not install any new versions, but it will use +the most recent version already installed on the VM. -On most distros the default value is 'y'. +_Notes_: +1. This option was added in version 2.10.0.8 of the Agent. For previous versions, see AutoUpdate.Enabled. +2. If both options are specified in waagent.conf, AutoUpdate.UpdateToLatestVersion overrides the value set for AutoUpdate.Enabled. +3. Changing this config option requires a service restart to pick up the updated setting. + +For more information on the agent version, see our [FAQ](https://github.com/Azure/WALinuxAgent/wiki/FAQ#what-does-goal-state-agent-mean-in-waagent---version-output).
+For more information on the agent update, see our [FAQ](https://github.com/Azure/WALinuxAgent/wiki/FAQ#how-auto-update-works-for-extension-handler).
+For more information on AutoUpdate.UpdateToLatestVersion vs. AutoUpdate.Enabled, see our [FAQ](https://github.com/Azure/WALinuxAgent/wiki/FAQ#autoupdateenabled-vs-autoupdateupdatetolatestversion).
+ +#### __AutoUpdate.Enabled__ -For more information on the agent version, see our [FAQ](https://github.com/Azure/WALinuxAgent/wiki/FAQ#what-does-goal-state-agent-mean-in-waagent---version-output). +_Type: Boolean_ +_Default: y_ + +Enables auto-update of the Extension Handler. This flag is supported for legacy reasons, and we strongly recommend using AutoUpdate.UpdateToLatestVersion instead. +The difference between these two flags is that, when set to 'n', AutoUpdate.Enabled will use the version of the Extension Handler that is pre-installed on the image, while AutoUpdate.UpdateToLatestVersion will use the most recent version that has already been installed on the VM (via auto-update). + +On most distros the default value is 'y'. #### __Provisioning.Agent__