Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Recover primary nic if down after publishing hostname in RedhatOSUtil #3024

Merged
merged 27 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
37ed644
Check nic state and recover if down:
maddieford Jan 17, 2024
5029354
Fix typo
maddieford Jan 17, 2024
a3b694f
Fix state comparison
maddieford Jan 18, 2024
37f6f73
Fix pylint errors
maddieford Jan 18, 2024
297d773
Fix string comparison
maddieford Jan 18, 2024
81205d0
Report publish hostname failure in calling thread
maddieford Jan 18, 2024
54c3306
Add todo to check nic state for all distros where we reset network
maddieford Jan 18, 2024
c3baa2d
Update detection to check connection state and separate recover from …
maddieford Jan 19, 2024
0fdc67d
Pylint unused argument
maddieford Jan 19, 2024
4d6085f
refactor recover_nic argument
maddieford Jan 19, 2024
1f8b3a9
Network interface e2e test
maddieford Jan 19, 2024
36c4c77
e2e test for recovering the network interface on redhat distros
maddieford Jan 22, 2024
173e2a3
Merge branch 'develop' into recover_primary_nic
maddieford Jan 22, 2024
f020ac3
Only run scenario on distros which use RedhatOSUtil
maddieford Jan 22, 2024
73161f5
Merge branch 'recover_primary_nic' of github.com:maddieford/WALinuxAg…
maddieford Jan 22, 2024
59e1f69
Fix call to parent publish_hostname to include recover_nic arg
maddieford Jan 22, 2024
17e9843
Update comments in default os util
maddieford Jan 22, 2024
9ace1d6
Remove comment
maddieford Jan 22, 2024
d42e1e4
Fix comment
maddieford Jan 23, 2024
4d61b31
Do not do detection/recover on RedhatOSMOdernUtil
maddieford Jan 24, 2024
a827d82
Resolve PR comments
maddieford Feb 2, 2024
0555353
Make script executable
maddieford Feb 2, 2024
b145881
Revert pypy change
maddieford Feb 2, 2024
1a2e888
Merge branch 'develop' into recover_primary_nic
maddieford Feb 3, 2024
76e57ed
Fix publish hostname paramters
maddieford Feb 3, 2024
899ee89
Merge branch 'develop' into recover_primary_nic
maddieford Feb 5, 2024
f1b06df
Merge branch 'develop' into recover_primary_nic
maddieford Feb 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class WALAEventOperation:
HealthCheck = "HealthCheck"
HealthObservation = "HealthObservation"
HeartBeat = "HeartBeat"
HostnamePublishing = "HostnamePublishing"
HostPlugin = "HostPlugin"
HostPluginHeartbeat = "HostPluginHeartbeat"
HostPluginHeartbeatExtended = "HostPluginHeartbeatExtended"
Expand Down
11 changes: 10 additions & 1 deletion azurelinuxagent/common/osutil/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -1190,11 +1190,20 @@ def restart_if(self, ifname, retries=3, wait=5):
else:
logger.warn("exceeded restart retries")

def publish_hostname(self, hostname):
def check_and_recover_nic_state(self, ifname):
# TODO: This should be implemented for all distros where we reset the network during publishing hostname. Currently it is only implemented in RedhatOSUtil.
pass

def publish_hostname(self, hostname, recover_nic=False):
maddieford marked this conversation as resolved.
Show resolved Hide resolved
"""
Publishes the provided hostname.
"""
self.set_dhcp_hostname(hostname)
self.set_hostname_record(hostname)
ifname = self.get_if_name()
self.restart_if(ifname)
if recover_nic:
self.check_and_recover_nic_state(ifname)

def set_scsi_disks_timeout(self, timeout):
for dev in os.listdir("/sys/block"):
Expand Down
2 changes: 1 addition & 1 deletion azurelinuxagent/common/osutil/gaia.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def set_hostname(self, hostname):
def set_dhcp_hostname(self, hostname):
logger.warn('set_dhcp_hostname is ignored on GAiA')

def publish_hostname(self, hostname):
def publish_hostname(self, hostname, recover_nic=False):
maddieford marked this conversation as resolved.
Show resolved Hide resolved
logger.warn('publish_hostname is ignored on GAiA')

def del_account(self, username):
Expand Down
4 changes: 2 additions & 2 deletions azurelinuxagent/common/osutil/iosxe.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,12 @@ def set_hostname(self, hostname):
logger.warn("[{0}] failed with error: {1}, attempting fallback".format(' '.join(hostnamectl_cmd), ustr(e)))
DefaultOSUtil.set_hostname(self, hostname)

def publish_hostname(self, hostname):
def publish_hostname(self, hostname, recover_nic=False):
"""
Restart NetworkManager first before publishing hostname
"""
shellutil.run("service NetworkManager restart")
super(IosxeOSUtil, self).publish_hostname(hostname)
super(IosxeOSUtil, self).publish_hostname(hostname, recover_nic)

def register_agent_service(self):
return shellutil.run("systemctl enable waagent", chk_err=False)
Expand Down
89 changes: 81 additions & 8 deletions azurelinuxagent/common/osutil/redhat.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,11 @@ def set_hostname(self, hostname):
logger.warn("[{0}] failed, attempting fallback".format(' '.join(hostnamectl_cmd)))
DefaultOSUtil.set_hostname(self, hostname)

def get_nm_controlled(self):
ifname = self.get_if_name()
def get_nm_controlled(self, ifname):
maddieford marked this conversation as resolved.
Show resolved Hide resolved
filepath = "/etc/sysconfig/network-scripts/ifcfg-{0}".format(ifname)
nm_controlled_cmd = ['grep', 'NM_CONTROLLED=', filepath]
try:
result = shellutil.run_command(nm_controlled_cmd, log_error=False, encode_output=False).rstrip()
result = shellutil.run_command(nm_controlled_cmd, log_error=False).rstrip()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did testing and encode_output=False is not necessary, so I removed it


if result and len(result.split('=')) > 1:
# Remove trailing white space and ' or " characters
Expand All @@ -140,17 +139,87 @@ def get_nm_controlled(self):

return True

def publish_hostname(self, hostname):
def get_nic_operational_and_general_states(self, ifname):
"""
Checks the contents of /sys/class/net/{ifname}/operstate and the results of 'nmcli -g general.state device show {ifname}' to determine the state of the provided interface.
Raises an exception if the network interface state cannot be determined.
"""
filepath = "/sys/class/net/{0}/operstate".format(ifname)
nic_general_state_cmd = ['nmcli', '-g', 'general.state', 'device', 'show', ifname]
maddieford marked this conversation as resolved.
Show resolved Hide resolved
if not os.path.isfile(filepath):
msg = "Unable to determine primary network interface {0} state, because state file does not exist: {1}".format(ifname, filepath)
logger.warn(msg)
raise Exception(msg)

try:
nic_oper_state = fileutil.read_file(filepath).rstrip().lower()
nic_general_state = shellutil.run_command(nic_general_state_cmd, log_error=True).rstrip().lower()
maddieford marked this conversation as resolved.
Show resolved Hide resolved
if nic_oper_state != "up":
logger.warn("The primary network interface {0} operational state is '{1}'.".format(ifname, nic_oper_state))
else:
logger.info("The primary network interface {0} operational state is '{1}'.".format(ifname, nic_oper_state))
if nic_general_state != "100 (connected)":
logger.warn("The primary network interface {0} general state is '{1}'.".format(ifname, nic_general_state))
else:
logger.info("The primary network interface {0} general state is '{1}'.".format(ifname, nic_general_state))
return nic_oper_state, nic_general_state
except Exception as e:
msg = "Unexpected error while determining the primary network interface state: {0}".format(e)
logger.warn(msg)
raise Exception(msg)

def check_and_recover_nic_state(self, ifname):
"""
Checks if the provided network interface is in an 'up' state. If the network interface is in a 'down' state,
attempt to recover the interface by restarting the Network Manager service.

Raises an exception if an attempt to bring the interface into an 'up' state fails, or if the state
of the network interface cannot be determined.
"""
nic_operstate, nic_general_state = self.get_nic_operational_and_general_states(ifname)
if nic_operstate == "down" or "disconnected" in nic_general_state:
maddieford marked this conversation as resolved.
Show resolved Hide resolved
logger.info("Restarting the Network Manager service to recover network interface {0}".format(ifname))
self.restart_network_manager()
# Interface does not come up immediately after NetworkManager restart. Wait 5 seconds before checking
# network interface state.
time.sleep(5)
nic_operstate, nic_general_state = self.get_nic_operational_and_general_states(ifname)
# It is possible for network interface to be in an unknown or unmanaged state. Log warning if state is not
# down, disconnected, up, or connected
if nic_operstate != "up" or nic_general_state != "100 (connected)":
msg = "Network Manager restart failed to bring network interface {0} into 'up' and 'connected' state".format(ifname)
logger.warn(msg)
raise Exception(msg)
else:
logger.info("Network Manager restart successfully brought the network interface {0} into 'up' and 'connected' state".format(ifname))
elif nic_operstate != "up" or nic_general_state != "100 (connected)":
maddieford marked this conversation as resolved.
Show resolved Hide resolved
# We already logged a warning with the network interface state in get_nic_operstate(). Raise an exception
# for the env thread to send to telemetry.
raise Exception("The primary network interface {0} operational state is '{1}' and general state is '{2}'.".format(ifname, nic_operstate, nic_general_state))

def restart_network_manager(self):
shellutil.run("service NetworkManager restart")

def publish_hostname(self, hostname, recover_nic=False):
"""
Restart NetworkManager first before publishing hostname, only if the network interface is not controlled by the
NetworkManager service (as determined by NM_CONTROLLED=n in the interface configuration). If the NetworkManager
service is restarted before the agent publishes the hostname, and NM_controlled=y, a race condition may happen
between the NetworkManager service and the Guest Agent making changes to the network interface configuration
simultaneously.

Note: check_and_recover_nic_state(ifname) raises an Exception if an attempt to recover the network interface
fails, or if the network interface state cannot be determined. Callers should handle this exception by sending
an event to telemetry.

TODO: Improve failure reporting and add success reporting to telemetry for hostname changes. Right now we are only reporting failures to telemetry by raising an Exception in publish_hostname for the calling thread to handle by reporting the failure to telemetry.
"""
if not self.get_nm_controlled():
shellutil.run("service NetworkManager restart")
super(RedhatOSUtil, self).publish_hostname(hostname)
ifname = self.get_if_name()
nm_controlled = self.get_nm_controlled(ifname)
if not nm_controlled:
self.restart_network_manager()
# TODO: Current recover logic is only effective when the NetworkManager manages the network interface. Update the recover logic so it is effective even when NM_CONTROLLED=n
super(RedhatOSUtil, self).publish_hostname(hostname, recover_nic and nm_controlled)

def register_agent_service(self):
return shellutil.run("systemctl enable {0}".format(self.service_name), chk_err=False)
Expand Down Expand Up @@ -193,7 +262,11 @@ def restart_if(self, ifname, retries=3, wait=5):
else:
logger.warn("exceeded restart retries")

def publish_hostname(self, hostname):
def check_and_recover_nic_state(self, ifname):
# TODO: Implement and test a way to recover the network interface for RedhatOSModernUtil
pass

def publish_hostname(self, hostname, recover_nic=False):
# RedhatOSUtil was updated to conditionally run NetworkManager restart in response to a race condition between
# NetworkManager restart and the agent restarting the network interface during publish_hostname. Keeping the
# NetworkManager restart in RedhatOSModernUtil because the issue was not reproduced on these versions.
Expand Down
2 changes: 1 addition & 1 deletion azurelinuxagent/common/osutil/suse.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def __init__(self):
super(SUSEOSUtil, self).__init__()
self.dhclient_name = 'wickedd-dhcp4'

def publish_hostname(self, hostname):
def publish_hostname(self, hostname, recover_nic=False):
self.set_dhcp_hostname(hostname)
self.set_hostname_record(hostname)
ifname = self.get_if_name()
Expand Down
6 changes: 5 additions & 1 deletion azurelinuxagent/ga/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,11 @@ def _operation(self):
self._hostname,
curr_hostname)
self._osutil.set_hostname(curr_hostname)
self._osutil.publish_hostname(curr_hostname)
try:
self._osutil.publish_hostname(curr_hostname, recover_nic=True)
except Exception as e:
msg = "Error while publishing the hostname: {0}".format(e)
add_event(AGENT_NAME, op=WALAEventOperation.HostnamePublishing, is_success=False, message=msg, log_event=False)
self._hostname = curr_hostname


Expand Down
8 changes: 8 additions & 0 deletions tests_e2e/test_suites/images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ images:
locations:
AzureChinaCloud: []
centos_610: "OpenLogic CentOS 6.10 latest"
centos_75: "OpenLogic CentOS 7.5 latest"
centos_79: "OpenLogic CentOS 7_9 latest"
centos_82:
urn: "OpenLogic CentOS 8_2 latest"
Expand Down Expand Up @@ -121,7 +122,14 @@ images:
AzureChinaCloud: []
AzureUSGovernment: []
oracle_610: "Oracle Oracle-Linux 6.10 latest"
oracle_75: "Oracle Oracle-Linux 7.5 latest"
oracle_79: "Oracle Oracle-Linux ol79-gen2 latest"
oracle_82: "Oracle Oracle-Linux ol82-gen2 latest"
rhel_610: "RedHat RHEL 6.10 latest"
rhel_75:
urn: "RedHat RHEL 7.5 latest"
locations:
AzureChinaCloud: []
rhel_79:
urn: "RedHat RHEL 7_9 latest"
locations:
Expand Down
17 changes: 17 additions & 0 deletions tests_e2e/test_suites/recover_network_interface.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#
# Brings the primary network interface down and checks that the agent can recover the network.
#
name: "RecoverNetworkInterface"
tests:
- "recover_network_interface/recover_network_interface.py"
images:
# TODO: This scenario should be run on all distros which bring the network interface down to publish hostname. Currently, only RedhatOSUtil attempts to recover the network interface if down after hostname publishing.
- "centos_79"
- "centos_75"
- "centos_82"
- "rhel_75"
- "rhel_79"
- "rhel_82"
- "oracle_75"
- "oracle_79"
- "oracle_82"
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# This test uses CSE to bring the network down and call check_and_recover_nic_state to bring the network back into an
# 'up' and 'connected' state. The intention of the test is to alert us if there is some change in newer distros which
maddieford marked this conversation as resolved.
Show resolved Hide resolved
# affects this logic.
#

import json
from typing import List, Dict, Any

from assertpy import fail, assert_that
from time import sleep

from tests_e2e.tests.lib.agent_test import AgentVmTest, TestSkipped
from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.virtual_machine_extension_client import VirtualMachineExtensionClient
from tests_e2e.tests.lib.vm_extension_identifier import VmExtensionIds


class RecoverNetworkInterface(AgentVmTest):
def __init__(self, context: AgentVmTestContext):
super().__init__(context)
self._context = context
self._ssh_client = context.create_ssh_client()
self._private_ip = context.vm.get_private_ip_address()
self._vm_password = ""

def add_vm_password(self):
# Add password to VM to help with debugging in case of failure
# REMOVE PWD FROM LOGS IF WE EVER MAKE THESE RUNS/LOGS PUBLIC
username = self._ssh_client.username
pwd = self._ssh_client.run_command("openssl rand -base64 32 | tr : .").rstrip()
self._vm_password = pwd
log.info("VM Username: {0}; VM Password: {1}".format(username, pwd))
self._ssh_client.run_command("echo '{0}:{1}' | sudo -S chpasswd".format(username, pwd))

def check_agent_reports_status(self):
status_updated = False
last_agent_status_time = self._context.vm.get_instance_view().vm_agent.statuses[0].time
log.info("Agent reported status at {0}".format(last_agent_status_time))
retries = 3

while retries > 0 and not status_updated:
agent_status_time = self._context.vm.get_instance_view().vm_agent.statuses[0].time
if agent_status_time != last_agent_status_time:
status_updated = True
log.info("Agent reported status at {0}".format(last_agent_status_time))
else:
retries -= 1
sleep(60)

if not status_updated:
fail("Agent hasn't reported status since {0} and ssh connection failed. Use the serial console in portal "
"to debug".format(last_agent_status_time))

def run(self):
# Add password to VM and log. This allows us to debug with serial console if necessary
log.info("")
log.info("Adding password to the VM to use for debugging in case necessary...")
self.add_vm_password()

# Skip the test if NM_CONTROLLED=n. The current recover logic does not work in this case
result = self._ssh_client.run_command("recover_network_interface-get_nm_controlled.py", use_sudo=True)
if "Interface is NOT NM controlled" in result:
raise TestSkipped("Current recover method will not work on interfaces where NM_Controlled=n")

# Get the primary network interface name
ifname = self._ssh_client.run_command("pypy3 -c 'from azurelinuxagent.common.osutil.redhat import RedhatOSUtil; print(RedhatOSUtil().get_if_name())'").rstrip()
# The interface name needs to be in double quotes for the pypy portion of the script
formatted_ifname = f'"{ifname}"'

# The script should bring the primary network interface down and use the agent to recover the interface. These
# commands will bring the network down, so they should be executed on the machine using CSE instead of ssh.
script = f"""
maddieford marked this conversation as resolved.
Show resolved Hide resolved
set -euxo pipefail
ifdown {ifname};
nic_state=$(nmcli -g general.state device show {ifname})
echo Primary network interface state before recovering: $nic_state
source /home/{self._context.username}/bin/set-agent-env;
pypy3 -c 'from azurelinuxagent.common.osutil.redhat import RedhatOSUtil; RedhatOSUtil().check_and_recover_nic_state({formatted_ifname})';
nic_state=$(nmcli -g general.state device show {ifname});
echo Primary network interface state after recovering: $nic_state
"""
log.info("")
log.info("Using CSE to bring the primary network interface down and call the OSUtil to bring the interface back up. Command to execute: {0}".format(script))
custom_script = VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, resource_name="CustomScript")
custom_script.enable(
protected_settings={
'commandToExecute': script
}
)

# Check that the interface was down and brought back up in instance view
log.info("")
log.info("Checking the instance view to confirm the primary network interface was brought down and successfully recovered by the agent...")
instance_view = custom_script.get_instance_view()
log.info("Instance view for custom script after enable is: {0}".format(json.dumps(instance_view.serialize(), indent=4)))
assert_that(len(instance_view.statuses)).described_as("Instance view should have a status for CustomScript").is_greater_than(0)
assert_that(instance_view.statuses[0].message).described_as("The primary network interface should be in a disconnected state before the attempt to recover").contains("Primary network interface state before recovering: 30 (disconnected)")
assert_that(instance_view.statuses[0].message).described_as("The primary network interface should be in a connected state after the attempt to recover").contains("Primary network interface state after recovering: 100 (connected)")

# Check that the agent is successfully reporting status after recovering the network
log.info("")
log.info("Checking that the agent is reporting status after recovering the network...")
self.check_agent_reports_status()

log.info("")
log.info("The primary network interface was successfully recovered by the agent.")

def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
ignore_rules = [
#
# We may see temporary network unreachable warnings since we are bringing the network interface down
# 2024-02-01T23:40:03.563499Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: ac21bdd7-1a7a-4bba-b307-b9d5bc30da33 eTag: 941323814975149980]: Request failed: [Errno 101] Network is unreachable
#
{
'message': r"Error fetching the goal state: \[ProtocolError\] GET vmSettings.*Request failed: \[Errno 101\] Network is unreachable"
}
]
return ignore_rules


if __name__ == "__main__":
RecoverNetworkInterface.run_from_command_line()
Loading
Loading