[3006.x] Minions check dns when re-connecting to a master #66422

Merged
merged 4 commits on Apr 25, 2024
1 change: 1 addition & 0 deletions changelog/63654.fixed.md
@@ -0,0 +1 @@
Fix master ip detection when DNS records change
5 changes: 2 additions & 3 deletions conf/minion
@@ -271,9 +271,8 @@
#ping_interval: 0

# To auto recover minions if master changes IP address (DDNS)
# auth_tries: 10
# auth_safemode: True
# ping_interval: 2
# master_alive_interval: 10
# master_tries: -1
#
# Minions won't know the master is missing until a ping fails. After the ping fails,
# the minion will attempt authentication, likely fail, and then restart.
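
For orientation, the sketch below shows the DDNS auto-recovery settings from the comment block above as the opts keys that salt/minion.py reads (self.opts[...]). It is illustrative only; the values mirror the commented examples and are not recommendations from this PR.

# Illustrative sketch only: the DDNS auto-recovery settings from conf/minion,
# expressed as the opts dictionary the minion code works with at runtime.
opts = {
    "auth_tries": 10,             # re-authentication attempts after a failure
    "auth_safemode": True,        # restart the minion if re-authentication keeps failing
    "ping_interval": 2,           # minutes between scheduled pings to the master
    "master_alive_interval": 10,  # seconds between status.master alive checks
    "master_tries": -1,           # -1: keep trying to reconnect forever
}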
4 changes: 3 additions & 1 deletion doc/ref/configuration/minion.rst
@@ -291,7 +291,9 @@ Default: ``0``

Configures how often, in seconds, the minion will verify that the current
master is alive and responding. The minion will try to establish a connection
to the next master in the list if it finds the existing one is dead.
to the next master in the list if it finds the existing one is dead. This
setting can also be used to detect master DNS record changes when a minion has
been disconnected.

.. code-block:: yaml

180 changes: 75 additions & 105 deletions salt/minion.py
@@ -2826,9 +2826,60 @@ def handle_event(self, package):
# we are not connected anymore
self.connected = False
log.info("Connection to master %s lost", self.opts["master"])
if self.opts["transport"] != "tcp":
self.schedule.delete_job(name=master_event(type="alive"))

log.info("Trying to tune in to next master from master-list")

if hasattr(self, "pub_channel"):
self.pub_channel.on_recv(None)
if hasattr(self.pub_channel, "auth"):
self.pub_channel.auth.invalidate()
if hasattr(self.pub_channel, "close"):
self.pub_channel.close()
if hasattr(self, "req_channel") and self.req_channel:
self.req_channel.close()
self.req_channel = None

# if eval_master finds a new master for us, self.connected
# will be True again on successful master authentication
try:
master, self.pub_channel = yield self.eval_master(
opts=self.opts,
failed=True,
failback=tag.startswith(master_event(type="failback")),
)
except SaltClientError:
pass
Comment on lines +2846 to +2853
Collaborator

Suggested change
try:
master, self.pub_channel = yield self.eval_master(
opts=self.opts,
failed=True,
failback=tag.startswith(master_event(type="failback")),
)
except SaltClientError:
pass
with contextlib.suppress(SaltClientError):
master, self.pub_channel = yield self.eval_master(
opts=self.opts,
failed=True,
failback=tag.startswith(master_event(type="failback")),
)

https://docs.python.org/3/library/contextlib.html#contextlib.suppress

Although, not a blocker for merging the PR


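To make the suggestion concrete, here is a small standalone sketch contrasting the two forms. SaltClientError is stubbed and eval_master_stub is a placeholder introduced for this example; neither is Salt's real code.

import contextlib


class SaltClientError(Exception):
    """Stand-in for salt.exceptions.SaltClientError so the example is self-contained."""


def eval_master_stub():
    # Placeholder for Minion.eval_master(); always fails to illustrate the suppression.
    raise SaltClientError("no master could be reached")


# Form used in the PR: swallow the error and carry on with self.connected still False.
master = None
try:
    master = eval_master_stub()
except SaltClientError:
    pass

# Equivalent form suggested in review: contextlib.suppress expresses the same intent
# without an empty except block.
with contextlib.suppress(SaltClientError):
    master = eval_master_stub()

print(master)  # None either way, because eval_master_stub always raises
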
if self.connected:
self.opts["master"] = master

if self.opts["master_type"] != "failover":
# modify the scheduled job to fire on reconnect
# re-init the subsystems to work with the new master
log.info(
"Re-initialising subsystems for new master %s",
self.opts["master"],
)

self.req_channel = salt.channel.client.AsyncReqChannel.factory(
self.opts, io_loop=self.io_loop
)

# put the current schedule into the new loaders
self.opts["schedule"] = self.schedule.option("schedule")
(
self.functions,
self.returners,
self.function_errors,
self.executors,
) = self._load_modules()
# make the schedule to use the new 'functions' loader
self.schedule.functions = self.functions
self.pub_channel.on_recv(self._handle_payload)
self._fire_master_minion_start()
log.info("Minion is ready to receive requests!")

# update scheduled job to run with the new master addr
if self.opts["transport"] != "tcp":
schedule = {
"function": "status.master",
@@ -2838,116 +2889,35 @@ def handle_event(self, package):
"return_job": False,
"kwargs": {
"master": self.opts["master"],
"connected": False,
"connected": True,
},
}
self.schedule.modify_job(
name=master_event(type="alive", master=self.opts["master"]),
schedule=schedule,
)
else:
# delete the scheduled job to don't interfere with the failover process
if self.opts["transport"] != "tcp":
self.schedule.delete_job(name=master_event(type="alive"))

log.info("Trying to tune in to next master from master-list")

if hasattr(self, "pub_channel"):
self.pub_channel.on_recv(None)
if hasattr(self.pub_channel, "auth"):
self.pub_channel.auth.invalidate()
if hasattr(self.pub_channel, "close"):
self.pub_channel.close()
if hasattr(self, "req_channel") and self.req_channel:
self.req_channel.close()
self.req_channel = None

# if eval_master finds a new master for us, self.connected
# will be True again on successful master authentication
try:
master, self.pub_channel = yield self.eval_master(
opts=self.opts,
failed=True,
failback=tag.startswith(master_event(type="failback")),
)
except SaltClientError:
pass

if self.connected:
self.opts["master"] = master

# re-init the subsystems to work with the new master
log.info(
"Re-initialising subsystems for new master %s",
self.opts["master"],
)

self.req_channel = salt.channel.client.AsyncReqChannel.factory(
self.opts, io_loop=self.io_loop
)

# put the current schedule into the new loaders
self.opts["schedule"] = self.schedule.option("schedule")
(
self.functions,
self.returners,
self.function_errors,
self.executors,
) = self._load_modules()
# make the schedule to use the new 'functions' loader
self.schedule.functions = self.functions
self.pub_channel.on_recv(self._handle_payload)
self._fire_master_minion_start()
log.info("Minion is ready to receive requests!")

# update scheduled job to run with the new master addr
if self.opts["transport"] != "tcp":
schedule = {
"function": "status.master",
"seconds": self.opts["master_alive_interval"],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {
"master": self.opts["master"],
"connected": True,
},
}
self.schedule.modify_job(
name=master_event(
type="alive", master=self.opts["master"]
),
schedule=schedule,
)

if (
self.opts["master_failback"]
and "master_list" in self.opts
):
if self.opts["master"] != self.opts["master_list"][0]:
schedule = {
"function": "status.ping_master",
"seconds": self.opts[
"master_failback_interval"
],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {
"master": self.opts["master_list"][0]
},
}
self.schedule.modify_job(
name=master_event(type="failback"),
schedule=schedule,
)
else:
self.schedule.delete_job(
name=master_event(type="failback"), persist=True
)
else:
self.restart = True
self.io_loop.stop()
if self.opts["master_failback"] and "master_list" in self.opts:
if self.opts["master"] != self.opts["master_list"][0]:
schedule = {
"function": "status.ping_master",
"seconds": self.opts["master_failback_interval"],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {"master": self.opts["master_list"][0]},
}
self.schedule.modify_job(
name=master_event(type="failback"),
schedule=schedule,
)
else:
self.schedule.delete_job(
name=master_event(type="failback"), persist=True
)
else:
self.restart = True
self.io_loop.stop()

elif tag.startswith(master_event(type="connected")):
# handle this event only once. otherwise it will pollute the log
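
For reference, the "alive" job that both branches of this diff (re)register boils down to the same dictionary. The sketch below rebuilds it from a plain opts dict so it can run standalone; build_alive_job is a helper name invented here, not a function in salt/minion.py.

def build_alive_job(opts, connected=True):
    # Mirrors the schedule dictionary passed to self.schedule.modify_job(...) above:
    # run status.master every master_alive_interval seconds against the current
    # master, reporting whether the minion believes it is connected.
    return {
        "function": "status.master",
        "seconds": opts["master_alive_interval"],
        "jid_include": True,
        "maxrunning": 1,
        "return_job": False,
        "kwargs": {"master": opts["master"], "connected": connected},
    }


if __name__ == "__main__":
    opts = {"master": "master.local", "master_alive_interval": 10}
    print(build_alive_job(opts))
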
Empty file.
94 changes: 94 additions & 0 deletions tests/pytests/scenarios/dns/conftest.py
@@ -0,0 +1,94 @@
import logging
import pathlib
import subprocess

import pytest

log = logging.getLogger(__name__)


@pytest.fixture(scope="package")
def master_alive_interval():
return 5


class HostsFile:
"""
Simple helper class for tests that need to modify /etc/hosts.
"""

def __init__(self, path, orig_text):
self._path = path
self._orig_text = orig_text

@property
def orig_text(self):
return self._orig_text

def __getattr__(self, key):
if key in ["_path", "_orig_text", "orig_text"]:
return self.__getattribute__(key)
return getattr(self._path, key)


@pytest.fixture
def etc_hosts():
hosts = pathlib.Path("/etc/hosts")
orig_text = hosts.read_text(encoding="utf-8")
hosts = HostsFile(hosts, orig_text)
try:
yield hosts
finally:
hosts.write_text(orig_text)


@pytest.fixture(scope="package")
def master(request, salt_factories):

subprocess.check_output(["ip", "addr", "add", "172.16.0.1/32", "dev", "lo"])

config_defaults = {
"open_mode": True,
"transport": request.config.getoption("--transport"),
}
config_overrides = {
"interface": "0.0.0.0",
}
factory = salt_factories.salt_master_daemon(
"master",
defaults=config_defaults,
overrides=config_overrides,
extra_cli_arguments_after_first_start_failure=["--log-level=info"],
)
with factory.started(start_timeout=180):
yield factory

try:
subprocess.check_output(["ip", "addr", "del", "172.16.0.1/32", "dev", "lo"])
except subprocess.CalledProcessError:
pass


@pytest.fixture(scope="package")
def salt_cli(master):
return master.salt_cli(timeout=180)


@pytest.fixture(scope="package")
def minion(master, master_alive_interval):
config_defaults = {
"transport": master.config["transport"],
}
port = master.config["ret_port"]
config_overrides = {
"master": f"master.local:{port}",
"publish_port": master.config["publish_port"],
"master_alive_interval": master_alive_interval,
}
factory = master.salt_minion_daemon(
"minion",
defaults=config_defaults,
overrides=config_overrides,
extra_cli_arguments_after_first_start_failure=["--log-level=info"],
)
return factory
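
A hypothetical test built on these fixtures might look like the sketch below. The pytest-salt-factories calls (minion.started(), salt_cli.run(...), minion.id) and the timing strategy are assumptions for illustration, not part of this PR, and the test needs root to rewrite /etc/hosts.

def test_minion_follows_master_dns_change(minion, salt_cli, etc_hosts, master_alive_interval):
    # Start with master.local resolving to localhost, where the master listens.
    etc_hosts.write_text(etc_hosts.orig_text + "\n127.0.0.1 master.local\n")
    with minion.started():
        ret = salt_cli.run("test.ping", minion_tgt=minion.id)
        assert ret.returncode == 0
        # Simulate a DNS record change: master.local now resolves to the loopback
        # alias (172.16.0.1) added by the master fixture.
        etc_hosts.write_text(etc_hosts.orig_text + "\n172.16.0.1 master.local\n")
        # After roughly master_alive_interval seconds the minion should notice the
        # lost connection, re-resolve master.local, and reconnect to the new address;
        # a real test would wait here and assert that another test.ping succeeds.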