diff --git a/changelog/63654.fixed.md b/changelog/63654.fixed.md
new file mode 100644
index 00000000000..d39d0212a0a
--- /dev/null
+++ b/changelog/63654.fixed.md
@@ -0,0 +1 @@
+Fix master ip detection when DNS records change
diff --git a/conf/minion b/conf/minion
index a54f1659a79..70cbe8934a4 100644
--- a/conf/minion
+++ b/conf/minion
@@ -271,9 +271,8 @@
 #ping_interval: 0

 # To auto recover minions if master changes IP address (DDNS)
-#    auth_tries: 10
-#    auth_safemode: True
-#    ping_interval: 2
+#    master_alive_interval: 10
+#    master_tries: -1
 #
 # Minions won't know master is missing until a ping fails. After the ping fail,
 # the minion will attempt authentication and likely fails out and cause a restart.
diff --git a/doc/ref/configuration/minion.rst b/doc/ref/configuration/minion.rst
index 0f99da2f1c0..925b54cc47f 100644
--- a/doc/ref/configuration/minion.rst
+++ b/doc/ref/configuration/minion.rst
@@ -291,7 +291,9 @@ Default: ``0``

 Configures how often, in seconds, the minion will verify that the current
 master is alive and responding. The minion will try to establish a connection
-to the next master in the list if it finds the existing one is dead.
+to the next master in the list if it finds the existing one is dead. This
+setting can also be used to detect master DNS record changes when a minion has
+been disconnected.

 .. code-block:: yaml

diff --git a/salt/minion.py b/salt/minion.py
index e6a9521766a..12c9a86ba2a 100644
--- a/salt/minion.py
+++ b/salt/minion.py
@@ -2826,9 +2826,60 @@ def handle_event(self, package):
                 # we are not connected anymore
                 self.connected = False
                 log.info("Connection to master %s lost", self.opts["master"])
+                if self.opts["transport"] != "tcp":
+                    self.schedule.delete_job(name=master_event(type="alive"))
+
+                log.info("Trying to tune in to next master from master-list")
+
+                if hasattr(self, "pub_channel"):
+                    self.pub_channel.on_recv(None)
+                    if hasattr(self.pub_channel, "auth"):
+                        self.pub_channel.auth.invalidate()
+                    if hasattr(self.pub_channel, "close"):
+                        self.pub_channel.close()
+                if hasattr(self, "req_channel") and self.req_channel:
+                    self.req_channel.close()
+                    self.req_channel = None
+
+                # if eval_master finds a new master for us, self.connected
+                # will be True again on successful master authentication
+                try:
+                    master, self.pub_channel = yield self.eval_master(
+                        opts=self.opts,
+                        failed=True,
+                        failback=tag.startswith(master_event(type="failback")),
+                    )
+                except SaltClientError:
+                    pass
+
+                if self.connected:
+                    self.opts["master"] = master

-                if self.opts["master_type"] != "failover":
-                    # modify the scheduled job to fire on reconnect
+                    # re-init the subsystems to work with the new master
+                    log.info(
+                        "Re-initialising subsystems for new master %s",
+                        self.opts["master"],
+                    )
+
+                    self.req_channel = salt.channel.client.AsyncReqChannel.factory(
+                        self.opts, io_loop=self.io_loop
+                    )
+
+                    # put the current schedule into the new loaders
+                    self.opts["schedule"] = self.schedule.option("schedule")
+                    (
+                        self.functions,
+                        self.returners,
+                        self.function_errors,
+                        self.executors,
+                    ) = self._load_modules()
+                    # make the schedule to use the new 'functions' loader
+                    self.schedule.functions = self.functions
+                    self.pub_channel.on_recv(self._handle_payload)
+                    self._fire_master_minion_start()
+                    log.info("Minion is ready to receive requests!")
+
+                    # update scheduled job to run with the new master addr
                     if self.opts["transport"] != "tcp":
                         schedule = {
                             "function": "status.master",
@@ -2838,116 +2889,35 @@ def handle_event(self, package):
                             "return_job": False,
                             "kwargs": {
                                 "master": self.opts["master"],
self.opts["master"], - "connected": False, + "connected": True, }, } self.schedule.modify_job( name=master_event(type="alive", master=self.opts["master"]), schedule=schedule, ) - else: - # delete the scheduled job to don't interfere with the failover process - if self.opts["transport"] != "tcp": - self.schedule.delete_job(name=master_event(type="alive")) - - log.info("Trying to tune in to next master from master-list") - - if hasattr(self, "pub_channel"): - self.pub_channel.on_recv(None) - if hasattr(self.pub_channel, "auth"): - self.pub_channel.auth.invalidate() - if hasattr(self.pub_channel, "close"): - self.pub_channel.close() - if hasattr(self, "req_channel") and self.req_channel: - self.req_channel.close() - self.req_channel = None - - # if eval_master finds a new master for us, self.connected - # will be True again on successful master authentication - try: - master, self.pub_channel = yield self.eval_master( - opts=self.opts, - failed=True, - failback=tag.startswith(master_event(type="failback")), - ) - except SaltClientError: - pass - - if self.connected: - self.opts["master"] = master - - # re-init the subsystems to work with the new master - log.info( - "Re-initialising subsystems for new master %s", - self.opts["master"], - ) - - self.req_channel = salt.channel.client.AsyncReqChannel.factory( - self.opts, io_loop=self.io_loop - ) - - # put the current schedule into the new loaders - self.opts["schedule"] = self.schedule.option("schedule") - ( - self.functions, - self.returners, - self.function_errors, - self.executors, - ) = self._load_modules() - # make the schedule to use the new 'functions' loader - self.schedule.functions = self.functions - self.pub_channel.on_recv(self._handle_payload) - self._fire_master_minion_start() - log.info("Minion is ready to receive requests!") - - # update scheduled job to run with the new master addr - if self.opts["transport"] != "tcp": - schedule = { - "function": "status.master", - "seconds": self.opts["master_alive_interval"], - "jid_include": True, - "maxrunning": 1, - "return_job": False, - "kwargs": { - "master": self.opts["master"], - "connected": True, - }, - } - self.schedule.modify_job( - name=master_event( - type="alive", master=self.opts["master"] - ), - schedule=schedule, - ) - if ( - self.opts["master_failback"] - and "master_list" in self.opts - ): - if self.opts["master"] != self.opts["master_list"][0]: - schedule = { - "function": "status.ping_master", - "seconds": self.opts[ - "master_failback_interval" - ], - "jid_include": True, - "maxrunning": 1, - "return_job": False, - "kwargs": { - "master": self.opts["master_list"][0] - }, - } - self.schedule.modify_job( - name=master_event(type="failback"), - schedule=schedule, - ) - else: - self.schedule.delete_job( - name=master_event(type="failback"), persist=True - ) - else: - self.restart = True - self.io_loop.stop() + if self.opts["master_failback"] and "master_list" in self.opts: + if self.opts["master"] != self.opts["master_list"][0]: + schedule = { + "function": "status.ping_master", + "seconds": self.opts["master_failback_interval"], + "jid_include": True, + "maxrunning": 1, + "return_job": False, + "kwargs": {"master": self.opts["master_list"][0]}, + } + self.schedule.modify_job( + name=master_event(type="failback"), + schedule=schedule, + ) + else: + self.schedule.delete_job( + name=master_event(type="failback"), persist=True + ) + else: + self.restart = True + self.io_loop.stop() elif tag.startswith(master_event(type="connected")): # handle this event only once. 
diff --git a/tests/pytests/scenarios/dns/__init__.py b/tests/pytests/scenarios/dns/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/pytests/scenarios/dns/conftest.py b/tests/pytests/scenarios/dns/conftest.py
new file mode 100644
index 00000000000..254e8ee9a28
--- /dev/null
+++ b/tests/pytests/scenarios/dns/conftest.py
@@ -0,0 +1,94 @@
+import logging
+import pathlib
+import subprocess
+
+import pytest
+
+log = logging.getLogger(__name__)
+
+
+@pytest.fixture(scope="package")
+def master_alive_interval():
+    return 5
+
+
+class HostsFile:
+    """
+    Simple helper class for tests that need to modify /etc/hosts.
+    """
+
+    def __init__(self, path, orig_text):
+        self._path = path
+        self._orig_text = orig_text
+
+    @property
+    def orig_text(self):
+        return self._orig_text
+
+    def __getattr__(self, key):
+        if key in ["_path", "_orig_text", "orig_text"]:
+            return self.__getattribute__(key)
+        return getattr(self._path, key)
+
+
+@pytest.fixture
+def etc_hosts():
+    hosts = pathlib.Path("/etc/hosts")
+    orig_text = hosts.read_text(encoding="utf-8")
+    hosts = HostsFile(hosts, orig_text)
+    try:
+        yield hosts
+    finally:
+        hosts.write_text(orig_text)
+
+
+@pytest.fixture(scope="package")
+def master(request, salt_factories):
+
+    subprocess.check_output(["ip", "addr", "add", "172.16.0.1/32", "dev", "lo"])
+
+    config_defaults = {
+        "open_mode": True,
+        "transport": request.config.getoption("--transport"),
+    }
+    config_overrides = {
+        "interface": "0.0.0.0",
+    }
+    factory = salt_factories.salt_master_daemon(
+        "master",
+        defaults=config_defaults,
+        overrides=config_overrides,
+        extra_cli_arguments_after_first_start_failure=["--log-level=info"],
+    )
+    with factory.started(start_timeout=180):
+        yield factory
+
+    try:
+        subprocess.check_output(["ip", "addr", "del", "172.16.0.1/32", "dev", "lo"])
+    except subprocess.CalledProcessError:
+        pass
+
+
+@pytest.fixture(scope="package")
+def salt_cli(master):
+    return master.salt_cli(timeout=180)
+
+
+@pytest.fixture(scope="package")
+def minion(master, master_alive_interval):
+    config_defaults = {
+        "transport": master.config["transport"],
+    }
+    port = master.config["ret_port"]
+    config_overrides = {
+        "master": f"master.local:{port}",
+        "publish_port": master.config["publish_port"],
+        "master_alive_interval": master_alive_interval,
+    }
+    factory = master.salt_minion_daemon(
+        "minion",
+        defaults=config_defaults,
+        overrides=config_overrides,
+        extra_cli_arguments_after_first_start_failure=["--log-level=info"],
+    )
+    return factory
diff --git a/tests/pytests/scenarios/dns/multimaster/conftest.py b/tests/pytests/scenarios/dns/multimaster/conftest.py
new file mode 100644
index 00000000000..3b50ed65c60
--- /dev/null
+++ b/tests/pytests/scenarios/dns/multimaster/conftest.py
@@ -0,0 +1,119 @@
+import logging
+import os
+import shutil
+import subprocess
+
+import pytest
+
+log = logging.getLogger(__name__)
+
+
+@pytest.fixture(scope="package")
+def salt_mm_master_1(request, salt_factories):
+
+    subprocess.check_output(["ip", "addr", "add", "172.16.0.1/32", "dev", "lo"])
+
+    config_defaults = {
+        "open_mode": True,
+        "transport": request.config.getoption("--transport"),
+    }
+    config_overrides = {
+        "interface": "0.0.0.0",
+        "master_sign_pubkey": True,
+    }
+    factory = salt_factories.salt_master_daemon(
+        "mm-master-1",
+        defaults=config_defaults,
+        overrides=config_overrides,
+        extra_cli_arguments_after_first_start_failure=["--log-level=info"],
+    )
+    try:
+        with factory.started(start_timeout=180):
+            yield factory
+    finally:
+
+        try:
+            subprocess.check_output(["ip", "addr", "del", "172.16.0.1/32", "dev", "lo"])
+        except subprocess.CalledProcessError:
+            pass
+
+
+@pytest.fixture(scope="package")
+def mm_master_1_salt_cli(salt_mm_master_1):
+    return salt_mm_master_1.salt_cli(timeout=180)
+
+
+@pytest.fixture(scope="package")
+def salt_mm_master_2(salt_factories, salt_mm_master_1):
+    # if salt.utils.platform.is_darwin() or salt.utils.platform.is_freebsd():
+    #     subprocess.check_output(["ifconfig", "lo0", "alias", "127.0.0.2", "up"])
+
+    config_defaults = {
+        "open_mode": True,
+        "transport": salt_mm_master_1.config["transport"],
+    }
+    config_overrides = {
+        "interface": "0.0.0.0",
+        "master_sign_pubkey": True,
+    }
+
+    # Use different ports for the second master since both bind to the same interface
+    for key in (
+        "ret_port",
+        "publish_port",
+    ):
+        config_overrides[key] = salt_mm_master_1.config[key] + 1
+    factory = salt_factories.salt_master_daemon(
+        "mm-master-2",
+        defaults=config_defaults,
+        overrides=config_overrides,
+        extra_cli_arguments_after_first_start_failure=["--log-level=info"],
+    )
+
+    # Both masters will share the same signing key pair
+    for keyfile in ("master_sign.pem", "master_sign.pub"):
+        shutil.copyfile(
+            os.path.join(salt_mm_master_1.config["pki_dir"], keyfile),
+            os.path.join(factory.config["pki_dir"], keyfile),
+        )
+    with factory.started(start_timeout=180):
+        yield factory
+
+
+@pytest.fixture(scope="package")
+def mm_master_2_salt_cli(salt_mm_master_2):
+    return salt_mm_master_2.salt_cli(timeout=180)
+
+
+@pytest.fixture(scope="package")
+def salt_mm_minion_1(salt_mm_master_1, salt_mm_master_2, master_alive_interval):
+    config_defaults = {
+        "transport": salt_mm_master_1.config["transport"],
+    }
+
+    mm_master_1_port = salt_mm_master_1.config["ret_port"]
+    mm_master_2_port = salt_mm_master_2.config["ret_port"]
+    config_overrides = {
+        "master": [
+            f"master1.local:{mm_master_1_port}",
+            f"master2.local:{mm_master_2_port}",
+        ],
+        "publish_port": salt_mm_master_1.config["publish_port"],
+        "master_alive_interval": master_alive_interval,
+        "master_tries": -1,
+        "verify_master_pubkey_sign": True,
+        "retry_dns": True,
+    }
+    factory = salt_mm_master_1.salt_minion_daemon(
+        "mm-minion-1",
+        defaults=config_defaults,
+        overrides=config_overrides,
+        extra_cli_arguments_after_first_start_failure=["--log-level=info"],
+    )
+    # Need to grab the public signing key from the master, either will do
+    shutil.copyfile(
+        os.path.join(salt_mm_master_1.config["pki_dir"], "master_sign.pub"),
+        os.path.join(factory.config["pki_dir"], "master_sign.pub"),
+    )
+    # with factory.started(start_timeout=180):
+    yield factory
diff --git a/tests/pytests/scenarios/dns/multimaster/test_dns.py b/tests/pytests/scenarios/dns/multimaster/test_dns.py
new file mode 100644
index 00000000000..5e0fc4c80f7
--- /dev/null
+++ b/tests/pytests/scenarios/dns/multimaster/test_dns.py
@@ -0,0 +1,50 @@
+import logging
+import subprocess
+import time
+
+import pytest
+
+log = logging.getLogger(__name__)
+
+
+@pytest.mark.skip_unless_on_linux
+def test_multimaster_dns(
+    salt_mm_master_1,
+    salt_mm_minion_1,
+    mm_master_1_salt_cli,
+    etc_hosts,
+    caplog,
+    master_alive_interval,
+):
+    """
+    Verify a minion configured with multimaster hot/hot will pick up a master's
+    DNS change if it's been disconnected.
+ """ + + etc_hosts.write_text( + f"{etc_hosts.orig_text}\n172.16.0.1 master1.local master2.local" + ) + + log.info("Added hosts record for master1.local and master2.local") + + with salt_mm_minion_1.started(start_timeout=180): + with caplog.at_level(logging.INFO): + ret = mm_master_1_salt_cli.run("test.ping", minion_tgt="mm-minion-1") + assert ret.returncode == 0 + etc_hosts.write_text( + f"{etc_hosts.orig_text}\n127.0.0.1 master1.local master2.local" + ) + log.info("Changed hosts record for master1.local and master2.local") + subprocess.check_output(["ip", "addr", "del", "172.16.0.1/32", "dev", "lo"]) + log.info("Removed secondary master IP address.") + # Wait for the minion's master_alive_interval, adding a second for + # reliablity. + time.sleep(master_alive_interval + 1) + assert ( + "Master ip address changed from 172.16.0.1 to 127.0.0.1" in caplog.text + ) + ret = mm_master_1_salt_cli.run("test.ping", minion_tgt="mm-minion-1") + assert ret.returncode == 0 + assert ( + "Master ip address changed from 172.16.0.1 to 127.0.0.1" in caplog.text + ) diff --git a/tests/pytests/scenarios/dns/test_dns.py b/tests/pytests/scenarios/dns/test_dns.py new file mode 100644 index 00000000000..df19d928c0f --- /dev/null +++ b/tests/pytests/scenarios/dns/test_dns.py @@ -0,0 +1,33 @@ +import logging +import subprocess +import time + +import pytest + +log = logging.getLogger(__name__) + + +@pytest.mark.skip_unless_on_linux +def test_dns_change(master, minion, salt_cli, etc_hosts, caplog, master_alive_interval): + """ + Verify a minion will pick up a master's dns change if it's been disconnected. + """ + + etc_hosts.write_text(f"{etc_hosts.orig_text}\n172.16.0.1 master.local") + + with minion.started(start_timeout=180): + with caplog.at_level(logging.INFO): + ret = salt_cli.run("test.ping", minion_tgt="minion") + assert ret.returncode == 0 + etc_hosts.write_text(f"{etc_hosts.orig_text}\n127.0.0.1 master.local") + log.info("Changed hosts record for master1.local and master2.local") + subprocess.check_output(["ip", "addr", "del", "172.16.0.1/32", "dev", "lo"]) + log.info("Removed secondary master IP address.") + # Wait for the minion's master_alive_interval, adding a second for + # reliablity. + time.sleep(master_alive_interval + 1) + assert ( + "Master ip address changed from 172.16.0.1 to 127.0.0.1" in caplog.text + ) + ret = salt_cli.run("test.ping", minion_tgt="minion") + assert ret.returncode == 0 diff --git a/tests/pytests/scenarios/failover/multimaster/test_failover_master.py b/tests/pytests/scenarios/failover/multimaster/test_failover_master.py index f661e9ab9a4..84ab7548ff4 100644 --- a/tests/pytests/scenarios/failover/multimaster/test_failover_master.py +++ b/tests/pytests/scenarios/failover/multimaster/test_failover_master.py @@ -159,10 +159,6 @@ def test_minions_alive_with_no_master( """ Make sure the minions stay alive after all masters have stopped. """ - if grains["os_family"] == "Debian" and grains["osmajorrelease"] == 9: - pytest.skip( - "Skipping on Debian 9 until flaky issues resolved. See issue #61749" - ) start_time = time.time() with salt_mm_failover_master_1.stopped(): with salt_mm_failover_master_2.stopped():