Skip to content

Commit

Permalink
Minions check dns when re-connecting to a master
Browse files Browse the repository at this point in the history
Check for a changing DNS record anytime a minion gets disconnected from
its master. See GitHub issues #63654 and #61482.
  • Loading branch information
dwoz committed Apr 25, 2024
1 parent b1754ee commit d2c59f4
Showing 1 changed file with 75 additions and 105 deletions.
180 changes: 75 additions & 105 deletions salt/minion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2826,9 +2826,60 @@ def handle_event(self, package):
# we are not connected anymore
self.connected = False
log.info("Connection to master %s lost", self.opts["master"])
if self.opts["transport"] != "tcp":
self.schedule.delete_job(name=master_event(type="alive"))

log.info("Trying to tune in to next master from master-list")

if hasattr(self, "pub_channel"):
self.pub_channel.on_recv(None)
if hasattr(self.pub_channel, "auth"):
self.pub_channel.auth.invalidate()
if hasattr(self.pub_channel, "close"):
self.pub_channel.close()
if hasattr(self, "req_channel") and self.req_channel:
self.req_channel.close()
self.req_channel = None

# if eval_master finds a new master for us, self.connected
# will be True again on successful master authentication
try:
master, self.pub_channel = yield self.eval_master(
opts=self.opts,
failed=True,
failback=tag.startswith(master_event(type="failback")),
)
except SaltClientError:
pass

if self.connected:
self.opts["master"] = master

if self.opts["master_type"] != "failover":
# modify the scheduled job to fire on reconnect
# re-init the subsystems to work with the new master
log.info(
"Re-initialising subsystems for new master %s",
self.opts["master"],
)

self.req_channel = salt.channel.client.AsyncReqChannel.factory(
self.opts, io_loop=self.io_loop
)

# put the current schedule into the new loaders
self.opts["schedule"] = self.schedule.option("schedule")
(
self.functions,
self.returners,
self.function_errors,
self.executors,
) = self._load_modules()
# make the schedule to use the new 'functions' loader
self.schedule.functions = self.functions
self.pub_channel.on_recv(self._handle_payload)
self._fire_master_minion_start()
log.info("Minion is ready to receive requests!")

# update scheduled job to run with the new master addr
if self.opts["transport"] != "tcp":
schedule = {
"function": "status.master",
Expand All @@ -2838,116 +2889,35 @@ def handle_event(self, package):
"return_job": False,
"kwargs": {
"master": self.opts["master"],
"connected": False,
"connected": True,
},
}
self.schedule.modify_job(
name=master_event(type="alive", master=self.opts["master"]),
schedule=schedule,
)
else:
# delete the scheduled job to don't interfere with the failover process
if self.opts["transport"] != "tcp":
self.schedule.delete_job(name=master_event(type="alive"))

log.info("Trying to tune in to next master from master-list")

if hasattr(self, "pub_channel"):
self.pub_channel.on_recv(None)
if hasattr(self.pub_channel, "auth"):
self.pub_channel.auth.invalidate()
if hasattr(self.pub_channel, "close"):
self.pub_channel.close()
if hasattr(self, "req_channel") and self.req_channel:
self.req_channel.close()
self.req_channel = None

# if eval_master finds a new master for us, self.connected
# will be True again on successful master authentication
try:
master, self.pub_channel = yield self.eval_master(
opts=self.opts,
failed=True,
failback=tag.startswith(master_event(type="failback")),
)
except SaltClientError:
pass

if self.connected:
self.opts["master"] = master

# re-init the subsystems to work with the new master
log.info(
"Re-initialising subsystems for new master %s",
self.opts["master"],
)

self.req_channel = salt.channel.client.AsyncReqChannel.factory(
self.opts, io_loop=self.io_loop
)

# put the current schedule into the new loaders
self.opts["schedule"] = self.schedule.option("schedule")
(
self.functions,
self.returners,
self.function_errors,
self.executors,
) = self._load_modules()
# make the schedule to use the new 'functions' loader
self.schedule.functions = self.functions
self.pub_channel.on_recv(self._handle_payload)
self._fire_master_minion_start()
log.info("Minion is ready to receive requests!")

# update scheduled job to run with the new master addr
if self.opts["transport"] != "tcp":
schedule = {
"function": "status.master",
"seconds": self.opts["master_alive_interval"],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {
"master": self.opts["master"],
"connected": True,
},
}
self.schedule.modify_job(
name=master_event(
type="alive", master=self.opts["master"]
),
schedule=schedule,
)

if (
self.opts["master_failback"]
and "master_list" in self.opts
):
if self.opts["master"] != self.opts["master_list"][0]:
schedule = {
"function": "status.ping_master",
"seconds": self.opts[
"master_failback_interval"
],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {
"master": self.opts["master_list"][0]
},
}
self.schedule.modify_job(
name=master_event(type="failback"),
schedule=schedule,
)
else:
self.schedule.delete_job(
name=master_event(type="failback"), persist=True
)
else:
self.restart = True
self.io_loop.stop()
if self.opts["master_failback"] and "master_list" in self.opts:
if self.opts["master"] != self.opts["master_list"][0]:
schedule = {
"function": "status.ping_master",
"seconds": self.opts["master_failback_interval"],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {"master": self.opts["master_list"][0]},
}
self.schedule.modify_job(
name=master_event(type="failback"),
schedule=schedule,
)
else:
self.schedule.delete_job(
name=master_event(type="failback"), persist=True
)
else:
self.restart = True
self.io_loop.stop()

elif tag.startswith(master_event(type="connected")):
# handle this event only once. otherwise it will pollute the log
Expand Down

0 comments on commit d2c59f4

Please sign in to comment.