Skip to content

Commit

Permalink
Dev: sbd: Adjust timeout related values
Browse files Browse the repository at this point in the history
* Consolidate sbd timeout related methods/constants/formulas into class SBDTimeout

* Adjust stonith-timeout value, formulas are:
  stonith-timeout >= 1.2 * (pcmk_delay_max + msgwait)  # for disk-based sbd
  stonith-timeout >= 1.2 * max(stonith_watchdog_timeout, 2*SBD_WATCHDOG_TIMEOUT)  # for disk-less sbd
  stonith-timeout >= max(STONITH_TIMEOUT_DEFAULT, token+consensus)  # for all situations

* Adjust SBD_DELAY_START value, formulas are:
  SBD_DELAY_START = no  # for non-virtualized environments or non-2-node clusters, which is the system default
  SBD_DELAY_START >= (token + consensus + pcmk_delay_max + msgwait)  # for disk-based sbd
  SBD_DELAY_START >= (token + consensus + 2*SBD_WATCHDOG_TIMEOUT) # for disk-less sbd

* pcmk_delay_max=30 # only for the single stonith device in the 2-node cluster without qdevice
  pcmk_delay_max is deleted  # only when there is a single stonith device and the cluster is not a 2-node cluster without qdevice
  • Loading branch information
liangxin1300 committed Dec 2, 2021
1 parent caa0b2a commit c4ab4ab
Show file tree
Hide file tree
Showing 6 changed files with 424 additions and 190 deletions.
57 changes: 41 additions & 16 deletions crmsh/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from . import tmpfiles
from . import lock
from . import userdir
from .constants import SSH_OPTION, QDEVICE_HELP_INFO, CRM_MON_ONE_SHOT
from .constants import SSH_OPTION, QDEVICE_HELP_INFO, CRM_MON_ONE_SHOT, STONITH_TIMEOUT_DEFAULT
from . import ocfs2
from . import qdevice
from . import log
Expand Down Expand Up @@ -63,13 +63,14 @@
BOOTH_DIR = "/etc/booth"
BOOTH_CFG = "/etc/booth/booth.conf"
BOOTH_AUTH = "/etc/booth/authkey"
SBD_SYSTEMD_DELAY_START_DIR = "/etc/systemd/system/sbd.service.d"
FILES_TO_SYNC = (BOOTH_DIR, corosync.conf(), COROSYNC_AUTH, CSYNC2_CFG, CSYNC2_KEY, "/etc/ctdb/nodes",
"/etc/drbd.conf", "/etc/drbd.d", "/etc/ha.d/ldirectord.cf", "/etc/lvm/lvm.conf", "/etc/multipath.conf",
"/etc/samba/smb.conf", SYSCONFIG_NFS, SYSCONFIG_PCMK, SYSCONFIG_SBD, PCMK_REMOTE_AUTH, WATCHDOG_CFG,
PROFILES_FILE, CRM_CFG)

PROFILES_FILE, CRM_CFG, SBD_SYSTEMD_DELAY_START_DIR)
INIT_STAGES = ("ssh", "ssh_remote", "csync2", "csync2_remote", "corosync", "sbd", "cluster", "ocfs2", "admin", "qdevice")


class QdevicePolicy(Enum):
QDEVICE_RELOAD = 0
QDEVICE_RESTART = 1
Expand Down Expand Up @@ -698,12 +699,14 @@ def start_pacemaker(node_list=[]):
Start pacemaker service with wait time for sbd
When node_list set, start pacemaker service in parallel
"""
from .sbd import SBDManager
from .sbd import SBDTimeout
pacemaker_start_msg = "Starting pacemaker"
if utils.package_is_installed("sbd") and \
# not _context means not in init or join process
if not _context and \
utils.package_is_installed("sbd") and \
utils.service_is_enabled("sbd.service") and \
SBDManager.is_delay_start():
pacemaker_start_msg += "(waiting for sbd {}s)".format(SBDManager.get_suitable_sbd_systemd_timeout())
SBDTimeout.is_sbd_delay_start():
pacemaker_start_msg += "(delaying start of sbd for {}s)".format(SBDTimeout.get_sbd_delay_start_sec_from_sysconfig())
with logger_utils.status_long(pacemaker_start_msg):
utils.start_service("pacemaker.service", enable=True, node_list=node_list)

Expand Down Expand Up @@ -1237,7 +1240,7 @@ def init_cluster():
rsc_defaults rsc-options: resource-stickiness=1 migration-threshold=3
""")

_context.sbd_manager.configure_sbd_resource()
_context.sbd_manager.configure_sbd_resource_and_properties()


def init_admin():
Expand Down Expand Up @@ -1334,20 +1337,17 @@ def init_qdevice():
utils.disable_service("corosync-qdevice.service")
return
if _context.stage == "qdevice":
from .sbd import SBDManager
from .sbd import SBDManager, SBDTimeout
utils.check_all_nodes_reachable()
using_diskless_sbd = SBDManager.is_using_diskless_sbd()
_context.qdevice_reload_policy = evaluate_qdevice_quorum_effect(QDEVICE_ADD, using_diskless_sbd)
# add qdevice after diskless sbd started
if using_diskless_sbd:
res = SBDManager.get_sbd_value_from_config("SBD_WATCHDOG_TIMEOUT")
if res:
sbd_watchdog_timeout = max(int(res), SBDManager.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE)
else:
sbd_watchdog_timeout = SBDManager.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE
stonith_timeout = SBDManager.calculate_stonith_timeout(sbd_watchdog_timeout)
SBDManager.update_configuration({"SBD_WATCHDOG_TIMEOUT": str(sbd_watchdog_timeout)})
invokerc("crm configure property stonith-watchdog-timeout=-1 stonith-timeout={}s".format(stonith_timeout))
if not res or int(res) < SBDTimeout.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE:
sbd_watchdog_timeout_qdevice = SBDTimeout.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE
SBDManager.update_configuration({"SBD_WATCHDOG_TIMEOUT": str(sbd_watchdog_timeout_qdevice)})
utils.set_property(stonith_timeout=SBDTimeout.get_stonith_timeout())

logger.info("""Configure Qdevice/Qnetd:""")
qdevice_inst = _context.qdevice_inst
Expand Down Expand Up @@ -1822,6 +1822,14 @@ def update_nodeid(nodeid, node=None):
# attempt to join the cluster failed)
init_cluster_local()

if utils.service_is_active("sbd.service"):
from .sbd import SBDTimeout
SBDTimeout.adjust_sbd_timeout_related_cluster_configuration()
else:
value = get_stonith_timeout_generally_expected()
if value:
utils.set_property_conditionally("stonith-timeout", value)

with logger_utils.status_long("Reloading cluster configuration"):

if ipv6_flag and not is_unicast:
Expand Down Expand Up @@ -1927,6 +1935,10 @@ def remove_node_from_cluster():
"""
Remove node from running cluster and the corosync / pacemaker configuration.
"""
if utils.service_is_active("sbd.service"):
from .sbd import SBDTimeout
SBDTimeout.adjust_sbd_timeout_related_cluster_configuration(removing=True)

node = _context.cluster_node
set_cluster_node_ip()

Expand Down Expand Up @@ -2392,4 +2404,17 @@ def bootstrap_arbitrator(context):
logger.info("Enabling and starting the booth arbitrator service")
utils.start_service("booth@booth", enable=True)


def get_stonith_timeout_generally_expected():
    """
    Return the stonith-timeout value expected in all scenarios.

    Formula: stonith-timeout >= max(STONITH_TIMEOUT_DEFAULT, token+consensus)

    Returns None when stonith is disabled (nothing to adjust then).
    """
    # Fencing turned off -> no timeout to compute
    if utils.is_boolean_false(utils.get_property("stonith-enabled")):
        return None

    return max(STONITH_TIMEOUT_DEFAULT, corosync.token_and_consensus_timeout())
# EOF
2 changes: 2 additions & 0 deletions crmsh/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,4 +524,6 @@
"""
STANDBY_NV_RE = r'(<nvpair.*{node_id}.*name="standby".*)value="{value}"(.*)'
CRM_MON_ONE_SHOT = "crm_mon -1"
STONITH_TIMEOUT_DEFAULT = 60
PCMK_DELAY_MAX = 30
# vim:ts=4:sw=4:et:
39 changes: 39 additions & 0 deletions crmsh/corosync.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
logger = log.setup_logger(__name__)


COROSYNC_TOKEN_DEFAULT = 1000 # in ms units


def conf():
return os.getenv('COROSYNC_MAIN_CONFIG_FILE', '/etc/corosync/corosync.conf')

Expand Down Expand Up @@ -762,3 +765,39 @@ def create_configuration(clustername="hacluster",
_COROSYNC_CONF_TEMPLATE_RING_ALL + \
_COROSYNC_CONF_TEMPLATE_TAIL
utils.str2file(_COROSYNC_CONF_TEMPLATE % config_common, conf())


def get_corosync_value(key):
    """
    Get a corosync configuration value, preferring the live runtime view.

    First queries corosync-cmapctl; when that fails (corosync not running,
    command error), falls back to parsing the static corosync.conf.

    :param key: dotted corosync key, e.g. "totem.token"
    :return: the value as a string, or None when the key is not found
    """
    try:
        out = utils.get_stdout_or_raise_error("corosync-cmapctl {}".format(key))
        # re.escape: keys contain '.', which would otherwise match ANY
        # character in the pattern and could hit the wrong line
        res = re.search(r'{}\s+.*=\s+(.*)'.format(re.escape(key)), out)
        return res.group(1) if res else None
    except ValueError:
        # cmapctl unavailable or failed; read the on-disk configuration
        return get_value(key)


def get_corosync_value_dict():
    """
    Collect corosync timeout values, converted from milliseconds to seconds.

    :return: dict with integer keys "token" and "consensus" (seconds)
    """
    value_dict = {}

    token = get_corosync_value("totem.token")
    # corosync reports milliseconds; callers work in whole seconds
    value_dict["token"] = int(token) // 1000 if token else COROSYNC_TOKEN_DEFAULT // 1000

    consensus = get_corosync_value("totem.consensus")
    # corosync's default for consensus is 1.2 * token
    value_dict["consensus"] = int(consensus) // 1000 if consensus else int(value_dict["token"] * 1.2)

    return value_dict


def token_and_consensus_timeout():
    """
    Return the sum of the corosync token and consensus timeouts, in seconds.
    """
    values = get_corosync_value_dict()
    return sum(values[name] for name in ("token", "consensus"))
18 changes: 3 additions & 15 deletions crmsh/crash_test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,40 +98,28 @@ def json_dumps():
os.fsync(f)


def get_property(name):
    """
    Return the value of cluster property *name*, or None when the
    `crm configure get_property` command fails.
    """
    rc, stdout, _ = crmshutils.get_stdout_stderr("crm configure get_property " + name)
    return stdout if rc == 0 else None


class FenceInfo(object):
"""
Class to collect fence info
"""
    @property
    def fence_enabled(self):
        """Whether fencing is on: cluster property "stonith-enabled" equals "true"."""
        enable_result = get_property("stonith-enabled")
        # NOTE(review): the line above looks like pre-refactor residue — it is
        # immediately overwritten by the crmshutils call below; confirm and remove.
        enable_result = crmshutils.get_property("stonith-enabled")
        # Missing property or anything other than "true" counts as disabled
        if not enable_result or enable_result.lower() != "true":
            return False
        return True

    @property
    def fence_action(self):
        """Return "stonith-action" if it is one of off/poweroff/reboot, else None (with an error message)."""
        action_result = get_property("stonith-action")
        # NOTE(review): duplicated lookup — the first call appears to be
        # pre-refactor residue shadowed by the next line; confirm and remove.
        action_result = crmshutils.get_property("stonith-action")
        if action_result is None or action_result not in ["off", "poweroff", "reboot"]:
            msg_error("Cluster property \"stonith-action\" should be reboot|off|poweroff")
            return None
        return action_result

    @property
    def fence_timeout(self):
        """Return "stonith-timeout" in seconds as a string (trailing "s" stripped), or the configured default."""
        timeout_result = get_property("stonith-timeout")
        # NOTE(review): duplicated lookup — the first call is shadowed by the
        # crmshutils call below; likely pre-refactor residue, confirm and remove.
        timeout_result = crmshutils.get_property("stonith-timeout")
        # Accept a positive integer with an optional "s" suffix, e.g. "60" or "60s"
        if timeout_result and re.match(r'[1-9][0-9]*(s|)$', timeout_result):
            return timeout_result.strip("s")
        return config.FENCE_TIMEOUT
Expand Down
Loading

0 comments on commit c4ab4ab

Please sign in to comment.