Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev: sbd: Adjust timeout related values #890

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 41 additions & 16 deletions crmsh/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from . import tmpfiles
from . import lock
from . import userdir
from .constants import SSH_OPTION, QDEVICE_HELP_INFO, CRM_MON_ONE_SHOT
from .constants import SSH_OPTION, QDEVICE_HELP_INFO, CRM_MON_ONE_SHOT, STONITH_TIMEOUT_DEFAULT
from . import ocfs2
from . import qdevice
from . import log
Expand Down Expand Up @@ -63,13 +63,14 @@
BOOTH_DIR = "/etc/booth"
BOOTH_CFG = "/etc/booth/booth.conf"
BOOTH_AUTH = "/etc/booth/authkey"
SBD_SYSTEMD_DELAY_START_DIR = "/etc/systemd/system/sbd.service.d"
FILES_TO_SYNC = (BOOTH_DIR, corosync.conf(), COROSYNC_AUTH, CSYNC2_CFG, CSYNC2_KEY, "/etc/ctdb/nodes",
"/etc/drbd.conf", "/etc/drbd.d", "/etc/ha.d/ldirectord.cf", "/etc/lvm/lvm.conf", "/etc/multipath.conf",
"/etc/samba/smb.conf", SYSCONFIG_NFS, SYSCONFIG_PCMK, SYSCONFIG_SBD, PCMK_REMOTE_AUTH, WATCHDOG_CFG,
PROFILES_FILE, CRM_CFG)

PROFILES_FILE, CRM_CFG, SBD_SYSTEMD_DELAY_START_DIR)
INIT_STAGES = ("ssh", "ssh_remote", "csync2", "csync2_remote", "corosync", "sbd", "cluster", "ocfs2", "admin", "qdevice")


class QdevicePolicy(Enum):
QDEVICE_RELOAD = 0
QDEVICE_RESTART = 1
Expand Down Expand Up @@ -698,12 +699,14 @@ def start_pacemaker(node_list=[]):
Start pacemaker service with wait time for sbd
When node_list set, start pacemaker service in parallel
"""
from .sbd import SBDManager
from .sbd import SBDTimeout
pacemaker_start_msg = "Starting pacemaker"
if utils.package_is_installed("sbd") and \
# not _context means not in init or join process
if not _context and \
utils.package_is_installed("sbd") and \
utils.service_is_enabled("sbd.service") and \
SBDManager.is_delay_start():
pacemaker_start_msg += "(waiting for sbd {}s)".format(SBDManager.get_suitable_sbd_systemd_timeout())
SBDTimeout.is_sbd_delay_start():
pacemaker_start_msg += "(delaying start of sbd for {}s)".format(SBDTimeout.get_sbd_delay_start_sec_from_sysconfig())
with logger_utils.status_long(pacemaker_start_msg):
utils.start_service("pacemaker.service", enable=True, node_list=node_list)

Expand Down Expand Up @@ -1237,7 +1240,7 @@ def init_cluster():
rsc_defaults rsc-options: resource-stickiness=1 migration-threshold=3
""")

_context.sbd_manager.configure_sbd_resource()
_context.sbd_manager.configure_sbd_resource_and_properties()


def init_admin():
Expand Down Expand Up @@ -1334,20 +1337,17 @@ def init_qdevice():
utils.disable_service("corosync-qdevice.service")
return
if _context.stage == "qdevice":
from .sbd import SBDManager
from .sbd import SBDManager, SBDTimeout
utils.check_all_nodes_reachable()
using_diskless_sbd = SBDManager.is_using_diskless_sbd()
_context.qdevice_reload_policy = evaluate_qdevice_quorum_effect(QDEVICE_ADD, using_diskless_sbd)
# add qdevice after diskless sbd started
if using_diskless_sbd:
res = SBDManager.get_sbd_value_from_config("SBD_WATCHDOG_TIMEOUT")
if res:
sbd_watchdog_timeout = max(int(res), SBDManager.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE)
else:
sbd_watchdog_timeout = SBDManager.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE
stonith_timeout = SBDManager.calculate_stonith_timeout(sbd_watchdog_timeout)
SBDManager.update_configuration({"SBD_WATCHDOG_TIMEOUT": str(sbd_watchdog_timeout)})
invokerc("crm configure property stonith-watchdog-timeout=-1 stonith-timeout={}s".format(stonith_timeout))
if not res or int(res) < SBDTimeout.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE:
sbd_watchdog_timeout_qdevice = SBDTimeout.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE
SBDManager.update_configuration({"SBD_WATCHDOG_TIMEOUT": str(sbd_watchdog_timeout_qdevice)})
utils.set_property(stonith_timeout=SBDTimeout.get_stonith_timeout())

logger.info("""Configure Qdevice/Qnetd:""")
qdevice_inst = _context.qdevice_inst
Expand Down Expand Up @@ -1822,6 +1822,14 @@ def update_nodeid(nodeid, node=None):
# attempt to join the cluster failed)
init_cluster_local()

if utils.service_is_active("sbd.service"):
from .sbd import SBDTimeout
SBDTimeout.adjust_sbd_timeout_related_cluster_configuration()
else:
value = get_stonith_timeout_generally_expected()
if value:
utils.set_property_conditionally("stonith-timeout", value)

with logger_utils.status_long("Reloading cluster configuration"):

if ipv6_flag and not is_unicast:
Expand Down Expand Up @@ -1927,6 +1935,10 @@ def remove_node_from_cluster():
"""
Remove node from running cluster and the corosync / pacemaker configuration.
"""
if utils.service_is_active("sbd.service"):
from .sbd import SBDTimeout
SBDTimeout.adjust_sbd_timeout_related_cluster_configuration(removing=True)

node = _context.cluster_node
set_cluster_node_ip()

Expand Down Expand Up @@ -2392,4 +2404,17 @@ def bootstrap_arbitrator(context):
logger.info("Enabling and starting the booth arbitrator service")
utils.start_service("booth@booth", enable=True)


def get_stonith_timeout_generally_expected():
    """
    Compute the stonith-timeout value generally expected for the cluster

    Formula:

    stonith-timeout = STONITH_TIMEOUT_DEFAULT + token + consensus

    Return None when the "stonith-enabled" cluster property is a boolean
    false value, otherwise the computed timeout
    """
    if utils.is_boolean_false(utils.get_property("stonith-enabled")):
        # stonith is disabled, so there is no timeout to propose
        return None

    corosync_timeout = corosync.token_and_consensus_timeout()
    return STONITH_TIMEOUT_DEFAULT + corosync_timeout
# EOF
2 changes: 2 additions & 0 deletions crmsh/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,4 +524,6 @@
"""
STANDBY_NV_RE = r'(<nvpair.*{node_id}.*name="standby".*)value="{value}"(.*)'
CRM_MON_ONE_SHOT = "crm_mon -1"
# Base value of stonith-timeout; bootstrap adds corosync token + consensus on top of it
# (presumably in seconds, as the corosync values are converted from ms before summing -- TODO confirm)
STONITH_TIMEOUT_DEFAULT = 60
liangxin1300 marked this conversation as resolved.
Show resolved Hide resolved
PCMK_DELAY_MAX = 30
# vim:ts=4:sw=4:et:
39 changes: 39 additions & 0 deletions crmsh/corosync.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
logger = log.setup_logger(__name__)


COROSYNC_TOKEN_DEFAULT = 1000 # in ms units
liangxin1300 marked this conversation as resolved.
Show resolved Hide resolved


def conf():
    """
    Return the path of the main corosync configuration file

    The COROSYNC_MAIN_CONFIG_FILE environment variable, when set,
    overrides the default /etc/corosync/corosync.conf
    """
    default_path = '/etc/corosync/corosync.conf'
    return os.environ.get('COROSYNC_MAIN_CONFIG_FILE', default_path)

Expand Down Expand Up @@ -762,3 +765,39 @@ def create_configuration(clustername="hacluster",
_COROSYNC_CONF_TEMPLATE_RING_ALL + \
_COROSYNC_CONF_TEMPLATE_TAIL
utils.str2file(_COROSYNC_CONF_TEMPLATE % config_common, conf())


def get_corosync_value(key):
    """
    Get corosync configuration value from corosync-cmapctl or corosync.conf

    key is the configuration key to look up, e.g. "totem.token".

    The output of "corosync-cmapctl <key>" is consulted first: the text
    captured after "=" on the line for that key is returned, or None when
    the key does not show up in the output.
    When running the command raises ValueError, fall back to get_value(key).
    """
    try:
        out = utils.get_stdout_or_raise_error("corosync-cmapctl {}".format(key))
    except ValueError:
        # presumably raised when corosync-cmapctl fails (e.g. corosync not running) -- TODO confirm
        return get_value(key)

    # Escape the key so that its dots match literally instead of acting as regex wildcards
    res = re.search(r'{}\s+.*=\s+(.*)'.format(re.escape(key)), out)
    return res.group(1) if res else None


def get_corosync_value_dict():
    """
    Collect the corosync token and consensus timeouts into a dict

    The returned dict has the keys "token" and "consensus". Configured
    values are divided by 1000 and truncated to int. A missing token falls
    back to COROSYNC_TOKEN_DEFAULT; a missing consensus is derived as
    1.2 times the "token" entry.
    """
    raw_token = get_corosync_value("totem.token")
    if raw_token:
        token = int(int(raw_token)/1000)
    else:
        token = int(COROSYNC_TOKEN_DEFAULT/1000)

    raw_consensus = get_corosync_value("totem.consensus")
    if raw_consensus:
        consensus = int(int(raw_consensus)/1000)
    else:
        # No explicit consensus: derive it from the token entry computed above
        consensus = int(token*1.2)

    return {"token": token, "consensus": consensus}


def token_and_consensus_timeout():
    """
    Return the sum of the corosync token and consensus timeouts
    """
    timeouts = get_corosync_value_dict()
    token, consensus = timeouts["token"], timeouts["consensus"]
    return token + consensus
18 changes: 3 additions & 15 deletions crmsh/crash_test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,40 +98,28 @@ def json_dumps():
os.fsync(f)


def get_property(name):
    """
    Query a cluster property via "crm configure get_property"

    Return the command's stdout, or None when the command exits non-zero
    """
    rc, stdout, _ = crmshutils.get_stdout_stderr("crm configure get_property " + name)
    return stdout if rc == 0 else None


class FenceInfo(object):
"""
Class to collect fence info
"""
@property
def fence_enabled(self):
enable_result = get_property("stonith-enabled")
enable_result = crmshutils.get_property("stonith-enabled")
if not enable_result or enable_result.lower() != "true":
return False
return True

@property
def fence_action(self):
action_result = get_property("stonith-action")
action_result = crmshutils.get_property("stonith-action")
if action_result is None or action_result not in ["off", "poweroff", "reboot"]:
msg_error("Cluster property \"stonith-action\" should be reboot|off|poweroff")
return None
return action_result

@property
def fence_timeout(self):
timeout_result = get_property("stonith-timeout")
timeout_result = crmshutils.get_property("stonith-timeout")
if timeout_result and re.match(r'[1-9][0-9]*(s|)$', timeout_result):
return timeout_result.strip("s")
return config.FENCE_TIMEOUT
Expand Down
Loading