Support --all or specific node to manage cluster and nodes #797

Merged: 8 commits, Nov 16, 2021
43 changes: 30 additions & 13 deletions crmsh/bootstrap.py
@@ -33,7 +33,7 @@
from . import tmpfiles
from . import lock
from . import userdir
from .constants import SSH_OPTION, QDEVICE_HELP_INFO
from .constants import SSH_OPTION, QDEVICE_HELP_INFO, CRM_MON_ONE_SHOT
from . import ocfs2
from . import qdevice
from . import log
@@ -363,36 +363,52 @@ def wait_for_resource(message, resource):
sleep(1)


def wait_for_cluster():
with logger_utils.status_long("Waiting for cluster"):
def wait_for_cluster(message="Waiting for cluster", node_list=[]):
"""
Wait for local node or specific node(s) online
"""
# Sleep here since just after pacemaker.service starts, crm_mon might not be ready
sleep(2)
# Check if already online
if is_online(node_list):
return

with logger_utils.status_long(message):
while True:
_rc, out, _err = utils.get_stdout_stderr("crm_mon -1")
if is_online(out):
if is_online(node_list):
break
status_progress()
sleep(2)
# Sleep here because when do_stop calls wait_for_cluster just after the nodes come online,
# some nodes might hang in a pending state if we do not wait
sleep(2)


def get_cluster_node_hostname():
"""
Get the hostname of the cluster node
"""
peer_node = None
if _context.cluster_node:
if _context and _context.cluster_node:
rc, out, err = utils.get_stdout_stderr("ssh {} {} crm_node --name".format(SSH_OPTION, _context.cluster_node))
if rc != 0:
utils.fatal(err)
peer_node = out
return peer_node


def is_online(crm_mon_txt):
def is_online(node_list=[]):
"""
Check whether local node is online
Check whether the local node, or specific node(s), are online
Besides that, during the join process, check whether the init node is online
"""
if not re.search("Online: .* {} ".format(utils.this_node()), crm_mon_txt):
return False
_list = node_list if node_list else [utils.this_node()]
crm_mon_txt = utils.get_stdout_or_raise_error(CRM_MON_ONE_SHOT, remote=_list[0])
# Make sure all the nodes are online
# TODO: what about nodes that have been shut down?
for node in _list:
if not re.search(r'Online:\s+\[.*{}\s+.*'.format(node), crm_mon_txt):
return False

# if peer_node is None, this is in the init process
peer_node = get_cluster_node_hostname()
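
The per-node check above hinges on the "Online: [ ... ]" line printed by crm_mon -1. As a small self-contained illustration, using a made-up crm_mon excerpt (the sample text is an assumption, not output captured for this PR):

    import re

    # Hypothetical "crm_mon -1" excerpt; real output may differ in detail.
    crm_mon_txt = "Node List:\n  * Online: [ alpha bravo ]\n"

    for node in ["alpha", "bravo", "charlie"]:
        online = bool(re.search(r'Online:\s+\[.*{}\s+.*'.format(node), crm_mon_txt))
        print(node, online)   # alpha True, bravo True, charlie False
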
@@ -677,9 +693,10 @@ def init_cluster_local():
wait_for_cluster()


def start_pacemaker():
def start_pacemaker(node_list=[]):
"""
Start pacemaker service with wait time for sbd
When node_list is set, start the pacemaker service on those nodes in parallel
"""
from .sbd import SBDManager
pacemaker_start_msg = "Starting pacemaker"
@@ -688,7 +705,7 @@ def start_pacemaker():
SBDManager.is_delay_start():
pacemaker_start_msg += "(waiting for sbd {}s)".format(SBDManager.get_suitable_sbd_systemd_timeout())
with logger_utils.status_long(pacemaker_start_msg):
utils.start_service("pacemaker.service", enable=True)
utils.start_service("pacemaker.service", enable=True, node_list=node_list)
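
With the new node_list parameter, start_pacemaker() can bring up pacemaker.service on several nodes at once. The diff does not show how utils.start_service() implements this, so the following is only a generic sketch of the parallel pattern; the helper name, ssh options and error handling are assumptions, not crmsh's API:

    import subprocess
    from concurrent.futures import ThreadPoolExecutor

    def start_service_parallel(service, node_list):
        """Start a systemd service on every node in node_list concurrently via ssh."""
        def _start(node):
            cmd = ["ssh", "-o", "StrictHostKeyChecking=no", node,
                   "systemctl", "start", service]
            return node, subprocess.run(cmd, capture_output=True).returncode

        with ThreadPoolExecutor(max_workers=max(len(node_list), 1)) as pool:
            for node, rc in pool.map(_start, node_list):
                if rc != 0:
                    raise IOError("Failed to start {} on {}".format(service, node))

    # start_service_parallel("pacemaker.service", ["node1", "node2"])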


def install_tmp(tmpfile, to):
@@ -1263,7 +1280,7 @@ def evaluate_qdevice_quorum_effect(mode, diskless_sbd=False):
elif mode == QDEVICE_REMOVE:
actual_votes -= 1

if utils.is_quorate(expected_votes, actual_votes) and not diskless_sbd:
if utils.calculate_quorate_status(expected_votes, actual_votes) and not diskless_sbd:
# safe to use reload
return QdevicePolicy.QDEVICE_RELOAD
elif utils.has_resource_running():
23 changes: 23 additions & 0 deletions crmsh/constants.py
@@ -501,4 +501,27 @@
YELLOW = '\033[33m'
GREEN = '\033[32m'
END = '\033[0m'


CIB_QUERY = "cibadmin -Q"
CIB_REPLACE = "cibadmin -R -X '{xmlstr}'"
CIB_RAW_FILE = "/var/lib/pacemaker/cib/cib.xml"
XML_NODE_PATH = "/cib/configuration/nodes/node"
XML_STATUS_PATH = "/cib/status/node_state"
XML_NODE_QUERY_STANDBY_PATH = "//nodes/node[@id='{node_id}']/instance_attributes/nvpair[@name='standby']/@value"
XML_STATUS_QUERY_STANDBY_PATH = "//status/node_state[@id='{node_id}']/transient_attributes/instance_attributes/nvpair[@name='standby']/@value"
STANDBY_TEMPLATE = """
<instance_attributes id="nodes-{node_id}">
<nvpair id="nodes-{node_id}-standby" name="standby" value="{value}"/>
</instance_attributes>
"""
STANDBY_TEMPLATE_REBOOT = """
<transient_attributes id="{node_id}">
<instance_attributes id="status-{node_id}">
<nvpair id="status-{node_id}-standby" name="standby" value="{value}"/>
</instance_attributes>
</transient_attributes>
"""
STANDBY_NV_RE = r'(<nvpair.*{node_id}.*name="standby".*)value="{value}"(.*)'
CRM_MON_ONE_SHOT = "crm_mon -1"
# vim:ts=4:sw=4:et:
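
These standby templates and the nvpair regex are plain string templates. A minimal sketch of how they might be filled in follows; the node id, the import, and the re.sub replacement shape are assumptions for illustration, not the call sites merged in this PR:

    # Sketch only: assumes crmsh is importable and that these constants are
    # combined roughly this way somewhere in the standby code path.
    import re
    from crmsh.constants import STANDBY_TEMPLATE, STANDBY_NV_RE

    node_id = "node1"                     # hypothetical node id
    standby_xml = STANDBY_TEMPLATE.format(node_id=node_id, value="on")
    # standby_xml now contains (roughly):
    #   <instance_attributes id="nodes-node1">
    #     <nvpair id="nodes-node1-standby" name="standby" value="on"/>
    #   </instance_attributes>

    # Flipping an existing nvpair's value with the regex constant:
    updated = re.sub(STANDBY_NV_RE.format(node_id=node_id, value="on"),
                     r'\1value="off"\2', standby_xml)
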
171 changes: 117 additions & 54 deletions crmsh/ui_cluster.py
@@ -29,13 +29,60 @@ def parse_options(parser, args):
options, args = parser.parse_known_args(list(args))
except:
return None, None
if options.help:
if hasattr(options, 'help') and options.help:
parser.print_help()
return None, None
utils.check_space_option_value(options)
return options, args


def parse_option_for_nodes(context, *args):
"""
Parse option for nodes
Return a node list
"""
action_type = context.get_command_name()
action_target = "node" if action_type in ["standby", "online"] else "cluster service"
action = "{} {}".format(action_type, action_target)
usage_template = """
Specify node(s) on which to {action}.
If no nodes are specified, {action} on the local node.
If --all is specified, {action} on all nodes."""
addtion_usage = ""
if action_type == "standby":
usage_template += """
\n\nAdditionally, you may specify a lifetime for the standby---if set to
"reboot", the node will be back online once it reboots. "forever" will
keep the node in standby after reboot. The lifetime defaults to
"forever"."""
addtion_usage = " [lifetime]"

parser = ArgParser(description=usage_template.format(action=action),
usage="{} [--all | <node>... ]{}".format(action_type, addtion_usage),
add_help=False,
formatter_class=RawDescriptionHelpFormatter)
parser.add_argument("-h", "--help", action="store_true", dest="help", help="Show this help message")
parser.add_argument("--all", help="To {} on all nodes".format(action), action="store_true", dest="all")

options, args = parse_options(parser, args)
if options is None or args is None:
raise utils.TerminateSubCommand
if options.all and args:
context.fatal_error("Should either use --all or specific node(s)")

# return local node
if not options.all and not args:
return [utils.this_node()]
member_list = utils.list_cluster_nodes()
if not member_list:
context.fatal_error("Cannot get the node list from cluster")
for node in args:
if node not in member_list:
context.fatal_error("Node \"{}\" is not a cluster node".format(node))
# return node list
return member_list if options.all else args
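
In isolation, the option handling above reduces to a small decision table: --all and explicit nodes are mutually exclusive, no argument means the local node, and explicit nodes must be cluster members. A standalone sketch of the same logic with stand-ins for the crmsh utilities (the member list and node names are made up, so this is not the crmsh code path itself):

    from argparse import ArgumentParser

    def pick_nodes(argv, member_list, local_node):
        parser = ArgumentParser(add_help=False)
        parser.add_argument("--all", action="store_true", dest="all")
        options, args = parser.parse_known_args(argv)
        if options.all and args:
            raise ValueError("Should either use --all or specific node(s)")
        if not options.all and not args:
            return [local_node]                 # default: act on the local node
        for node in args:
            if node not in member_list:
                raise ValueError('Node "{}" is not a cluster node'.format(node))
        return member_list if options.all else args

    members = ["alpha", "bravo", "charlie"]
    print(pick_nodes([], members, "alpha"))          # ['alpha']
    print(pick_nodes(["--all"], members, "alpha"))   # ['alpha', 'bravo', 'charlie']
    print(pick_nodes(["bravo"], members, "alpha"))   # ['bravo']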


def _remove_completer(args):
try:
n = utils.list_cluster_nodes()
@@ -91,78 +138,94 @@ def __init__(self):
self._inventory_target = None

@command.skill_level('administrator')
def do_start(self, context):
def do_start(self, context, *args):
'''
Starts the cluster services on this node
Starts the cluster services on all nodes or specific node(s)
'''
try:
if utils.service_is_active("pacemaker.service"):
logger.info("Cluster services already started")
return
bootstrap.start_pacemaker()
if utils.is_qdevice_configured():
utils.start_service("corosync-qdevice")
logger.info("Cluster services started")
except IOError as err:
context.fatal_error(str(err))

# TODO: optionally start services on all nodes or specific node
node_list = parse_option_for_nodes(context, *args)
for node in node_list[:]:
if utils.service_is_active("pacemaker.service", remote_addr=node):
logger.info("Cluster services already started on {}".format(node))
node_list.remove(node)
Review thread:

Member: Pacemaker.service being active doesn't necessarily mean corosync-qdevice.service is active as well, right? Should corosync-qdevice be checked for the full set of nodes as well?

Collaborator Author: When using bootstrap to set up a cluster with qdevice, the qdevice service will be enabled, so after a reboot corosync-qdevice will be started. @zzhou1 What do you think?

Contributor (zzhou1, Nov 16, 2021): "node_list.remove(node)" can be wrong in the use case of a normal stop/start, e.g. when pacemaker.service is up but qdevice.service is not in the meantime. Not necessary in the case of reboot.

Collaborator Author: Improved in #898.

if not node_list:
return

if utils.is_qdevice_configured():
utils.start_service("corosync-qdevice", node_list=node_list)
Review thread:

Member: Although there are no dependencies between corosync-qdevice and pacemaker, it'd be better to perform the start of corosync-qdevice before pacemaker.

Collaborator Author: bootstrap.start_pacemaker here starts pacemaker.service; according to the dependency, corosync.service will be started first, so starting qdevice afterwards does make sense. Otherwise, starting qdevice without corosync started will fail :)

Member (gao-yan, Nov 16, 2021): Given that corosync-qdevice.service has these defined as well:

    Wants=corosync.service
    After=corosync.service

I'd expect a pure start of corosync-qdevice.service to resolve the dependency as well.

Contributor: I can see the point of moving the corosync-qdevice.service start operation before pacemaker. There is no harm in having multiple systemctl starts, in theory.

Collaborator Author: Done

bootstrap.start_pacemaker(node_list)
for node in node_list:
logger.info("Cluster services started on {}".format(node))

@command.skill_level('administrator')
def do_stop(self, context):
def do_stop(self, context, *args):
'''
Stops the cluster services on this node
Stops the cluster services on all nodes or specific node(s)
'''
try:
if not utils.service_is_active("corosync.service"):
logger.info("Cluster services already stopped")
return
if utils.service_is_active("corosync-qdevice"):
utils.stop_service("corosync-qdevice")
utils.stop_service("corosync")
logger.info("Cluster services stopped")
except IOError as err:
context.fatal_error(str(err))

# TODO: optionally stop services on all nodes or specific node
node_list = parse_option_for_nodes(context, *args)
for node in node_list[:]:
if not utils.service_is_active("corosync.service", remote_addr=node):
if utils.service_is_active("sbd.service", remote_addr=node):
utils.stop_service("corosync", remote_addr=node)
logger.info("Cluster services stopped on {}".format(node))
else:
logger.info("Cluster services already stopped on {}".format(node))
node_list.remove(node)
if not node_list:
return

bootstrap.wait_for_cluster("Waiting for {} online".format(' '.join(node_list)), node_list)
Review thread:

Member: Is there a special purpose to inserting this into the stop procedure and waiting for the listed nodes to be online?

Collaborator Author: Yes.
  - Just after crm cluster start --all, the status of all nodes is UNCLEAN; if crm cluster stop --all is executed at this time, I found the stop process hangs, and on some nodes the output of crm_mon shows all nodes' status as pending.
  - If all nodes are already Online, the check at the top of wait_for_cluster returns early and the "Waiting for ... online" line stays quiet.

Contributor (zzhou1, Nov 16, 2021): Good point.

I can understand the wait experience in bootstrap. However, this makes me think it is probably not necessary to force the user to wait for crm cluster start, and the same for crm cluster stop; I would rather remove that kind of wait for both of them.

It makes sense to let crm cluster stop abort directly if the criteria are not met. That is reasonable error handling, rather than having the sysadmin really wait in front of the screen.

For scripts that do want to wait, before we implement a '--wait' option, the following example steps could help:

    crm cluster start
    crm wait_for_startup
    crm cluster stop

Well, my idea is debatable as a different flavor of user experience. It is not a critical one, I think.


# When dlm is configured and quorum is lost, before stopping the cluster service we should
# set enable_quorum_fencing=0 and enable_quorum_lockspace=0 in the dlm config options
if utils.is_dlm_configured(node_list[0]) and not utils.is_quorate(node_list[0]):
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
utils.set_dlm_option(peer=node_list[0], enable_quorum_fencing=0, enable_quorum_lockspace=0)
Review thread:

Member: IIUC, both enable_quorum_fencing and enable_quorum_lockspace need to be disabled for an inquorate dlm to gracefully stop? Could there be any risks?

Reply: Yes, both. In my opinion there isn't any risk, because this action is only triggered by the command "crm cluster stop". Before this patch, the cluster (via dlm_controld) denied any further action until quorum was true; after this patch, the cluster will directly stop, and there is no other behavior.

Member: The biggest concern is, given that this cluster partition is inquorate, there might be a quorate partition still standing... For example, this partition has been split and is inquorate, but somehow hasn't been fenced. If we are shutting down this cluster partition, and these settings make the inquorate partition able to acquire access to the lockspace and corrupt data, that would be a disaster. We should never sacrifice data integrity even for graceful shutdown...

Reply: enable_quorum_lockspace is disabled; this lets dlm lockspace related operations keep going when cluster quorum is lost.

Reply: Your comment makes sense to me.

Member (gao-yan, Nov 16, 2021): Thinking about it on the more safe/paranoid side: even if the user has confirmed to proceed, allowing this simultaneously on multiple nodes is more like opening a Pandora's box... Right after that, during stop, this cluster partition might split apart into even more partitions...

If we go for it, we could ask the user once, but we'd better proceed with this specific procedure in a serialized way: set_dlm_option -> stop dlm/pacemaker for only one node at a time, and proceed to another node only after it has succeeded on this one.

Contributor: My previous point concerns the last standing single-node situation. Given that the node is inquorate already, the proposed behavior for crm cluster stop (i.e. crmsh -> set_dlm_option -> stop dlm/pacemaker) is just a "fencing" operation to protect data integrity, though not necessarily STONITH at the node level to do a reboot.

For the situation with multiple inquorate nodes, i.e. multiple partitions, the code here does have a problem for the '--all' situation, simply because set_dlm_option only applies to one node. Not sure if it is simple enough to address in this PR, or whether to open an issue to clarify this in another PR.

Reply: set_dlm_option is implemented via "dlm_tool set_config", which only runs on a single node at a time.

Contributor (zzhou1, Nov 23, 2021): Fine. Given the above, when all nodes are inquorate, what are the suggested graceful shutdown steps internally for "stop --all"? My reading of the code is that it only changes the current local node and does not repeat the same on other nodes.

The situation is more interesting, in theory, for a big cluster where some nodes are quorate and some are not. Agreed, it is a transient corner case. What are the suggested internal steps for "stop --all"? @zhaohem

Contributor (zzhou1, Nov 23, 2021): Maybe crmsh should never operate a cluster in the transient state at all by default, and ask the user to answer Y/n, or use --force?


# Stop pacemaker first; it ensures the cluster keeps quorum until corosync is stopped
utils.stop_service("pacemaker", node_list=node_list)
# Then stop qdevice if it is active
if utils.service_is_active("corosync-qdevice.service", node_list[0]):
utils.stop_service("corosync-qdevice.service", node_list=node_list)
# Last, stop corosync
utils.stop_service("corosync", node_list=node_list)

for node in node_list:
logger.info("Cluster services stopped on {}".format(node))
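
The dlm discussion above suggests serializing the shutdown: relax the quorum options and stop services on one node at a time, moving on only after the previous node has stopped. The following is a rough sketch of that reviewer proposal, with the crmsh helpers passed in as callables; it is not the code merged in this PR, and qdevice handling is omitted for brevity:

    def stop_cluster_serialized(node_list, set_dlm_option, stop_service,
                                is_dlm_configured, is_quorate):
        """Stop cluster services node by node, relaxing dlm quorum options first
        where needed. The helper callables mirror the signatures used above."""
        for node in node_list:
            if is_dlm_configured(node) and not is_quorate(node):
                # Only relax quorum handling right before stopping this node,
                # never cluster-wide in one shot.
                set_dlm_option(peer=node, enable_quorum_fencing=0,
                               enable_quorum_lockspace=0)
            stop_service("pacemaker", remote_addr=node)
            stop_service("corosync", remote_addr=node)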

@command.skill_level('administrator')
def do_restart(self, context):
def do_restart(self, context, *args):
'''
Restarts the cluster services on all nodes or specific node(s)
'''
parse_option_for_nodes(context, *args)
self.do_stop(context, *args)
self.do_start(context, *args)

def _enable_disable_common(self, context, *args):
'''
Restarts the cluster services on this node
Common part for enable and disable
'''
self.do_stop(context)
self.do_start(context)
node_list = parse_option_for_nodes(context, *args)
action = context.get_command_name()
utils.cluster_run_cmd("systemctl {} pacemaker.service".format(action), node_list)
if utils.is_qdevice_configured():
utils.cluster_run_cmd("systemctl {} corosync-qdevice.service".format(action), node_list)
Review thread:

Member: Technically maybe it should be able to disable corosync-qdevice.service even if qdevice is not configured?

Collaborator Author: corosync-qdevice.service will not be started if it is not configured, right? So I think checking whether it is configured and then doing the action does no harm?

Member: I guess it depends on how is_qdevice_configured() does the check. Of course one could remove the qdevice configuration from corosync.conf before stopping the running corosync-qdevice.service...

Contributor: Good point. I think I would naturally expect crm cluster stop to stop corosync-qdevice.service even if corosync.conf has no qdevice.

Collaborator Author (liangxin1300, Dec 1, 2021): PR to improve this: #895. It checks whether corosync-qdevice.service is available, not whether it is configured. Stopping qdevice should not have this issue, since I check whether the service is active and only then stop it.

for node in node_list:
logger.info("Cluster services %s on %s", action+'d', node)

@command.skill_level('administrator')
def do_enable(self, context):
def do_enable(self, context, *args):
'''
Enable the cluster services on this node
'''
try:
utils.enable_service("pacemaker.service")
if utils.is_qdevice_configured():
utils.enable_service("corosync-qdevice.service")
logger.info("Cluster services enabled")
except IOError as err:
context.fatal_error(str(err))

# TODO: optionally enable services on all nodes or specific node
self._enable_disable_common(context, *args)

@command.skill_level('administrator')
def do_disable(self, context):
def do_disable(self, context, *args):
'''
Disable the cluster services on this node
'''
try:
utils.disable_service("pacemaker.service")
if utils.is_qdevice_configured():
utils.disable_service("corosync-qdevice.service")
logger.info("Cluster services disabled")
except IOError as err:
context.fatal_error(str(err))

# TODO: optionally disable services on all nodes or specific node
self._enable_disable_common(context, *args)

def _args_implicit(self, context, args, name):
'''
@@ -663,7 +726,7 @@ def do_wait_for_startup(self, context, timeout='10'):
@command.skill_level('expert')
def do_run(self, context, cmd, *nodes):
'''
Execute the given command on all nodes/specific node, report outcome
Execute the given command on all nodes/specific node(s), report outcome
'''
try:
import parallax