-
Notifications
You must be signed in to change notification settings - Fork 94
Support --all or specific node to manage cluster and nodes #797
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ba08299
f54bc6e
3b6fa88
923330c
bf12c2d
0f3ebd1
4022b37
827cb5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,13 +29,60 @@ def parse_options(parser, args): | |
options, args = parser.parse_known_args(list(args)) | ||
except: | ||
return None, None | ||
if options.help: | ||
if hasattr(options, 'help') and options.help: | ||
parser.print_help() | ||
return None, None | ||
utils.check_space_option_value(options) | ||
return options, args | ||
|
||
|
||
def parse_option_for_nodes(context, *args): | ||
""" | ||
Parse option for nodes | ||
Return a node list | ||
""" | ||
action_type = context.get_command_name() | ||
action_target = "node" if action_type in ["standby", "online"] else "cluster service" | ||
action = "{} {}".format(action_type, action_target) | ||
usage_template = """ | ||
Specify node(s) on which to {action}. | ||
If no nodes are specified, {action} on the local node. | ||
If --all is specified, {action} on all nodes.""" | ||
addtion_usage = "" | ||
if action_type == "standby": | ||
usage_template += """ | ||
\n\nAdditionally, you may specify a lifetime for the standby---if set to | ||
"reboot", the node will be back online once it reboots. "forever" will | ||
keep the node in standby after reboot. The life time defaults to | ||
"forever".""" | ||
addtion_usage = " [lifetime]" | ||
|
||
parser = ArgParser(description=usage_template.format(action=action), | ||
usage="{} [--all | <node>... ]{}".format(action_type, addtion_usage), | ||
add_help=False, | ||
formatter_class=RawDescriptionHelpFormatter) | ||
parser.add_argument("-h", "--help", action="store_true", dest="help", help="Show this help message") | ||
parser.add_argument("--all", help="To {} on all nodes".format(action), action="store_true", dest="all") | ||
|
||
options, args = parse_options(parser, args) | ||
if options is None or args is None: | ||
raise utils.TerminateSubCommand | ||
if options.all and args: | ||
context.fatal_error("Should either use --all or specific node(s)") | ||
|
||
# return local node | ||
if not options.all and not args: | ||
return [utils.this_node()] | ||
member_list = utils.list_cluster_nodes() | ||
if not member_list: | ||
context.fatal_error("Cannot get the node list from cluster") | ||
for node in args: | ||
if node not in member_list: | ||
context.fatal_error("Node \"{}\" is not a cluster node".format(node)) | ||
# return node list | ||
return member_list if options.all else args | ||
|
||
|
||
def _remove_completer(args): | ||
try: | ||
n = utils.list_cluster_nodes() | ||
|
@@ -91,78 +138,94 @@ def __init__(self): | |
self._inventory_target = None | ||
|
||
@command.skill_level('administrator') | ||
def do_start(self, context): | ||
def do_start(self, context, *args): | ||
''' | ||
Starts the cluster services on this node | ||
Starts the cluster services on all nodes or specific node(s) | ||
''' | ||
try: | ||
if utils.service_is_active("pacemaker.service"): | ||
logger.info("Cluster services already started") | ||
return | ||
bootstrap.start_pacemaker() | ||
if utils.is_qdevice_configured(): | ||
utils.start_service("corosync-qdevice") | ||
logger.info("Cluster services started") | ||
except IOError as err: | ||
context.fatal_error(str(err)) | ||
|
||
# TODO: optionally start services on all nodes or specific node | ||
node_list = parse_option_for_nodes(context, *args) | ||
for node in node_list[:]: | ||
if utils.service_is_active("pacemaker.service", remote_addr=node): | ||
logger.info("Cluster services already started on {}".format(node)) | ||
node_list.remove(node) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pacemaker.service being active doesn't necessarily mean corosync-qdevice.service is active as well, right? Should corosync-qdevice be checked for the full set of nodes as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When using bootstrap to setup cluster with qdevice, qdevice service will be enabled, so after reboot, corosync-qdevice will be started There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "node_list.remove(node)" can be wrong in the use case of normal stop/start, eg. when pacemaker.service is up but qdevice.service is not in the mean time. No necessary in the case of reboot. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Improve this in #898 |
||
if not node_list: | ||
return | ||
|
||
if utils.is_qdevice_configured(): | ||
utils.start_service("corosync-qdevice", node_list=node_list) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Although there's no dependencies between corosync-qdevice and pacemaker, it'd be better to perform start of corosync-qdevice before pacemaker. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given that corosync-qdevice.service has these defined as well:
I'd expect pure start of corosync-qdevice.service will resolve the dependency as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can get the point to move corosync-qdevice.service start operation before pacemaker. There is no harm to have multiple systemctl start, in theory. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
bootstrap.start_pacemaker(node_list) | ||
for node in node_list: | ||
logger.info("Cluster services started on {}".format(node)) | ||
|
||
@command.skill_level('administrator') | ||
def do_stop(self, context): | ||
def do_stop(self, context, *args): | ||
''' | ||
Stops the cluster services on this node | ||
Stops the cluster services on all nodes or specific node(s) | ||
''' | ||
try: | ||
if not utils.service_is_active("corosync.service"): | ||
logger.info("Cluster services already stopped") | ||
return | ||
if utils.service_is_active("corosync-qdevice"): | ||
utils.stop_service("corosync-qdevice") | ||
utils.stop_service("corosync") | ||
logger.info("Cluster services stopped") | ||
except IOError as err: | ||
context.fatal_error(str(err)) | ||
|
||
# TODO: optionally stop services on all nodes or specific node | ||
node_list = parse_option_for_nodes(context, *args) | ||
for node in node_list[:]: | ||
if not utils.service_is_active("corosync.service", remote_addr=node): | ||
if utils.service_is_active("sbd.service", remote_addr=node): | ||
utils.stop_service("corosync", remote_addr=node) | ||
logger.info("Cluster services stopped on {}".format(node)) | ||
else: | ||
logger.info("Cluster services already stopped on {}".format(node)) | ||
node_list.remove(node) | ||
if not node_list: | ||
return | ||
|
||
bootstrap.wait_for_cluster("Waiting for {} online".format(' '.join(node_list)), node_list) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there special purpose of inserting this into the stop procedure and waiting for the listed nodes to be online? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. I can understand the wait experience in bootstrap. However, this triggers me to think probably no necessary to force user to wait It makes sense to let For those scripts do want to 'wait', before we implement '--wait' option, the following example steps could help Well, my idea is debatable as the different flavor of the user experience. It is not a critical one I think. |
||
|
||
# When dlm configured and quorum is lost, before stop cluster service, should set | ||
# enable_quorum_fencing=0, enable_quorum_lockspace=0 for dlm config option | ||
liangxin1300 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if utils.is_dlm_configured(node_list[0]) and not utils.is_quorate(node_list[0]): | ||
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm") | ||
utils.set_dlm_option(peer=node_list[0], enable_quorum_fencing=0, enable_quorum_lockspace=0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC, both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, both. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The biggest concern is, given that this cluster partition is inquorate, there might a quorate partition standing... For example this partition has been split and inquorate, but somehow hasn't been fenced. If we are shutting down this cluster partition, and in case these settings make the inquorate partition be able to acquire the access to lockspace and corrupt data, that would be a disaster. We should never sacrifice data integrity even for graceful shutdown... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. enable_quorum_lockspace is disabled, this will make dlm lockspace related operation can keep going when the cluster quorum is lost. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. your comment makes sense to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thinking about it on the more safe/paranoid side, even if the user has confirmed to proceed, but allowing this simultaneously on multiple nodes is more like opening a Pandora's box... Since right after that, during stop, this cluster partition might spit apart into even more partitions... 
If we go for it, we could ask user once, but we'd better proceed this specific procedure the serialized way: so There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My previous point stays with the last standing single node situation. Given that the node is inquorate already, the proposed behavior for Well, for the situation with multiple inquorate nodes, aka. multiple partitions, then the code here do have problem for '--all' situation. Simply because set_dlm_option only applies to one node. Not sure, if it is simple enough to address it in this PR, or open an issue to clarify this in another PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. set_dlm_option is implemented by "dlm_tool set_config", which only run on a single node each time. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Fine. Given the above situation, when all nodes are inquorate, what's the suggested graceful shutdown steps internally for "stop --all" ? My reading of the code is it only change the current local node, no repeat the same on other nodes. The situation is more fun, in theory for a big cluster, some nodes are quorate, some are not. Agree, it is a transient corner case. What's the suggested internal steps for "stop --all"? @zhaohem There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe crmsh should never operate a cluster in the transient state at all by default? And ask user to answer Y/n, or use --force ? |
||
|
||
# Stop pacemaker since it can make sure cluster has quorum until stop corosync | ||
utils.stop_service("pacemaker", node_list=node_list) | ||
# Then, stop qdevice if is active | ||
if utils.service_is_active("corosync-qdevice.service", node_list[0]): | ||
utils.stop_service("corosync-qdevice.service", node_list=node_list) | ||
# Last, stop corosync | ||
utils.stop_service("corosync", node_list=node_list) | ||
|
||
for node in node_list: | ||
logger.info("Cluster services stopped on {}".format(node)) | ||
|
||
@command.skill_level('administrator') | ||
def do_restart(self, context): | ||
def do_restart(self, context, *args): | ||
''' | ||
Restarts the cluster services on all nodes or specific node(s) | ||
''' | ||
parse_option_for_nodes(context, *args) | ||
self.do_stop(context, *args) | ||
self.do_start(context, *args) | ||
|
||
def _enable_disable_common(self, context, *args): | ||
''' | ||
Restarts the cluster services on this node | ||
Common part for enable and disable | ||
''' | ||
self.do_stop(context) | ||
self.do_start(context) | ||
node_list = parse_option_for_nodes(context, *args) | ||
action = context.get_command_name() | ||
utils.cluster_run_cmd("systemctl {} pacemaker.service".format(action), node_list) | ||
if utils.is_qdevice_configured(): | ||
utils.cluster_run_cmd("systemctl {} corosync-qdevice.service".format(action), node_list) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Technically maybe it should be able to disable corosync-qdevice.service even if qdevice is not configured? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. corosync-qdevice.service will not be started if not configured, right? So I think to check if configured then do the action will no harm? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess it depends on how is_qdevice_configured() does the check. Of course one could remove qdevice configuration from corosync.conf before stopping the running corosync-qdevice.service... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, I think I would naturally expect There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PR to improve this: #895 |
||
for node in node_list: | ||
logger.info("Cluster services %s on %s", action+'d', node) | ||
|
||
@command.skill_level('administrator') | ||
def do_enable(self, context): | ||
def do_enable(self, context, *args): | ||
''' | ||
Enable the cluster services on this node | ||
''' | ||
try: | ||
utils.enable_service("pacemaker.service") | ||
if utils.is_qdevice_configured(): | ||
utils.enable_service("corosync-qdevice.service") | ||
logger.info("Cluster services enabled") | ||
except IOError as err: | ||
context.fatal_error(str(err)) | ||
|
||
# TODO: optionally enable services on all nodes or specific node | ||
self._enable_disable_common(context, *args) | ||
|
||
@command.skill_level('administrator') | ||
def do_disable(self, context): | ||
def do_disable(self, context, *args): | ||
''' | ||
Disable the cluster services on this node | ||
''' | ||
try: | ||
utils.disable_service("pacemaker.service") | ||
if utils.is_qdevice_configured(): | ||
utils.disable_service("corosync-qdevice.service") | ||
logger.info("Cluster services disabled") | ||
except IOError as err: | ||
context.fatal_error(str(err)) | ||
|
||
# TODO: optionally disable services on all nodes or specific node | ||
self._enable_disable_common(context, *args) | ||
|
||
def _args_implicit(self, context, args, name): | ||
''' | ||
|
@@ -663,7 +726,7 @@ def do_wait_for_startup(self, context, timeout='10'): | |
@command.skill_level('expert') | ||
def do_run(self, context, cmd, *nodes): | ||
''' | ||
Execute the given command on all nodes/specific node, report outcome | ||
Execute the given command on all nodes/specific node(s), report outcome | ||
''' | ||
try: | ||
import parallax | ||
|
Uh oh!
There was an error while loading. Please reload this page.