|
| 1 | +/* |
| 2 | + * Licensed to Elasticsearch under one or more contributor |
| 3 | + * license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright |
| 5 | + * ownership. Elasticsearch licenses this file to you under |
| 6 | + * the Apache License, Version 2.0 (the "License"); you may |
| 7 | + * not use this file except in compliance with the License. |
| 8 | + * You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +package org.elasticsearch.cluster.coordination; |
| 21 | + |
| 22 | +import org.elasticsearch.cluster.ClusterState; |
| 23 | +import org.elasticsearch.cluster.node.DiscoveryNode; |
| 24 | +import org.elasticsearch.common.component.AbstractComponent; |
| 25 | +import org.elasticsearch.common.settings.ClusterSettings; |
| 26 | +import org.elasticsearch.common.settings.Setting; |
| 27 | +import org.elasticsearch.common.settings.Setting.Property; |
| 28 | +import org.elasticsearch.common.settings.Settings; |
| 29 | +import org.elasticsearch.common.util.set.Sets; |
| 30 | + |
| 31 | +import java.util.Collection; |
| 32 | +import java.util.Set; |
| 33 | +import java.util.TreeSet; |
| 34 | +import java.util.stream.Collectors; |
| 35 | +import java.util.stream.Stream; |
| 36 | + |
| 37 | +/** |
| 38 | + * Computes the optimal configuration of voting nodes in the cluster. |
| 39 | + */ |
| 40 | +public class Reconfigurator extends AbstractComponent { |
| 41 | + |
| 42 | + /** |
| 43 | + * The cluster usually requires a vote from at least half of the master nodes in order to commit a cluster state update, and to achieve |
| 44 | + * this it makes automatic adjustments to the quorum size as master nodes join or leave the cluster. However, if master nodes leave the |
| 45 | + * cluster slowly enough then these automatic adjustments can end up with a single master node; if this last node were to fail then the |
| 46 | + * cluster would be rendered permanently unavailable. Instead it may be preferable to stop processing cluster state updates and become |
| 47 | + * unavailable when the second-last (more generally, n'th-last) node leaves the cluster, so that the cluster is never in a situation |
| 48 | + * where a single node's failure can cause permanent unavailability. This setting determines the size of the smallest set of master |
| 49 | + * nodes required to process a cluster state update. |
| 50 | + */ |
| 51 | + public static final Setting<Integer> CLUSTER_MASTER_NODES_FAILURE_TOLERANCE = |
| 52 | + Setting.intSetting("cluster.master_nodes_failure_tolerance", 0, 0, Property.NodeScope, Property.Dynamic); |
| 53 | + // the default is not supposed to be important since we expect to set this setting explicitly at bootstrapping time |
| 54 | + // TODO contemplate setting the default to something larger than 0 (1? 1<<30?) |
| 55 | + |
| 56 | + private volatile int masterNodesFailureTolerance; |
| 57 | + |
| 58 | + public Reconfigurator(Settings settings, ClusterSettings clusterSettings) { |
| 59 | + super(settings); |
| 60 | + masterNodesFailureTolerance = CLUSTER_MASTER_NODES_FAILURE_TOLERANCE.get(settings); |
| 61 | + clusterSettings.addSettingsUpdateConsumer(CLUSTER_MASTER_NODES_FAILURE_TOLERANCE, this::setMasterNodesFailureTolerance); |
| 62 | + } |
| 63 | + |
| 64 | + public void setMasterNodesFailureTolerance(int masterNodesFailureTolerance) { |
| 65 | + this.masterNodesFailureTolerance = masterNodesFailureTolerance; |
| 66 | + } |
| 67 | + |
| 68 | + private static int roundDownToOdd(int size) { |
| 69 | + return size - (size % 2 == 0 ? 1 : 0); |
| 70 | + } |
| 71 | + |
| 72 | + @Override |
| 73 | + public String toString() { |
| 74 | + return "Reconfigurator{" + |
| 75 | + "masterNodesFailureTolerance=" + masterNodesFailureTolerance + |
| 76 | + '}'; |
| 77 | + } |
| 78 | + |
| 79 | + /** |
| 80 | + * Compute an optimal configuration for the cluster. |
| 81 | + * |
| 82 | + * @param liveNodes The live nodes in the cluster. The optimal configuration prefers live nodes over non-live nodes as far as |
| 83 | + * possible. |
| 84 | + * @param retiredNodeIds Nodes that are leaving the cluster and which should not appear in the configuration if possible. Nodes that are |
| 85 | + * retired and not in the current configuration will never appear in the resulting configuration; this is useful |
| 86 | + * for shifting the vote in a 2-node cluster so one of the nodes can be restarted without harming availability. |
| 87 | + * @param currentConfig The current configuration. As far as possible, we prefer to keep the current config as-is. |
| 88 | + * @return An optimal configuration, or leave the current configuration unchanged if the optimal configuration has no live quorum. |
| 89 | + */ |
| 90 | + public ClusterState.VotingConfiguration reconfigure(Set<DiscoveryNode> liveNodes, Set<String> retiredNodeIds, |
| 91 | + ClusterState.VotingConfiguration currentConfig) { |
| 92 | + logger.trace("{} reconfiguring {} based on liveNodes={}, retiredNodeIds={}", this, currentConfig, liveNodes, retiredNodeIds); |
| 93 | + |
| 94 | + final int safeConfigurationSize = 2 * masterNodesFailureTolerance + 1; |
| 95 | + if (currentConfig.getNodeIds().size() < safeConfigurationSize) { |
| 96 | + throw new AssertionError(currentConfig + " is smaller than expected " + safeConfigurationSize); |
| 97 | + } |
| 98 | + |
| 99 | + /* |
| 100 | + * There are three true/false properties of each node in play: live/non-live, retired/non-retired and in-config/not-in-config. |
| 101 | + * Firstly we divide the nodes into disjoint sets based on these properties: |
| 102 | + * |
| 103 | + * - retiredInConfigNotLiveIds |
| 104 | + * - nonRetiredInConfigNotLiveIds |
| 105 | + * - retiredInConfigLiveIds |
| 106 | + * - nonRetiredInConfigLiveIds |
| 107 | + * - nonRetiredLiveNotInConfigIds |
| 108 | + * |
| 109 | + * The other 3 possibilities are not relevant: |
| 110 | + * - retired, not-in-config, live -- cannot add a retired node back to the config |
| 111 | + * - retired, not-in-config, non-live -- cannot add a retired node back to the config |
| 112 | + * - non-retired, non-live, not-in-config -- no evidence this node exists at all |
| 113 | + */ |
| 114 | + |
| 115 | + final Set<String> liveNodeIds = liveNodes.stream() |
| 116 | + .filter(DiscoveryNode::isMasterNode).map(DiscoveryNode::getId).collect(Collectors.toSet()); |
| 117 | + final Set<String> liveInConfigIds = new TreeSet<>(currentConfig.getNodeIds()); |
| 118 | + liveInConfigIds.retainAll(liveNodeIds); |
| 119 | + |
| 120 | + final Set<String> inConfigNotLiveIds = Sets.sortedDifference(currentConfig.getNodeIds(), liveInConfigIds); |
| 121 | + final Set<String> retiredInConfigNotLiveIds = new TreeSet<>(inConfigNotLiveIds); |
| 122 | + retiredInConfigNotLiveIds.retainAll(retiredNodeIds); |
| 123 | + final Set<String> nonRetiredInConfigNotLiveIds = new TreeSet<>(inConfigNotLiveIds); |
| 124 | + nonRetiredInConfigNotLiveIds.removeAll(retiredInConfigNotLiveIds); |
| 125 | + |
| 126 | + final Set<String> retiredInConfigLiveIds = new TreeSet<>(liveInConfigIds); |
| 127 | + retiredInConfigLiveIds.retainAll(retiredNodeIds); |
| 128 | + final Set<String> nonRetiredInConfigLiveIds = new TreeSet<>(liveInConfigIds); |
| 129 | + nonRetiredInConfigLiveIds.removeAll(retiredInConfigLiveIds); |
| 130 | + |
| 131 | + final Set<String> nonRetiredLiveNotInConfigIds = Sets.sortedDifference(liveNodeIds, currentConfig.getNodeIds()); |
| 132 | + nonRetiredLiveNotInConfigIds.removeAll(retiredNodeIds); |
| 133 | + |
| 134 | + /* |
| 135 | + * Now we work out how many nodes should be in the configuration: |
| 136 | + */ |
| 137 | + |
| 138 | + // ideally we want the configuration to be all the non-retired live nodes ... |
| 139 | + final int nonRetiredLiveNodeCount = nonRetiredInConfigLiveIds.size() + nonRetiredLiveNotInConfigIds.size(); |
| 140 | + |
| 141 | + // ... except one, if even, because odd configurations are slightly more resilient ... |
| 142 | + final int votingNodeCount = roundDownToOdd(nonRetiredLiveNodeCount); |
| 143 | + |
| 144 | + // ... except that the new configuration must satisfy CLUSTER_MASTER_NODES_FAILURE_TOLERANCE too: |
| 145 | + final int targetSize = Math.max(votingNodeCount, safeConfigurationSize); |
| 146 | + |
| 147 | + /* |
| 148 | + * The new configuration is formed by taking this many nodes in the following preference order: |
| 149 | + */ |
| 150 | + final ClusterState.VotingConfiguration newConfig = new ClusterState.VotingConfiguration( |
| 151 | + Stream.of(nonRetiredInConfigLiveIds, nonRetiredLiveNotInConfigIds, // live nodes first, preferring the current config |
| 152 | + retiredInConfigLiveIds, // if we need more, first use retired nodes that are still alive and haven't been removed yet |
| 153 | + nonRetiredInConfigNotLiveIds, retiredInConfigNotLiveIds) // if we need more, use non-live nodes |
| 154 | + .flatMap(Collection::stream).limit(targetSize).collect(Collectors.toSet())); |
| 155 | + |
| 156 | + if (newConfig.hasQuorum(liveNodeIds)) { |
| 157 | + return newConfig; |
| 158 | + } else { |
| 159 | + // If there are not enough live nodes to form a quorum in the newly-proposed configuration, it's better to do nothing. |
| 160 | + return currentConfig; |
| 161 | + } |
| 162 | + } |
| 163 | +} |
0 commit comments