diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java index f169442e9e20f..2e98195da4314 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java @@ -380,9 +380,8 @@ private void startElection() { // The preVoteCollector is only active while we are candidate, but it does not call this method with synchronisation, so we have // to check our mode again here. if (mode == Mode.CANDIDATE) { - if (electionQuorumContainsLocalNode(getLastAcceptedState()) == false) { - logger.trace("skip election as local node is not part of election quorum: {}", - getLastAcceptedState().coordinationMetaData()); + if (localNodeMayWinElection(getLastAcceptedState()) == false) { + logger.trace("skip election as local node may not win it: {}", getLastAcceptedState().coordinationMetaData()); return; } @@ -415,16 +414,17 @@ private void abdicateTo(DiscoveryNode newMaster) { becomeCandidate("after abdicating to " + newMaster); } - private static boolean electionQuorumContainsLocalNode(ClusterState lastAcceptedState) { + private static boolean localNodeMayWinElection(ClusterState lastAcceptedState) { final DiscoveryNode localNode = lastAcceptedState.nodes().getLocalNode(); assert localNode != null; - return electionQuorumContains(lastAcceptedState, localNode); + return nodeMayWinElection(lastAcceptedState, localNode); } - private static boolean electionQuorumContains(ClusterState lastAcceptedState, DiscoveryNode node) { + private static boolean nodeMayWinElection(ClusterState lastAcceptedState, DiscoveryNode node) { final String nodeId = node.getId(); return lastAcceptedState.getLastCommittedConfiguration().getNodeIds().contains(nodeId) - || lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId); + || lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId) + || lastAcceptedState.getVotingConfigExclusions().stream().noneMatch(vce -> vce.getNodeId().equals(nodeId)); } private Optional ensureTermAtLeast(DiscoveryNode sourceNode, long targetTerm) { @@ -867,8 +867,8 @@ public boolean setInitialConfiguration(final VotingConfiguration votingConfigura metaDataBuilder.coordinationMetaData(coordinationMetaData); coordinationState.get().setInitialState(ClusterState.builder(currentState).metaData(metaDataBuilder).build()); - assert electionQuorumContainsLocalNode(getLastAcceptedState()) : - "initial state does not have local node in its election quorum: " + getLastAcceptedState().coordinationMetaData(); + assert localNodeMayWinElection(getLastAcceptedState()) : + "initial state does not allow local node to win election: " + getLastAcceptedState().coordinationMetaData(); preVoteCollector.update(getPreVoteResponse(), null); // pick up the change to last-accepted version startElectionScheduler(); return true; @@ -1164,8 +1164,8 @@ public void run() { if (mode == Mode.CANDIDATE) { final ClusterState lastAcceptedState = coordinationState.get().getLastAcceptedState(); - if (electionQuorumContainsLocalNode(lastAcceptedState) == false) { - logger.trace("skip prevoting as local node is not part of election quorum: {}", + if (localNodeMayWinElection(lastAcceptedState) == false) { + logger.trace("skip prevoting as local node may not win election: {}", lastAcceptedState.coordinationMetaData()); return; } @@ -1329,16 +1329,20 @@ public void onSuccess(String source) { updateMaxTermSeen(getCurrentTerm()); if (mode == Mode.LEADER) { + // if necessary, abdicate to another node or improve the voting configuration + boolean attemptReconfiguration = true; final ClusterState state = getLastAcceptedState(); // committed state - if (electionQuorumContainsLocalNode(state) == false) { + if (localNodeMayWinElection(state) == false) { final List masterCandidates = completedNodes().stream() .filter(DiscoveryNode::isMasterNode) - .filter(node -> electionQuorumContains(state, node)) + .filter(node -> nodeMayWinElection(state, node)) .collect(Collectors.toList()); if (masterCandidates.isEmpty() == false) { abdicateTo(masterCandidates.get(random.nextInt(masterCandidates.size()))); + attemptReconfiguration = false; } - } else { + } + if (attemptReconfiguration) { scheduleReconfigurationIfNeeded(); } } diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java index 7a3a54d73b2fe..fd9bab3af11ec 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Reconfigurator.java @@ -150,6 +150,11 @@ static class VotingConfigNode implements Comparable { @Override public int compareTo(VotingConfigNode other) { + // prefer current master + final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster); + if (currentMasterComp != 0) { + return currentMasterComp; + } // prefer nodes that are live final int liveComp = Boolean.compare(other.live, live); if (liveComp != 0) { @@ -160,11 +165,6 @@ public int compareTo(VotingConfigNode other) { if (inCurrentConfigComp != 0) { return inCurrentConfigComp; } - // prefer current master - final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster); - if (currentMasterComp != 0) { - return currentMasterComp; - } // tiebreak by node id to have stable ordering return id.compareTo(other.id); } diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java index bbd9514222c77..a1d12d98398ca 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/ReconfiguratorTests.java @@ -53,7 +53,7 @@ public void testReconfigurationExamples() { check(nodes("a"), conf("a"), true, conf("a")); check(nodes("a", "b"), conf("a"), true, conf("a")); - check(nodes("a", "b"), conf("b"), true, conf("b")); + check(nodes("a", "b"), conf("b"), true, conf("a")); check(nodes("a", "b"), conf("a", "c"), true, conf("a")); check(nodes("a", "b"), conf("a", "b"), true, conf("a")); check(nodes("a", "b"), conf("a", "b", "e"), true, conf("a", "b", "e")); diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/VotingConfigurationIT.java b/server/src/test/java/org/elasticsearch/cluster/coordination/VotingConfigurationIT.java index 8c6775cb6c91e..c3c3abf8866b7 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/VotingConfigurationIT.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/VotingConfigurationIT.java @@ -18,17 +18,38 @@ */ package org.elasticsearch.cluster.coordination; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsAction; import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.common.Priority; +import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.test.transport.MockTransportService; +import org.elasticsearch.transport.TransportService; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Set; import java.util.concurrent.ExecutionException; -@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0) +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.hasItem; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.nullValue; + +@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, autoManageMasterNodes = false) public class VotingConfigurationIT extends ESIntegTestCase { + @Override + protected Collection> nodePlugins() { + return Collections.singletonList(MockTransportService.TestPlugin.class); + } + public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionException, InterruptedException { + internalCluster().setBootstrapMasterNodeIndex(0); internalCluster().startNodes(2); final String originalMaster = internalCluster().getMasterName(); @@ -38,4 +59,56 @@ public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionExcept client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get(); assertNotEquals(originalMaster, internalCluster().getMasterName()); } + + public void testElectsNodeNotInVotingConfiguration() throws Exception { + internalCluster().setBootstrapMasterNodeIndex(0); + final List nodeNames = internalCluster().startNodes(4); + + // a 4-node cluster settles on a 3-node configuration; we then prevent the nodes in the configuration from winning an election + // by failing at the pre-voting stage, so that the extra node must be elected instead when the master shuts down. This extra node + // should then add itself into the voting configuration. + + assertFalse(internalCluster().client().admin().cluster().prepareHealth() + .setWaitForNodes("4").setWaitForEvents(Priority.LANGUID).get().isTimedOut()); + + String excludedNodeName = null; + final ClusterState clusterState + = internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState(); + final Set votingConfiguration = clusterState.getLastCommittedConfiguration().getNodeIds(); + assertThat(votingConfiguration, hasSize(3)); + assertThat(clusterState.nodes().getSize(), equalTo(4)); + assertThat(votingConfiguration, hasItem(clusterState.nodes().getMasterNodeId())); + for (DiscoveryNode discoveryNode : clusterState.nodes()) { + if (votingConfiguration.contains(discoveryNode.getId()) == false) { + assertThat(excludedNodeName, nullValue()); + excludedNodeName = discoveryNode.getName(); + } + } + + for (final String sender : nodeNames) { + if (sender.equals(excludedNodeName)) { + continue; + } + final MockTransportService senderTransportService + = (MockTransportService) internalCluster().getInstance(TransportService.class, sender); + for (final String receiver : nodeNames) { + senderTransportService.addSendBehavior(internalCluster().getInstance(TransportService.class, receiver), + (connection, requestId, action, request, options) -> { + if (action.equals(PreVoteCollector.REQUEST_PRE_VOTE_ACTION_NAME)) { + throw new ElasticsearchException("rejected"); + } + connection.sendRequest(requestId, action, request, options); + }); + } + } + + internalCluster().stopCurrentMasterNode(); + assertFalse(internalCluster().client().admin().cluster().prepareHealth() + .setWaitForNodes("3").setWaitForEvents(Priority.LANGUID).get().isTimedOut()); + + final ClusterState newClusterState + = internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState(); + assertThat(newClusterState.nodes().getMasterNode().getName(), equalTo(excludedNodeName)); + assertThat(newClusterState.getLastCommittedConfiguration().getNodeIds(), hasItem(newClusterState.nodes().getMasterNodeId())); + } }