diff --git a/docs/changelog/89219.yaml b/docs/changelog/89219.yaml new file mode 100644 index 0000000000000..010c1d056ea1c --- /dev/null +++ b/docs/changelog/89219.yaml @@ -0,0 +1,6 @@ +pr: 89219 +summary: Adding a check to the master stability health API when there is no master + and the current node is not master eligible +area: Health +type: enhancement +issues: [] diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsServiceIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsServiceIT.java index 4cc4589d71350..93878f8b66fc7 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsServiceIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsServiceIT.java @@ -35,6 +35,7 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; +import static org.hamcrest.Matchers.anyOf; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.emptyOrNullString; import static org.hamcrest.Matchers.equalTo; @@ -199,8 +200,7 @@ public void testNoQuorumSeenFromNonMasterNodes() throws Exception { .put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(1, TimeUnit.SECONDS)) .build() ); - internalCluster().getInstances(CoordinationDiagnosticsService.class) - .forEach(coordinationDiagnosticsService -> CoordinationDiagnosticsService.remoteRequestInitialDelay = TimeValue.ZERO); + CoordinationDiagnosticsService.remoteRequestInitialDelay = TimeValue.ZERO; ensureStableCluster(5); String firstMasterNode = internalCluster().getMasterName(); List nonActiveMasterNodes = masterNodes.stream().filter(nodeName -> firstMasterNode.equals(nodeName) == false).toList(); @@ -266,6 +266,7 @@ public boolean validateClusterForming() { CoordinationDiagnosticsService.class, randomMasterNodeName ); + CoordinationDiagnosticsService.CoordinationDiagnosticsResult result = diagnosticsOnMasterEligibleNode.diagnoseMasterStability( true ); @@ -293,4 +294,76 @@ public boolean validateClusterForming() { internalCluster().stopNode(dataNodeName); // This is needed for the test to clean itself up happily } } + + public void testNoQuorum() throws Exception { + /* + * In this test we have three master-eligible nodes and two data-only nodes. We make it so that the two non-active + * master-eligible nodes cannot communicate with each other but can each communicate with one data-only node, and then we + * stop the active master node. Now there is no quorum so a new master cannot be elected. We set the master lookup threshold very + * low on the data nodes, so when we run the master stability check on each of the master nodes, it will see that there has been no + * master recently and because there is no quorum, so it returns a RED status. We also check that each of the data-only nodes + * reports a RED status because there is no quorum (having polled that result from the master-eligible node it can communicate + * with). + */ + CoordinationDiagnosticsService.remoteRequestInitialDelay = TimeValue.ZERO; + var settings = Settings.builder() + .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s") + .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s") + .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1) + .put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), TimeValue.ZERO) + .put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(1, TimeUnit.SECONDS)) + .build(); + var masterNodes = internalCluster().startMasterOnlyNodes(3, settings); + var dataNodes = internalCluster().startDataOnlyNodes(2, settings); + ensureStableCluster(5); + String firstMasterNode = internalCluster().getMasterName(); + List nonActiveMasterNodes = masterNodes.stream().filter(nodeName -> firstMasterNode.equals(nodeName) == false).toList(); + NetworkDisruption networkDisconnect = new NetworkDisruption( + new NetworkDisruption.TwoPartitions( + Set.of(nonActiveMasterNodes.get(0), dataNodes.get(0)), + Set.of(nonActiveMasterNodes.get(1), dataNodes.get(1)) + ), + NetworkDisruption.UNRESPONSIVE + ); + + internalCluster().clearDisruptionScheme(); + setDisruptionScheme(networkDisconnect); + networkDisconnect.startDisrupting(); + internalCluster().stopNode(firstMasterNode); + for (String nonActiveMasterNode : nonActiveMasterNodes) { + CoordinationDiagnosticsService diagnosticsOnMasterEligibleNode = internalCluster().getInstance( + CoordinationDiagnosticsService.class, + nonActiveMasterNode + ); + assertBusy(() -> { + CoordinationDiagnosticsService.CoordinationDiagnosticsResult result = diagnosticsOnMasterEligibleNode + .diagnoseMasterStability(true); + assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED)); + assertThat( + result.summary(), + anyOf( + containsString("the master eligible nodes are unable to form a quorum"), + containsString("the cause has not been determined.") + ) + ); + }); + } + for (String dataNode : dataNodes) { + CoordinationDiagnosticsService diagnosticsOnDataNode = internalCluster().getInstance( + CoordinationDiagnosticsService.class, + dataNode + ); + assertBusy(() -> { + CoordinationDiagnosticsService.CoordinationDiagnosticsResult result = diagnosticsOnDataNode.diagnoseMasterStability(true); + assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED)); + assertThat( + result.summary(), + anyOf( + containsString("the master eligible nodes are unable to form a quorum"), + containsString("the cause has not been determined.") + ) + ); + }); + } + } } diff --git a/server/src/internalClusterTest/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java b/server/src/internalClusterTest/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java index d83712dde30da..b616da1d0735e 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java @@ -64,7 +64,6 @@ import java.util.concurrent.TimeUnit; import static java.util.Collections.singleton; -import static org.hamcrest.Matchers.anyOf; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -564,48 +563,4 @@ public void testCannotJoinLeader() throws Exception { containsString("has been elected master, but the node being queried") ); } - - public void testNoQuorum() throws Exception { - /* - * In this test we have three master-eligible nodes. We make it so that the two non-active ones cannot communicate, and then we - * stop the active master node. Now there is no quorum so a new master cannot be elected. We set the master lookup threshold very - * low on the data nodes, so when we run the master stability check on each of the master nodes, it will see that there has been no - * master recently and because there is no quorum, so it returns a RED status. - */ - var settings = Settings.builder() - .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s") - .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s") - .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1) - .put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), TimeValue.ZERO) - .put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(1, TimeUnit.SECONDS)) - .build(); - var masterNodes = internalCluster().startMasterOnlyNodes(3, settings); - var dataNodes = internalCluster().startDataOnlyNodes(2, settings); - ensureStableCluster(5); - String firstMasterNode = internalCluster().getMasterName(); - List nonActiveMasterNodes = masterNodes.stream().filter(nodeName -> firstMasterNode.equals(nodeName) == false).toList(); - NetworkDisruption networkDisconnect = new NetworkDisruption( - new NetworkDisruption.TwoPartitions( - Set.of(nonActiveMasterNodes.get(0), dataNodes.get(0)), - Set.of(nonActiveMasterNodes.get(1), dataNodes.get(1)) - ), - NetworkDisruption.UNRESPONSIVE - ); - - internalCluster().clearDisruptionScheme(); - setDisruptionScheme(networkDisconnect); - networkDisconnect.startDisrupting(); - internalCluster().stopNode(firstMasterNode); - for (String nonActiveMasterNode : nonActiveMasterNodes) { - assertMasterStability( - internalCluster().client(nonActiveMasterNode), - HealthStatus.RED, - anyOf( - containsString("unable to form a quorum"), - containsString("No master node observed in the last 1s, and the cause has not been determined.") - // later happens if master node has not replied within 1s - ) - ); - } - } } diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsService.java b/server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsService.java index b474cb67772a5..815b1234c8e37 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsService.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsService.java @@ -370,11 +370,12 @@ private CoordinationDiagnosticsResult diagnoseOnHaveNotSeenMasterRecently(Master DiscoveryNode currentMaster = coordinator.getPeerFinder().getLeader().get(); result = getResultOnCannotJoinLeader(localMasterHistory, currentMaster, explain); } else if (isLocalNodeMasterEligible == false) { // none is elected master and we aren't master eligible - // NOTE: The logic in this block will be implemented in a future PR - result = new CoordinationDiagnosticsResult( - CoordinationDiagnosticsStatus.RED, - "No master has been observed recently", - CoordinationDiagnosticsDetails.EMPTY + result = diagnoseOnHaveNotSeenMasterRecentlyAndWeAreNotMasterEligible( + localMasterHistory, + coordinator, + nodeHasMasterLookupTimeframe, + remoteCoordinationDiagnosisResult, + explain ); } else { // none is elected master and we are master eligible result = diagnoseOnHaveNotSeenMasterRecentlyAndWeAreMasterEligible( @@ -389,6 +390,91 @@ private CoordinationDiagnosticsResult diagnoseOnHaveNotSeenMasterRecently(Master return result; } + /** + * This method handles the case when we have not had an elected master node recently, and we are on a node that is not + * master-eligible. In this case we reach out to some master-eligible node in order to see what it knows about master stability. + * @param localMasterHistory The master history, as seen from this node + * @param coordinator The Coordinator for this node + * @param nodeHasMasterLookupTimeframe The value of health.master_history.has_master_lookup_timeframe + * @param remoteCoordinationDiagnosisResult A reference to the result of polling a master-eligible node for diagnostic information + * @param explain If true, details are returned + * @return A CoordinationDiagnosticsResult that will be determined by the CoordinationDiagnosticsResult returned by the remote + * master-eligible node + */ + static CoordinationDiagnosticsResult diagnoseOnHaveNotSeenMasterRecentlyAndWeAreNotMasterEligible( + MasterHistory localMasterHistory, + Coordinator coordinator, + TimeValue nodeHasMasterLookupTimeframe, + AtomicReference remoteCoordinationDiagnosisResult, + boolean explain + ) { + RemoteMasterHealthResult remoteResultOrException = remoteCoordinationDiagnosisResult == null + ? null + : remoteCoordinationDiagnosisResult.get(); + final CoordinationDiagnosticsStatus status; + final String summary; + final CoordinationDiagnosticsDetails details; + if (remoteResultOrException == null) { + status = CoordinationDiagnosticsStatus.RED; + summary = String.format( + Locale.ROOT, + "No master node observed in the last %s, and this node is not master eligible. Reaching out to a master-eligible node" + + " for more information", + nodeHasMasterLookupTimeframe + ); + if (explain) { + details = getDetails( + true, + localMasterHistory, + null, + Map.of(coordinator.getLocalNode().getId(), coordinator.getClusterFormationState().getDescription()) + ); + } else { + details = CoordinationDiagnosticsDetails.EMPTY; + } + } else { + DiscoveryNode remoteNode = remoteResultOrException.node; + CoordinationDiagnosticsResult remoteResult = remoteResultOrException.result; + Exception exception = remoteResultOrException.remoteException; + if (remoteResult != null) { + if (remoteResult.status().equals(CoordinationDiagnosticsStatus.GREEN) == false) { + status = remoteResult.status(); + summary = remoteResult.summary(); + } else { + status = CoordinationDiagnosticsStatus.RED; + summary = String.format( + Locale.ROOT, + "No master node observed in the last %s from this node, but %s reports that the status is GREEN. This " + + "indicates that there is a discovery problem on %s", + nodeHasMasterLookupTimeframe, + remoteNode.getName(), + coordinator.getLocalNode().getName() + ); + } + if (explain) { + details = remoteResult.details(); + } else { + details = CoordinationDiagnosticsDetails.EMPTY; + } + } else { + status = CoordinationDiagnosticsStatus.RED; + summary = String.format( + Locale.ROOT, + "No master node observed in the last %s from this node, and received an exception while reaching out to %s for " + + "diagnosis", + nodeHasMasterLookupTimeframe, + remoteNode.getName() + ); + if (explain) { + details = getDetails(true, localMasterHistory, exception, null); + } else { + details = CoordinationDiagnosticsDetails.EMPTY; + } + } + } + return new CoordinationDiagnosticsResult(status, summary, details); + } + /** * This method handles the case when we have not had an elected master node recently, and we are on a master-eligible node. In this * case we look at the cluster formation information from all master-eligible nodes, trying to understand if we have a discovery @@ -1214,5 +1300,14 @@ public void writeTo(StreamOutput out) throws IOException { } // Non-private for testing: - record RemoteMasterHealthResult(DiscoveryNode node, CoordinationDiagnosticsResult result, Exception remoteException) {} + record RemoteMasterHealthResult(DiscoveryNode node, CoordinationDiagnosticsResult result, Exception remoteException) { + public RemoteMasterHealthResult { + if (node == null) { + throw new IllegalArgumentException("Node cannot be null"); + } + if (result == null && remoteException == null) { + throw new IllegalArgumentException("Must provide a non-null value for one of result or remoteException"); + } + } + } } diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsServiceTests.java index 4205fd7b97099..4fe654dd82025 100644 --- a/server/src/test/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsServiceTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsServiceTests.java @@ -48,6 +48,7 @@ import static org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.EXTREME_DELAY_VARIABILITY; import static org.elasticsearch.cluster.coordination.CoordinationDiagnosticsService.ClusterFormationStateOrException; +import static org.elasticsearch.cluster.coordination.CoordinationDiagnosticsService.CoordinationDiagnosticsStatus; import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.emptyOrNullString; @@ -496,6 +497,84 @@ public void testRedForDiscoveryProblems() { } } + public void testRedForNoMasterQueryingNonMaster() { + /* + * This test simulates a cluster with 3 master-eligible nodes and two data nodes. It disconnects all master-eligible nodes + * except one random one, and then asserts that we get the expected response from calling diagnoseMasterStability() on each of + * the data nodes. It then sets various values for + * remoteCoordinationDiagnosisResult on each of the non-master-eligible nodes (simulating different + * responses from a master-eligible node that it has polled), and then asserts that the correct result comes back from + * diagnoseMasterStability(). + */ + try (Cluster cluster = new Cluster(3, true, Settings.EMPTY)) { + createAndAddNonMasterNode(cluster); + createAndAddNonMasterNode(cluster); + cluster.runRandomly(false, true, EXTREME_DELAY_VARIABILITY); + cluster.stabilise(); + DiscoveryNode nonKilledMasterNode = cluster.getAnyLeader().getLocalNode(); + for (Cluster.ClusterNode node : cluster.clusterNodes) { + if (node.getLocalNode().isMasterNode() && node.getLocalNode().equals(nonKilledMasterNode) == false) { + node.disconnect(); + } + } + cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master"); + for (Cluster.ClusterNode node : cluster.clusterNodes.stream() + .filter(node -> node.getLocalNode().isMasterNode() == false) + .toList()) { + CoordinationDiagnosticsService.CoordinationDiagnosticsResult healthIndicatorResult = node.coordinationDiagnosticsService + .diagnoseMasterStability(true); + assertThat(healthIndicatorResult.status(), equalTo(CoordinationDiagnosticsStatus.RED)); + String summary = healthIndicatorResult.summary(); + assertThat( + summary, + containsString("No master node observed in the last 30s, and the master eligible nodes are unable to form a quorum") + ); + CoordinationDiagnosticsStatus artificialRemoteStatus = randomValueOtherThan( + CoordinationDiagnosticsStatus.GREEN, + () -> randomFrom(CoordinationDiagnosticsStatus.values()) + ); + String artificialRemoteStatusSummary = "Artificial failure"; + CoordinationDiagnosticsService.CoordinationDiagnosticsResult artificialRemoteResult = + new CoordinationDiagnosticsService.CoordinationDiagnosticsResult( + artificialRemoteStatus, + artificialRemoteStatusSummary, + null + ); + node.coordinationDiagnosticsService.remoteCoordinationDiagnosisResult = new AtomicReference<>( + new CoordinationDiagnosticsService.RemoteMasterHealthResult(nonKilledMasterNode, artificialRemoteResult, null) + ); + healthIndicatorResult = node.coordinationDiagnosticsService.diagnoseMasterStability(true); + assertThat(healthIndicatorResult.status(), equalTo(artificialRemoteStatus)); + assertThat(healthIndicatorResult.summary(), containsString(artificialRemoteStatusSummary)); + + artificialRemoteResult = new CoordinationDiagnosticsService.CoordinationDiagnosticsResult( + CoordinationDiagnosticsStatus.GREEN, + artificialRemoteStatusSummary, + null + ); + node.coordinationDiagnosticsService.remoteCoordinationDiagnosisResult = new AtomicReference<>( + new CoordinationDiagnosticsService.RemoteMasterHealthResult(nonKilledMasterNode, artificialRemoteResult, null) + ); + healthIndicatorResult = node.coordinationDiagnosticsService.diagnoseMasterStability(true); + assertThat(healthIndicatorResult.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED)); + assertThat(healthIndicatorResult.summary(), containsString("reports that the status is GREEN")); + + Exception artificialRemoteResultException = new RuntimeException(artificialRemoteStatusSummary); + node.coordinationDiagnosticsService.remoteCoordinationDiagnosisResult = new AtomicReference<>( + new CoordinationDiagnosticsService.RemoteMasterHealthResult(nonKilledMasterNode, null, artificialRemoteResultException) + ); + healthIndicatorResult = node.coordinationDiagnosticsService.diagnoseMasterStability(true); + assertThat(healthIndicatorResult.status(), equalTo(CoordinationDiagnosticsStatus.RED)); + assertThat(healthIndicatorResult.summary(), containsString("received an exception")); + } + + while (cluster.clusterNodes.stream().anyMatch(Cluster.ClusterNode::deliverBlackholedRequests)) { + logger.debug("--> stabilising again after delivering blackholed requests"); + cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master"); + } + } + } + public void testYellowWithTooManyMasterChanges() { testChangeMasterThreeTimes(2, 100, "The elected master node has changed"); } @@ -1064,6 +1143,18 @@ public void testBeginPollingRemoteMasterStabilityDiagnosticCancel() { } } + public void testRemoteMasterHealthResult() { + expectThrows(IllegalArgumentException.class, () -> new CoordinationDiagnosticsService.RemoteMasterHealthResult(null, null, null)); + expectThrows( + IllegalArgumentException.class, + () -> new CoordinationDiagnosticsService.RemoteMasterHealthResult(null, null, new RuntimeException()) + ); + expectThrows( + IllegalArgumentException.class, + () -> new CoordinationDiagnosticsService.RemoteMasterHealthResult(mock(DiscoveryNode.class), null, null) + ); + } + public void testResultSerialization() { CoordinationDiagnosticsService.CoordinationDiagnosticsStatus status = getRandomStatus(); CoordinationDiagnosticsService.CoordinationDiagnosticsDetails details = getRandomDetails(); diff --git a/test/framework/src/main/java/org/elasticsearch/cluster/coordination/AbstractCoordinatorTestCase.java b/test/framework/src/main/java/org/elasticsearch/cluster/coordination/AbstractCoordinatorTestCase.java index 971cf9c57d484..49664761f7897 100644 --- a/test/framework/src/main/java/org/elasticsearch/cluster/coordination/AbstractCoordinatorTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/cluster/coordination/AbstractCoordinatorTestCase.java @@ -1294,6 +1294,7 @@ public RecyclerBytesStreamOutput newNetworkBytesStream() { coordinator.start(); gatewayService.start(); clusterService.start(); + coordinationDiagnosticsService.start(); coordinator.startInitialJoin(); }