Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding a check to the master stability health API when there is no master and the current node is not master eligible #89219

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/89219.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 89219
summary: Adding a check to the master stability health API when there is no master
and the current node is not master eligible
area: Health
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.emptyOrNullString;
import static org.hamcrest.Matchers.equalTo;
Expand Down Expand Up @@ -199,8 +200,7 @@ public void testNoQuorumSeenFromNonMasterNodes() throws Exception {
.put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(1, TimeUnit.SECONDS))
.build()
);
internalCluster().getInstances(CoordinationDiagnosticsService.class)
.forEach(coordinationDiagnosticsService -> CoordinationDiagnosticsService.remoteRequestInitialDelay = TimeValue.ZERO);
CoordinationDiagnosticsService.remoteRequestInitialDelay = TimeValue.ZERO;
ensureStableCluster(5);
String firstMasterNode = internalCluster().getMasterName();
List<String> nonActiveMasterNodes = masterNodes.stream().filter(nodeName -> firstMasterNode.equals(nodeName) == false).toList();
Expand Down Expand Up @@ -266,6 +266,7 @@ public boolean validateClusterForming() {
CoordinationDiagnosticsService.class,
randomMasterNodeName
);

CoordinationDiagnosticsService.CoordinationDiagnosticsResult result = diagnosticsOnMasterEligibleNode.diagnoseMasterStability(
true
);
Expand Down Expand Up @@ -293,4 +294,76 @@ public boolean validateClusterForming() {
internalCluster().stopNode(dataNodeName); // This is needed for the test to clean itself up happily
}
}

public void testNoQuorum() throws Exception {
/*
* In this test we have three master-eligible nodes and two data-only nodes. We make it so that the two non-active
* master-eligible nodes cannot communicate with each other but can each communicate with one data-only node, and then we
* stop the active master node. Now there is no quorum so a new master cannot be elected. We set the master lookup threshold very
* low on the data nodes, so when we run the master stability check on each of the master nodes, it will see that there has been no
* master recently and because there is no quorum, so it returns a RED status. We also check that each of the data-only nodes
* reports a RED status because there is no quorum (having polled that result from the master-eligible node it can communicate
* with).
*/
CoordinationDiagnosticsService.remoteRequestInitialDelay = TimeValue.ZERO;
var settings = Settings.builder()
.put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
.put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
.put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
.put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), TimeValue.ZERO)
.put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(1, TimeUnit.SECONDS))
.build();
var masterNodes = internalCluster().startMasterOnlyNodes(3, settings);
var dataNodes = internalCluster().startDataOnlyNodes(2, settings);
ensureStableCluster(5);
String firstMasterNode = internalCluster().getMasterName();
List<String> nonActiveMasterNodes = masterNodes.stream().filter(nodeName -> firstMasterNode.equals(nodeName) == false).toList();
NetworkDisruption networkDisconnect = new NetworkDisruption(
new NetworkDisruption.TwoPartitions(
Set.of(nonActiveMasterNodes.get(0), dataNodes.get(0)),
Set.of(nonActiveMasterNodes.get(1), dataNodes.get(1))
),
NetworkDisruption.UNRESPONSIVE
);

internalCluster().clearDisruptionScheme();
setDisruptionScheme(networkDisconnect);
networkDisconnect.startDisrupting();
internalCluster().stopNode(firstMasterNode);
for (String nonActiveMasterNode : nonActiveMasterNodes) {
CoordinationDiagnosticsService diagnosticsOnMasterEligibleNode = internalCluster().getInstance(
CoordinationDiagnosticsService.class,
nonActiveMasterNode
);
assertBusy(() -> {
CoordinationDiagnosticsService.CoordinationDiagnosticsResult result = diagnosticsOnMasterEligibleNode
.diagnoseMasterStability(true);
assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
assertThat(
result.summary(),
anyOf(
containsString("the master eligible nodes are unable to form a quorum"),
containsString("the cause has not been determined.")
)
);
});
}
for (String dataNode : dataNodes) {
CoordinationDiagnosticsService diagnosticsOnDataNode = internalCluster().getInstance(
CoordinationDiagnosticsService.class,
dataNode
);
assertBusy(() -> {
CoordinationDiagnosticsService.CoordinationDiagnosticsResult result = diagnosticsOnDataNode.diagnoseMasterStability(true);
assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
assertThat(
result.summary(),
anyOf(
containsString("the master eligible nodes are unable to form a quorum"),
containsString("the cause has not been determined.")
)
);
});
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@
import java.util.concurrent.TimeUnit;

import static java.util.Collections.singleton;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;

Expand Down Expand Up @@ -564,48 +563,4 @@ public void testCannotJoinLeader() throws Exception {
containsString("has been elected master, but the node being queried")
);
}

public void testNoQuorum() throws Exception {
/*
* In this test we have three master-eligible nodes. We make it so that the two non-active ones cannot communicate, and then we
* stop the active master node. Now there is no quorum so a new master cannot be elected. We set the master lookup threshold very
* low on the data nodes, so when we run the master stability check on each of the master nodes, it will see that there has been no
* master recently and because there is no quorum, so it returns a RED status.
*/
var settings = Settings.builder()
.put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
.put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
.put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
.put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), TimeValue.ZERO)
.put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(1, TimeUnit.SECONDS))
.build();
var masterNodes = internalCluster().startMasterOnlyNodes(3, settings);
var dataNodes = internalCluster().startDataOnlyNodes(2, settings);
ensureStableCluster(5);
String firstMasterNode = internalCluster().getMasterName();
List<String> nonActiveMasterNodes = masterNodes.stream().filter(nodeName -> firstMasterNode.equals(nodeName) == false).toList();
NetworkDisruption networkDisconnect = new NetworkDisruption(
new NetworkDisruption.TwoPartitions(
Set.of(nonActiveMasterNodes.get(0), dataNodes.get(0)),
Set.of(nonActiveMasterNodes.get(1), dataNodes.get(1))
),
NetworkDisruption.UNRESPONSIVE
);

internalCluster().clearDisruptionScheme();
setDisruptionScheme(networkDisconnect);
networkDisconnect.startDisrupting();
internalCluster().stopNode(firstMasterNode);
for (String nonActiveMasterNode : nonActiveMasterNodes) {
assertMasterStability(
internalCluster().client(nonActiveMasterNode),
HealthStatus.RED,
anyOf(
containsString("unable to form a quorum"),
containsString("No master node observed in the last 1s, and the cause has not been determined.")
// later happens if master node has not replied within 1s
)
);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -370,11 +370,12 @@ private CoordinationDiagnosticsResult diagnoseOnHaveNotSeenMasterRecently(Master
DiscoveryNode currentMaster = coordinator.getPeerFinder().getLeader().get();
result = getResultOnCannotJoinLeader(localMasterHistory, currentMaster, explain);
} else if (isLocalNodeMasterEligible == false) { // none is elected master and we aren't master eligible
// NOTE: The logic in this block will be implemented in a future PR
result = new CoordinationDiagnosticsResult(
CoordinationDiagnosticsStatus.RED,
"No master has been observed recently",
CoordinationDiagnosticsDetails.EMPTY
result = diagnoseOnHaveNotSeenMasterRecentlyAndWeAreNotMasterEligible(
localMasterHistory,
coordinator,
nodeHasMasterLookupTimeframe,
remoteCoordinationDiagnosisResult,
explain
);
} else { // none is elected master and we are master eligible
result = diagnoseOnHaveNotSeenMasterRecentlyAndWeAreMasterEligible(
Expand All @@ -389,6 +390,91 @@ private CoordinationDiagnosticsResult diagnoseOnHaveNotSeenMasterRecently(Master
return result;
}

/**
* This method handles the case when we have not had an elected master node recently, and we are on a node that is not
* master-eligible. In this case we reach out to some master-eligible node in order to see what it knows about master stability.
* @param localMasterHistory The master history, as seen from this node
* @param coordinator The Coordinator for this node
* @param nodeHasMasterLookupTimeframe The value of health.master_history.has_master_lookup_timeframe
* @param remoteCoordinationDiagnosisResult A reference to the result of polling a master-eligible node for diagnostic information
* @param explain If true, details are returned
* @return A CoordinationDiagnosticsResult that will be determined by the CoordinationDiagnosticsResult returned by the remote
* master-eligible node
*/
static CoordinationDiagnosticsResult diagnoseOnHaveNotSeenMasterRecentlyAndWeAreNotMasterEligible(
MasterHistory localMasterHistory,
Coordinator coordinator,
TimeValue nodeHasMasterLookupTimeframe,
AtomicReference<RemoteMasterHealthResult> remoteCoordinationDiagnosisResult,
boolean explain
) {
RemoteMasterHealthResult remoteResultOrException = remoteCoordinationDiagnosisResult == null
? null
: remoteCoordinationDiagnosisResult.get();
final CoordinationDiagnosticsStatus status;
final String summary;
final CoordinationDiagnosticsDetails details;
if (remoteResultOrException == null) {
status = CoordinationDiagnosticsStatus.RED;
summary = String.format(
Locale.ROOT,
"No master node observed in the last %s, and this node is not master eligible. Reaching out to a master-eligible node"
+ " for more information",
nodeHasMasterLookupTimeframe
);
if (explain) {
details = getDetails(
true,
localMasterHistory,
null,
Map.of(coordinator.getLocalNode().getId(), coordinator.getClusterFormationState().getDescription())
);
} else {
details = CoordinationDiagnosticsDetails.EMPTY;
}
} else {
DiscoveryNode remoteNode = remoteResultOrException.node;
CoordinationDiagnosticsResult remoteResult = remoteResultOrException.result;
Exception exception = remoteResultOrException.remoteException;
if (remoteResult != null) {
if (remoteResult.status().equals(CoordinationDiagnosticsStatus.GREEN) == false) {
status = remoteResult.status();
summary = remoteResult.summary();
} else {
status = CoordinationDiagnosticsStatus.RED;
summary = String.format(
Locale.ROOT,
"No master node observed in the last %s from this node, but %s reports that the status is GREEN. This "
+ "indicates that there is a discovery problem on %s",
nodeHasMasterLookupTimeframe,
remoteNode.getName(),
coordinator.getLocalNode().getName()
);
}
if (explain) {
details = remoteResult.details();
} else {
details = CoordinationDiagnosticsDetails.EMPTY;
}
} else {
status = CoordinationDiagnosticsStatus.RED;
summary = String.format(
Locale.ROOT,
"No master node observed in the last %s from this node, and received an exception while reaching out to %s for "
+ "diagnosis",
nodeHasMasterLookupTimeframe,
remoteNode.getName()
);
if (explain) {
details = getDetails(true, localMasterHistory, exception, null);
} else {
details = CoordinationDiagnosticsDetails.EMPTY;
}
}
}
return new CoordinationDiagnosticsResult(status, summary, details);
}

/**
* This method handles the case when we have not had an elected master node recently, and we are on a master-eligible node. In this
* case we look at the cluster formation information from all master-eligible nodes, trying to understand if we have a discovery
Expand Down Expand Up @@ -1214,5 +1300,14 @@ public void writeTo(StreamOutput out) throws IOException {
}

// Non-private for testing:
record RemoteMasterHealthResult(DiscoveryNode node, CoordinationDiagnosticsResult result, Exception remoteException) {}
record RemoteMasterHealthResult(DiscoveryNode node, CoordinationDiagnosticsResult result, Exception remoteException) {
public RemoteMasterHealthResult {
if (node == null) {
throw new IllegalArgumentException("Node cannot be null");
}
if (result == null && remoteException == null) {
throw new IllegalArgumentException("Must provide a non-null value for one of result or remoteException");
}
}
}
}
Loading