Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding logic to master_is_stable indicator to check for discovery problems #88020

Merged
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
422f723
Adding logic to master_is_stable indicator to check for discovery pro…
masseyke Jun 23, 2022
6e21479
Merge branch 'master' into feature/health-api-master-stability-discovery
masseyke Jun 28, 2022
01f44a1
improved indicator summaries and better unit test
masseyke Jun 28, 2022
22c649b
cleaning up and unit testing
masseyke Jun 28, 2022
89a44c9
getting rid of unneeded getDiscoveryProblems and getQuorumProblems co…
masseyke Jun 29, 2022
5060780
commenting about nodeToClusterFormationStateOrExceptionMap
masseyke Jun 29, 2022
6ddbf08
merging master
masseyke Jun 29, 2022
942385b
spotlessApply
masseyke Jun 29, 2022
342ce84
Adding an integration test for no quorum
masseyke Jun 30, 2022
1b633e6
Merge branch 'master' into feature/health-api-master-stability-discovery
masseyke Jun 30, 2022
f2c4d3f
Update docs/changelog/88020.yaml
masseyke Jun 30, 2022
30d68a8
code review feedback
masseyke Jul 5, 2022
b9d4852
Merge branch 'feature/health-api-master-stability-discovery' of githu…
masseyke Jul 5, 2022
940d104
avoiding forbidden API
masseyke Jul 5, 2022
3e5bc95
code review feedback
masseyke Jul 6, 2022
6e2ea39
adding a unit test for beginPollingClusterFormationInfo
masseyke Jul 7, 2022
870f79d
improving beginPollingClusterFormationInfo()
masseyke Jul 8, 2022
fc3dfd3
more unit testing
masseyke Jul 11, 2022
462c892
Merge branch 'master' into feature/health-api-master-stability-discovery
elasticmachine Jul 12, 2022
7455e3d
adding unit tests
masseyke Jul 12, 2022
d5a7423
Merge branch 'feature/health-api-master-stability-discovery' of githu…
masseyke Jul 12, 2022
05e83fb
merging master
masseyke Jul 14, 2022
f1b17b0
documenting and improving unit tests
masseyke Jul 14, 2022
e078f64
minor cleanup
masseyke Jul 14, 2022
e214e4a
minor doc cleanup
masseyke Jul 14, 2022
aeef0ed
fixing a unit test
masseyke Jul 18, 2022
93c6f5f
Putting cluster formation info into the details
masseyke Jul 25, 2022
8e2f48f
merging main
masseyke Jul 25, 2022
07939f0
code review feedback
masseyke Jul 26, 2022
3a0b817
Using node name instead of id
masseyke Jul 27, 2022
2e6e53d
reverting previous commit
masseyke Jul 27, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/88020.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 88020
summary: Adding logic to `master_is_stable` indicator to check for discovery problems
area: Health
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.elasticsearch.xcontent.ToXContentObject;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.json.JsonXContent;
import org.junit.Before;

import java.io.IOException;
import java.util.ArrayList;
Expand All @@ -69,9 +70,14 @@
* Tests relating to the loss of the master, but which work with the default fault detection settings which are rather lenient and will
* not detect a master failure too quickly.
*/
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0)
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, autoManageMasterNodes = false)
public class StableMasterDisruptionIT extends ESIntegTestCase {

/**
 * Runs before each test and sets the internal test cluster's bootstrap master node index to 0.
 * NOTE(review): presumably required because the class is annotated with {@code autoManageMasterNodes = false}, which would leave
 * the choice of bootstrapping node to the test itself -- confirm against the test cluster implementation.
 */
@Before
private void setBootstrapMasterNodeIndex() {
internalCluster().setBootstrapMasterNodeIndex(0);
}

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Collections.singletonList(MockTransportService.TestPlugin.class);
Expand Down Expand Up @@ -560,4 +566,51 @@ public void testCannotJoinLeader() throws Exception {
"has been elected master, but the node being queried"
);
}

/**
 * Checks that the master stability indicator goes RED with an "unable to form a quorum" message when the elected master has been
 * stopped and the two surviving master-eligible nodes have been partitioned away from each other.
 */
public void testNoQuorum() throws Exception {
    // All five nodes (three master-only, two data-only) are started with identical settings: short leader-check and publish
    // timeouts, a no-master-transitions threshold of 1, and a one-second "node has master" lookup timeframe.
    final Settings sharedNodeSettings = Settings.builder()
        .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
        .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
        .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
        .put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), TimeValue.ZERO)
        .put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(1, TimeUnit.SECONDS))
        .build();
    final List<String> masterEligibleNames = internalCluster().startMasterOnlyNodes(3, sharedNodeSettings);
    final List<String> dataNodeNames = internalCluster().startDataOnlyNodes(2, sharedNodeSettings);
    ensureStableCluster(5);

    // Work out which two master-eligible nodes are NOT the currently elected master.
    final String electedMaster = internalCluster().getMasterName();
    final List<String> standbyMasters = new ArrayList<>();
    for (String candidate : masterEligibleNames) {
        if (electedMaster.equals(candidate) == false) {
            standbyMasters.add(candidate);
        }
    }

    // Each side of the partition holds one standby master and one data node, so the two standby masters cannot talk to each other.
    final NetworkDisruption.TwoPartitions splitStandbys = new NetworkDisruption.TwoPartitions(
        Set.of(standbyMasters.get(0), dataNodeNames.get(0)),
        Set.of(standbyMasters.get(1), dataNodeNames.get(1))
    );
    final NetworkDisruption partitionDisruption = new NetworkDisruption(splitStandbys, NetworkDisruption.UNRESPONSIVE);

    internalCluster().clearDisruptionScheme();
    setDisruptionScheme(partitionDisruption);
    partitionDisruption.startDisrupting();

    // With the elected master gone and the remaining two isolated from one another, no quorum can be formed and no new master
    // can be elected; each standby master is expected to report RED.
    internalCluster().stopNode(electedMaster);
    for (String standbyMaster : standbyMasters) {
        assertMasterStability(internalCluster().client(standbyMaster), HealthStatus.RED, "unable to form a quorum");
    }
}
}
Loading