Skip to content

Commit 250973a

Browse files
committed
Fix testCannotJoinIfMasterLostDataFolder
Relates to elastic#41047
1 parent a79cd77 commit 250973a

File tree

1 file changed

+18
-5
lines changed

1 file changed

+18
-5
lines changed

server/src/test/java/org/elasticsearch/discovery/ClusterDisruptionIT.java

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.elasticsearch.cluster.ClusterState;
3232
import org.elasticsearch.cluster.action.shard.ShardStateAction;
3333
import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
34+
import org.elasticsearch.cluster.coordination.LagDetector;
3435
import org.elasticsearch.cluster.metadata.IndexMetaData;
3536
import org.elasticsearch.cluster.routing.Murmur3HashFunction;
3637
import org.elasticsearch.cluster.routing.ShardRouting;
@@ -388,7 +389,6 @@ public void onFailure(Exception e) {
388389
}
389390
}
390391

391-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/41047")
392392
public void testCannotJoinIfMasterLostDataFolder() throws Exception {
393393
String masterNode = internalCluster().startMasterOnlyNode();
394394
String dataNode = internalCluster().startDataOnlyNode();
@@ -401,7 +401,18 @@ public boolean clearData(String nodeName) {
401401

402402
@Override
403403
public Settings onNodeStopped(String nodeName) {
404-
return Settings.builder().put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName).build();
404+
return Settings.builder()
405+
.put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName)
406+
/*
407+
* the data node might join while the master is still not fully established as master just yet and bypasses the join
408+
* validation that is done before adding the node to the cluster. Only the join validation when handling the publish
409+
* request takes place, but at this point the cluster state has been successfully committed, and will subsequently be
410+
* exposed to the applier. The health check below therefore sees the cluster state with the 2 nodes and thinks all is
411+
* good, even though the data node never accepted this state. What's worse is that it takes 90 seconds for the data
412+
* node to be kicked out of the cluster (lag detection). We speed this up here.
413+
*/
414+
.put(LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.getKey(), "10s")
415+
.build();
405416
}
406417

407418
@Override
@@ -410,9 +421,11 @@ public boolean validateClusterForming() {
410421
}
411422
});
412423

413-
assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut());
414-
assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get()
415-
.isTimedOut());
424+
assertBusy(() -> {
425+
assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut());
426+
assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get()
427+
.isTimedOut());
428+
}, 30, TimeUnit.SECONDS);
416429
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(dataNode)); // otherwise we will fail during clean-up
417430
}
418431

0 commit comments

Comments (0)