3131import org .elasticsearch .cluster .ClusterState ;
3232import org .elasticsearch .cluster .action .shard .ShardStateAction ;
3333import org .elasticsearch .cluster .coordination .ClusterBootstrapService ;
34+ import org .elasticsearch .cluster .coordination .LagDetector ;
3435import org .elasticsearch .cluster .metadata .IndexMetaData ;
3536import org .elasticsearch .cluster .routing .Murmur3HashFunction ;
3637import org .elasticsearch .cluster .routing .ShardRouting ;
@@ -388,7 +389,6 @@ public void onFailure(Exception e) {
388389 }
389390 }
390391
391- @ AwaitsFix (bugUrl = "https://github.com/elastic/elasticsearch/issues/41047" )
392392 public void testCannotJoinIfMasterLostDataFolder () throws Exception {
393393 String masterNode = internalCluster ().startMasterOnlyNode ();
394394 String dataNode = internalCluster ().startDataOnlyNode ();
@@ -401,7 +401,18 @@ public boolean clearData(String nodeName) {
401401
402402 @ Override
403403 public Settings onNodeStopped (String nodeName ) {
404- return Settings .builder ().put (ClusterBootstrapService .INITIAL_MASTER_NODES_SETTING .getKey (), nodeName ).build ();
404+ return Settings .builder ()
405+ .put (ClusterBootstrapService .INITIAL_MASTER_NODES_SETTING .getKey (), nodeName )
406+ /*
407+ * The data node might join while the master is not yet fully established as master, thereby bypassing the join
408+ * validation that is done before adding a node to the cluster. Only the join validation performed when handling the
409+ * publish request takes place, but by that point the cluster state has already been successfully committed and will
410+ * subsequently be exposed to the applier. The health check below therefore sees a cluster state with 2 nodes and thinks
411+ * all is good, even though the data node never accepted this state. What's worse, it takes 90 seconds for the data
412+ * node to be kicked out of the cluster (lag detection), so we lower the lag timeout here to speed this up.
413+ */
414+ .put (LagDetector .CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING .getKey (), "10s" )
415+ .build ();
405416 }
406417
407418 @ Override
@@ -410,9 +421,11 @@ public boolean validateClusterForming() {
410421 }
411422 });
412423
413- assertFalse (internalCluster ().client (masterNode ).admin ().cluster ().prepareHealth ().get ().isTimedOut ());
414- assertTrue (internalCluster ().client (masterNode ).admin ().cluster ().prepareHealth ().setWaitForNodes ("2" ).setTimeout ("2s" ).get ()
415- .isTimedOut ());
424+ assertBusy (() -> {
425+ assertFalse (internalCluster ().client (masterNode ).admin ().cluster ().prepareHealth ().get ().isTimedOut ());
426+ assertTrue (internalCluster ().client (masterNode ).admin ().cluster ().prepareHealth ().setWaitForNodes ("2" ).setTimeout ("2s" ).get ()
427+ .isTimedOut ());
428+ }, 30 , TimeUnit .SECONDS );
416429 internalCluster ().stopRandomNode (InternalTestCluster .nameFilter (dataNode )); // otherwise we will fail during clean-up
417430 }
418431
0 commit comments