@@ -1236,6 +1236,55 @@ public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {
12361236 }, 60L , TimeUnit .SECONDS );
12371237 }
12381238
/**
 * Starts a snapshot of a two-shard index while the repository is blocked on the data nodes,
 * restarts both data nodes one after the other, and then verifies that the snapshot reaches a
 * completed state reporting two total shards and two shard failures.
 *
 * NOTE(review): the test presumably relies on each data node holding exactly one primary of
 * "test-idx" (2 shards, 0 replicas, 2 data nodes) -- confirm against the allocation settings.
 */
1239+ public void testDataNodeRestartAfterShardSnapshotFailure () throws Exception {
1240+ logger .info ("--> starting a master node and two data nodes" );
1241+ internalCluster ().startMasterOnlyNode ();
1242+ final List <String > dataNodes = internalCluster ().startDataOnlyNodes (2 );
1243+ logger .info ("--> creating repository" );
// Repository of type "mock" with randomized location, compression flag and chunk size.
1244+ assertAcked (client ().admin ().cluster ().preparePutRepository ("test-repo" )
1245+ .setType ("mock" ).setSettings (Settings .builder ()
1246+ .put ("location" , randomRepoPath ())
1247+ .put ("compress" , randomBoolean ())
1248+ .put ("chunk_size" , randomIntBetween (100 , 1000 ), ByteSizeUnit .BYTES )));
// Two primaries and no replicas.
1249+ assertAcked (prepareCreate ("test-idx" , 0 , Settings .builder ()
1250+ .put ("number_of_shards" , 2 ).put ("number_of_replicas" , 0 )));
1251+ ensureGreen ();
1252+ logger .info ("--> indexing some data" );
1253+ final int numdocs = randomIntBetween (50 , 100 );
1254+ IndexRequestBuilder [] builders = new IndexRequestBuilder [numdocs ];
1255+ for (int i = 0 ; i < builders .length ; i ++) {
1256+ builders [i ] = client ().prepareIndex ("test-idx" , "type1" ,
1257+ Integer .toString (i )).setSource ("field1" , "bar " + i );
1258+ }
1259+ indexRandom (true , builders );
1260+ flushAndRefresh ();
// NOTE(review): presumably keeps the shard snapshots from finishing on the data nodes so that
// they are still in progress when the nodes are restarted -- confirm in blockAllDataNodes.
1261+ blockAllDataNodes ("test-repo" );
1262+ logger .info ("--> snapshot" );
// The request is sent through the master node's client and does not wait for completion,
// so the test can continue while the snapshot is still running.
1263+ client (internalCluster ().getMasterName ()).admin ().cluster ()
1264+ .prepareCreateSnapshot ("test-repo" , "test-snap" ).setWaitForCompletion (false ).setIndices ("test-idx" ).get ();
1265+ logger .info ("--> restarting first data node, which should cause the primary shard on it to be failed" );
1266+ internalCluster ().restartNode (dataNodes .get (0 ), InternalTestCluster .EMPTY_CALLBACK );
1267+
1268+ logger .info ("--> wait for shard snapshot of first primary to show as failed" );
// Poll the snapshot status for up to 60 seconds until exactly one shard is reported as failed;
// the second node is only restarted after this first failure is visible.
1269+ assertBusy (() -> assertThat (
1270+ client ().admin ().cluster ().prepareSnapshotStatus ("test-repo" ).setSnapshots ("test-snap" ).get ().getSnapshots ()
1271+ .get (0 ).getShardsStats ().getFailedShards (), is (1 )), 60L , TimeUnit .SECONDS );
1272+
1273+ logger .info ("--> restarting second data node, which should cause the primary shard on it to be failed" );
1274+ internalCluster ().restartNode (dataNodes .get (1 ), InternalTestCluster .EMPTY_CALLBACK );
1275+
1276+ // check that snapshot completes with both failed shards being accounted for in the snapshot result
1277+ assertBusy (() -> {
// Despite its name, this local holds the get-snapshots response, not a snapshot-status response.
1278+ GetSnapshotsResponse snapshotsStatusResponse = client ().admin ().cluster ()
1279+ .prepareGetSnapshots ("test-repo" ).setSnapshots ("test-snap" ).setIgnoreUnavailable (true ).get ();
1280+ assertEquals (1 , snapshotsStatusResponse .getSnapshots ().size ());
1281+ SnapshotInfo snapshotInfo = snapshotsStatusResponse .getSnapshots ().get (0 );
// The current state is passed as the assertion message so a failure shows which state was seen.
1282+ assertTrue (snapshotInfo .state ().toString (), snapshotInfo .state ().completed ());
1283+ assertThat (snapshotInfo .totalShards (), is (2 ));
1284+ assertThat (snapshotInfo .shardFailures (), hasSize (2 ));
1285+ }, 60L , TimeUnit .SECONDS );
1286+ }
1287+
12391288 public void testRetentionLeasesClearedOnRestore () throws Exception {
12401289 final String repoName = "test-repo-retention-leases" ;
12411290 assertAcked (client ().admin ().cluster ().preparePutRepository (repoName )
0 commit comments