From d159e5d2bb270045c2964355cbe5cd569b9d9b60 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Sat, 2 Nov 2019 20:43:20 +0100 Subject: [PATCH] Cleanup Concurrent RepositoryData Loading (#48329) (#48835) The loading of `RepositoryData` is not an atomic operation. It uses a list + get combination of calls. This lead to accidentally returning an empty repository data for generations >=0 which can never not exist unless the repository is corrupted. In the test #48122 (and other SLM tests) there was a low chance of running into this concurrent modification scenario and the repository actually moving two index generations between listing out the index-N and loading the latest version of it. Since we only keep two index-N around at a time this lead to unexpectedly absent snapshots in status APIs. Fixing the behavior to be more resilient is non-trivial but in the works. For now I think we should simply throw in this scenario. This will also help prevent corruption in the unlikely event but possible of running into this issue in a snapshot create or delete operation on master failover on a repository like S3 which doesn't have the "no overwrites" protection on writing a new index-N. Fixes #48122 --- .../blobstore/BlobStoreRepository.java | 10 +------ .../slm/SLMSnapshotBlockingIntegTests.java | 26 ++++++++++++------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java index 0274ed2dc7416..240920635fdb1 100644 --- a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java +++ b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java @@ -849,9 +849,6 @@ public void endVerification(String seed) { public RepositoryData getRepositoryData() { try { return getRepositoryData(latestIndexBlobId()); - } catch (NoSuchFileException ex) { - // repository doesn't have an index blob, its a new blank repo - return RepositoryData.EMPTY; } catch (IOException ioe) { throw new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe); } @@ -864,17 +861,12 @@ private RepositoryData getRepositoryData(long indexGen) { try { final String snapshotsIndexBlobName = INDEX_FILE_PREFIX + Long.toString(indexGen); - RepositoryData repositoryData; // EMPTY is safe here because RepositoryData#fromXContent calls namedObject try (InputStream blob = blobContainer().readBlob(snapshotsIndexBlobName); XContentParser parser = XContentType.JSON.xContent().createParser(NamedXContentRegistry.EMPTY, LoggingDeprecationHandler.INSTANCE, blob)) { - repositoryData = RepositoryData.snapshotsFromXContent(parser, indexGen); + return RepositoryData.snapshotsFromXContent(parser, indexGen); } - return repositoryData; - } catch (NoSuchFileException ex) { - // repository doesn't have an index blob, its a new blank repo - return RepositoryData.EMPTY; } catch (IOException ioe) { throw new RepositoryException(metadata.name(), "could not read repository data from index blob", ioe); } diff --git a/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java b/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java index 284376902f2a1..d0e03769ee838 100644 --- a/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java +++ b/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java @@ -410,18 +410,24 @@ private void testUnsuccessfulSnapshotRetention(boolean partialSuccess) throws Ex logger.info("--> waiting for {} snapshot [{}] to be deleted", expectedUnsuccessfulState, failedSnapshotName.get()); assertBusy(() -> { try { + try { + GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster() + .prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get(); + assertThat(snapshotsStatusResponse.getSnapshots(), empty()); + } catch (SnapshotMissingException e) { + // This is what we want to happen + } + logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists", + expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get()); GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster() - .prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get(); - assertThat(snapshotsStatusResponse.getSnapshots(), empty()); - } catch (SnapshotMissingException e) { - // This is what we want to happen + .prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get(); + SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0); + assertEquals(SnapshotState.SUCCESS, snapshotInfo.state()); + } catch (RepositoryException re) { + // Concurrent status calls and write operations may lead to failures in determining the current repository generation + // TODO: Remove this hack once tracking the current repository generation has been made consistent + throw new AssertionError(re); } - logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists", - expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get()); - GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster() - .prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get(); - SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0); - assertEquals(SnapshotState.SUCCESS, snapshotInfo.state()); }); } }