From 0ce0d9d1928a9cc572120af19d00206a91c023a6 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Sun, 15 Oct 2023 21:36:12 -0700 Subject: [PATCH 1/2] Fix bug where retries within RemoteStoreRefreshListener cause mismatch between ReplicationCheckpoint and uploaded SegmentInfos. Retries within RemoteStoreRefreshListener run outside of the refresh thread. This means that concurrent refreshes may occur during syncSegments execution updating the on-reader SegmentInfos. A shard's latest ReplicationCheckpoint is computed and set in a refresh listener, but it is not guaranteed the listener has run before the retry fetches the infos or checkpoint independently. This fix ensures the listener recomputes the checkpoint while fetching the SegmentInfos. This change also ensures that we only recompute the checkpoint when necessary because it comes with an IO cost to compute StoreFileMetadata. Signed-off-by: Marc Handalian Update refresh listener to recompute checkpoint from latest infos snapshot. Signed-off-by: Marc Handalian Fix broken test case by comparing segments gen Signed-off-by: Marc Handalian spotless Signed-off-by: Marc Handalian Fix RemoteStoreRefreshListener tests Signed-off-by: Marc Handalian --- .../opensearch/index/shard/IndexShard.java | 65 +++++++++++-------- .../shard/RemoteStoreRefreshListener.java | 6 +- .../RemoteStoreRefreshListenerTests.java | 4 +- .../SegmentReplicationIndexShardTests.java | 27 ++++++++ 4 files changed, 68 insertions(+), 34 deletions(-) diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 9489c7d7fc1dd..fa29840f1fa8b 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -1608,8 +1608,11 @@ public GatedCloseable acquireSafeIndexCommit() throws EngineExcepti } /** - * Compute and return the latest ReplicationCheckpoint for a particular shard. - * @return EMPTY checkpoint before the engine is opened and null for non-segrep enabled indices + * return the most recently computed ReplicationCheckpoint for a particular shard. + * The checkpoint is updated inside a refresh listener and may lag behind the SegmentInfos on the reader. + * To guarantee the checkpoint is upto date with the latest on-reader infos, use `getLatestSegmentInfosAndCheckpoint` instead. + * + * @return {@link ReplicationCheckpoint} - The most recently computed ReplicationCheckpoint. */ public ReplicationCheckpoint getLatestReplicationCheckpoint() { return replicationTracker.getLatestReplicationCheckpoint(); @@ -1628,34 +1631,12 @@ public ReplicationCheckpoint getLatestReplicationCheckpoint() { public Tuple, ReplicationCheckpoint> getLatestSegmentInfosAndCheckpoint() { assert indexSettings.isSegRepEnabled(); - Tuple, ReplicationCheckpoint> nullSegmentInfosEmptyCheckpoint = new Tuple<>( - new GatedCloseable<>(null, () -> {}), - getLatestReplicationCheckpoint() - ); - - if (getEngineOrNull() == null) { - return nullSegmentInfosEmptyCheckpoint; - } // do not close the snapshot - caller will close it. GatedCloseable snapshot = null; try { snapshot = getSegmentInfosSnapshot(); - if (snapshot.get() != null) { - SegmentInfos segmentInfos = snapshot.get(); - final Map metadataMap = store.getSegmentMetadataMap(segmentInfos); - return new Tuple<>( - snapshot, - new ReplicationCheckpoint( - this.shardId, - getOperationPrimaryTerm(), - segmentInfos.getGeneration(), - segmentInfos.getVersion(), - metadataMap.values().stream().mapToLong(StoreFileMetadata::length).sum(), - getEngine().config().getCodec().getName(), - metadataMap - ) - ); - } + final SegmentInfos segmentInfos = snapshot.get(); + return new Tuple<>(snapshot, computeReplicationCheckpoint(segmentInfos)); } catch (IOException | AlreadyClosedException e) { logger.error("Error Fetching SegmentInfos and latest checkpoint", e); if (snapshot != null) { @@ -1666,7 +1647,37 @@ public Tuple, ReplicationCheckpoint> getLatestSegme } } } - return nullSegmentInfosEmptyCheckpoint; + return new Tuple<>(new GatedCloseable<>(null, () -> {}), getLatestReplicationCheckpoint()); + } + + /** + * Compute the latest {@link ReplicationCheckpoint} from a SegmentInfos. + * This function fetches a metadata snapshot from the store that comes with an IO cost. + * We will reuse the existing stored checkpoint if it is at the same SI version. + * + * @param segmentInfos {@link SegmentInfos} infos to use to compute. + * @return {@link ReplicationCheckpoint} Checkpoint computed from the infos. + * @throws IOException When there is an error computing segment metadata from the store. + */ + ReplicationCheckpoint computeReplicationCheckpoint(SegmentInfos segmentInfos) throws IOException { + if (segmentInfos == null) { + return ReplicationCheckpoint.empty(shardId); + } + final ReplicationCheckpoint latestReplicationCheckpoint = getLatestReplicationCheckpoint(); + if (latestReplicationCheckpoint.getSegmentInfosVersion() == segmentInfos.getVersion() + && latestReplicationCheckpoint.getSegmentsGen() == segmentInfos.getGeneration()) { + return latestReplicationCheckpoint; + } + final Map metadataMap = store.getSegmentMetadataMap(segmentInfos); + return new ReplicationCheckpoint( + this.shardId, + getOperationPrimaryTerm(), + segmentInfos.getGeneration(), + segmentInfos.getVersion(), + metadataMap.values().stream().mapToLong(StoreFileMetadata::length).sum(), + getEngine().config().getCodec().getName(), + metadataMap + ); } /** diff --git a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java index 698e61f6f7a09..c650edc31da8d 100644 --- a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java @@ -181,7 +181,6 @@ private boolean syncSegments() { // in the remote store. return indexShard.state() != IndexShardState.STARTED || !(indexShard.getEngine() instanceof InternalEngine); } - ReplicationCheckpoint checkpoint = indexShard.getLatestReplicationCheckpoint(); beforeSegmentsSync(); long refreshTimeMs = segmentTracker.getLocalRefreshTimeMs(), refreshClockTimeMs = segmentTracker.getLocalRefreshClockTimeMs(); long refreshSeqNo = segmentTracker.getLocalRefreshSeqNo(); @@ -199,10 +198,7 @@ private boolean syncSegments() { try (GatedCloseable segmentInfosGatedCloseable = indexShard.getSegmentInfosSnapshot()) { SegmentInfos segmentInfos = segmentInfosGatedCloseable.get(); - assert segmentInfos.getGeneration() == checkpoint.getSegmentsGen() : "SegmentInfos generation: " - + segmentInfos.getGeneration() - + " does not match metadata generation: " - + checkpoint.getSegmentsGen(); + final ReplicationCheckpoint checkpoint = indexShard.computeReplicationCheckpoint(segmentInfos); // Capture replication checkpoint before uploading the segments as upload can take some time and checkpoint can // move. long lastRefreshedCheckpoint = ((InternalEngine) indexShard.getEngine()).lastRefreshedCheckpoint(); diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java index 5a13f57db2c87..51814283c5eb3 100644 --- a/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java +++ b/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java @@ -520,8 +520,8 @@ private Tuple mockIn if (counter.incrementAndGet() <= succeedOnAttempt) { throw new RuntimeException("Inducing failure in upload"); } - return indexShard.getLatestSegmentInfosAndCheckpoint(); - })).when(shard).getLatestSegmentInfosAndCheckpoint(); + return indexShard.getLatestReplicationCheckpoint(); + })).when(shard).computeReplicationCheckpoint(any()); doAnswer(invocation -> { if (Objects.nonNull(successLatch)) { diff --git a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java index 52f28aead533d..eab38bfe5c64d 100644 --- a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java @@ -925,6 +925,33 @@ public void testSnapshotWhileFailoverIncomplete() throws Exception { } } + public void testReuseReplicationCheckpointWhenLatestInfosIsUnChanged() throws Exception { + try (ReplicationGroup shards = createGroup(1, settings, indexMapping, new NRTReplicationEngineFactory(), createTempDir())) { + final IndexShard primaryShard = shards.getPrimary(); + shards.startAll(); + shards.indexDocs(10); + shards.refresh("test"); + replicateSegments(primaryShard, shards.getReplicas()); + shards.assertAllEqual(10); + final ReplicationCheckpoint latestReplicationCheckpoint = primaryShard.getLatestReplicationCheckpoint(); + try (GatedCloseable segmentInfosSnapshot = primaryShard.getSegmentInfosSnapshot()) { + assertEquals(latestReplicationCheckpoint, primaryShard.computeReplicationCheckpoint(segmentInfosSnapshot.get())); + } + final Tuple, ReplicationCheckpoint> latestSegmentInfosAndCheckpoint = primaryShard + .getLatestSegmentInfosAndCheckpoint(); + try (final GatedCloseable closeable = latestSegmentInfosAndCheckpoint.v1()) { + assertEquals(latestReplicationCheckpoint, primaryShard.computeReplicationCheckpoint(closeable.get())); + } + } + } + + public void testComputeReplicationCheckpointNullInfosReturnsEmptyCheckpoint() throws Exception { + try (ReplicationGroup shards = createGroup(1, settings, indexMapping, new NRTReplicationEngineFactory(), createTempDir())) { + final IndexShard primaryShard = shards.getPrimary(); + assertEquals(ReplicationCheckpoint.empty(primaryShard.shardId), primaryShard.computeReplicationCheckpoint(null)); + } + } + private SnapshotShardsService getSnapshotShardsService(IndexShard replicaShard) { final TransportService transportService = mock(TransportService.class); when(transportService.getThreadPool()).thenReturn(threadPool); From 04180e3adb4c514eb0463237a8b4b25f50fb8e01 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Wed, 18 Oct 2023 14:01:57 -0700 Subject: [PATCH 2/2] add extra log Signed-off-by: Marc Handalian --- .../src/main/java/org/opensearch/index/shard/IndexShard.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index fa29840f1fa8b..5ebfd3863a6cf 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -1669,7 +1669,7 @@ ReplicationCheckpoint computeReplicationCheckpoint(SegmentInfos segmentInfos) th return latestReplicationCheckpoint; } final Map metadataMap = store.getSegmentMetadataMap(segmentInfos); - return new ReplicationCheckpoint( + final ReplicationCheckpoint checkpoint = new ReplicationCheckpoint( this.shardId, getOperationPrimaryTerm(), segmentInfos.getGeneration(), @@ -1678,6 +1678,8 @@ ReplicationCheckpoint computeReplicationCheckpoint(SegmentInfos segmentInfos) th getEngine().config().getCodec().getName(), metadataMap ); + logger.trace("Recomputed ReplicationCheckpoint for shard {}", checkpoint); + return checkpoint; } /**