Skip to content

Commit

Permalink
Fix primary term deletion logic for RemoteFsTimestampAwareTranslog
Browse files Browse the repository at this point in the history
Signed-off-by: Sachin Kale <kalsac@amazon.com>
  • Loading branch information
Sachin Kale committed Sep 4, 2024
1 parent efe0860 commit edb726c
Show file tree
Hide file tree
Showing 4 changed files with 241 additions and 149 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,8 @@ public Directory newDirectory(String repositoryName, String indexUUID, ShardId s
}
}

public Supplier<RepositoriesService> getRepositoriesService() {
return this.repositoriesService;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

package org.opensearch.index.translog;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.opensearch.action.LatchedActionListener;
import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.blobstore.BlobMetadata;
import org.opensearch.common.collect.Tuple;
Expand All @@ -33,6 +35,9 @@
import java.util.Optional;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.BooleanSupplier;
import java.util.function.LongConsumer;
import java.util.function.LongSupplier;
Expand All @@ -52,11 +57,13 @@
*/
public class RemoteFsTimestampAwareTranslog extends RemoteFsTranslog {

private static Logger staticLogger = LogManager.getLogger(RemoteFsTimestampAwareTranslog.class);
private final Logger logger;
private final Map<Long, String> metadataFilePinnedTimestampMap;
// For metadata files, with no min generation in the name, we cache generation data to avoid multiple reads.
private final Map<String, Tuple<Long, Long>> oldFormatMetadataFileGenerationMap;
private final Map<String, Tuple<Long, Long>> oldFormatMetadataFilePrimaryTermMap;
private final AtomicLong minPrimaryTermInRemote = new AtomicLong(Long.MAX_VALUE);

public RemoteFsTimestampAwareTranslog(
TranslogConfig config,
Expand Down Expand Up @@ -167,7 +174,11 @@ public void onResponse(List<BlobMetadata> blobMetadata) {
return;
}

List<String> metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(metadataFiles);
List<String> metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(
metadataFiles,
metadataFilePinnedTimestampMap,
logger
);

// If index is not deleted, make sure to keep latest metadata file
if (indexDeleted == false) {
Expand All @@ -186,41 +197,35 @@ public void onResponse(List<BlobMetadata> blobMetadata) {
metadataFilesNotToBeDeleted.removeAll(metadataFilesToBeDeleted);

logger.debug(() -> "metadataFilesNotToBeDeleted = " + metadataFilesNotToBeDeleted);
Map<Long, Set<Long>> generationsToBeDeletedToPrimaryTermRangeMap = getGenerationsToBeDeleted(
Set<Long> generationsToBeDeleted = getGenerationsToBeDeleted(
metadataFilesNotToBeDeleted,
metadataFilesToBeDeleted,
indexDeleted
);

logger.debug(() -> "generationsToBeDeletedToPrimaryTermRangeMap = " + generationsToBeDeletedToPrimaryTermRangeMap);
if (generationsToBeDeletedToPrimaryTermRangeMap.isEmpty() == false) {
logger.debug(() -> "generationsToBeDeleted = " + generationsToBeDeleted);
if (generationsToBeDeleted.isEmpty() == false) {
// Delete stale generations
Map<Long, Set<Long>> primaryTermToGenerationsMap = getPrimaryTermToGenerationsMap(
generationsToBeDeletedToPrimaryTermRangeMap
);
translogTransferManager.deleteGenerationAsync(
primaryTermToGenerationsMap,
primaryTermSupplier.getAsLong(),
generationsToBeDeleted,
remoteGenerationDeletionPermits::release
);
} else {
remoteGenerationDeletionPermits.release();
}

if (metadataFilesToBeDeleted.isEmpty() == false) {
// Delete stale metadata files
translogTransferManager.deleteMetadataFilesAsync(
metadataFilesToBeDeleted,
remoteGenerationDeletionPermits::release
);
} else {
remoteGenerationDeletionPermits.release();
}

// Update cache to keep only those metadata files that are not getting deleted
oldFormatMetadataFileGenerationMap.keySet().retainAll(metadataFilesNotToBeDeleted);
// Update cache to keep only those metadata files that are not getting deleted
oldFormatMetadataFileGenerationMap.keySet().retainAll(metadataFilesNotToBeDeleted);

// Delete stale primary terms
deleteStaleRemotePrimaryTerms(metadataFiles);
// Delete stale primary terms
deleteStaleRemotePrimaryTerms(metadataFilesNotToBeDeleted);
} else {
remoteGenerationDeletionPermits.release(REMOTE_DELETION_PERMITS);
}
} catch (Exception e) {
remoteGenerationDeletionPermits.release(REMOTE_DELETION_PERMITS);
}
Expand All @@ -235,21 +240,8 @@ public void onFailure(Exception e) {
translogTransferManager.listTranslogMetadataFilesAsync(listMetadataFilesListener);
}

protected Map<Long, Set<Long>> getPrimaryTermToGenerationsMap(Map<Long, Set<Long>> generationsToBeDeletedToPrimaryTermRangeMap) {
Map<Long, Set<Long>> primaryTermToGenerationsMap = new HashMap<>();
for (Map.Entry<Long, Set<Long>> entry : generationsToBeDeletedToPrimaryTermRangeMap.entrySet()) {
for (Long primaryTerm : entry.getValue()) {
if (primaryTermToGenerationsMap.containsKey(primaryTerm) == false) {
primaryTermToGenerationsMap.put(primaryTerm, new HashSet<>());
}
primaryTermToGenerationsMap.get(primaryTerm).add(entry.getKey());
}
}
return primaryTermToGenerationsMap;
}

// Visible for testing
protected Map<Long, Set<Long>> getGenerationsToBeDeleted(
protected Set<Long> getGenerationsToBeDeleted(
List<String> metadataFilesNotToBeDeleted,
List<String> metadataFilesToBeDeleted,
boolean indexDeleted
Expand All @@ -260,60 +252,36 @@ protected Map<Long, Set<Long>> getGenerationsToBeDeleted(
maxGenerationToBeDeleted = minRemoteGenReferenced - 1 - indexSettings().getRemoteTranslogExtraKeep();
}

Map<Long, Set<String>> generationsFromMetadataFilesToBeDeleted = new HashMap<>();
Set<Long> generationsFromMetadataFilesToBeDeleted = new HashSet<>();
for (String mdFile : metadataFilesToBeDeleted) {
Tuple<Long, Long> minMaxGen = getMinMaxTranslogGenerationFromMetadataFile(mdFile, translogTransferManager);
List<Long> generations = LongStream.rangeClosed(minMaxGen.v1(), minMaxGen.v2()).boxed().collect(Collectors.toList());
for (Long generation : generations) {
if (generationsFromMetadataFilesToBeDeleted.containsKey(generation) == false) {
generationsFromMetadataFilesToBeDeleted.put(generation, new HashSet<>());
}
generationsFromMetadataFilesToBeDeleted.get(generation).add(mdFile);
}
}

for (String mdFile : metadataFilesNotToBeDeleted) {
Tuple<Long, Long> minMaxGen = getMinMaxTranslogGenerationFromMetadataFile(mdFile, translogTransferManager);
List<Long> generations = LongStream.rangeClosed(minMaxGen.v1(), minMaxGen.v2()).boxed().collect(Collectors.toList());
for (Long generation : generations) {
if (generationsFromMetadataFilesToBeDeleted.containsKey(generation)) {
generationsFromMetadataFilesToBeDeleted.get(generation).add(mdFile);
}
}
generationsFromMetadataFilesToBeDeleted.addAll(
LongStream.rangeClosed(minMaxGen.v1(), minMaxGen.v2()).boxed().collect(Collectors.toList())
);
}

Map<String, Tuple<Long, Long>> metadataFileNotToBeDeletedGenerationMap = getGenerationForMetadataFiles(metadataFilesNotToBeDeleted);
TreeSet<Tuple<Long, Long>> pinnedGenerations = getOrderedPinnedMetadataGenerations(metadataFileNotToBeDeletedGenerationMap);
Map<Long, Set<Long>> generationsToBeDeletedToPrimaryTermRangeMap = new HashMap<>();
for (long generation : generationsFromMetadataFilesToBeDeleted.keySet()) {
Set<Long> generationsToBeDeleted = new HashSet<>();
for (long generation : generationsFromMetadataFilesToBeDeleted) {
// Check if the generation is not referred by metadata file matching pinned timestamps
if (generation <= maxGenerationToBeDeleted && isGenerationPinned(generation, pinnedGenerations) == false) {
generationsToBeDeletedToPrimaryTermRangeMap.put(
generation,
getPrimaryTermRange(generationsFromMetadataFilesToBeDeleted.get(generation), translogTransferManager)
);
generationsToBeDeleted.add(generation);
}
}
return generationsToBeDeletedToPrimaryTermRangeMap;
return generationsToBeDeleted;
}

protected Set<Long> getPrimaryTermRange(Set<String> metadataFiles, TranslogTransferManager translogTransferManager) throws IOException {
Tuple<Long, Long> primaryTermRange = new Tuple<>(Long.MIN_VALUE, Long.MAX_VALUE);
for (String metadataFile : metadataFiles) {
Tuple<Long, Long> primaryTermRangeForMdFile = getMinMaxPrimaryTermFromMetadataFile(metadataFile, translogTransferManager);
primaryTermRange = new Tuple<>(
Math.max(primaryTermRange.v1(), primaryTermRangeForMdFile.v1()),
Math.min(primaryTermRange.v2(), primaryTermRangeForMdFile.v2())
);
if (primaryTermRange.v1().equals(primaryTermRange.v2())) {
break;
}
}
return LongStream.rangeClosed(primaryTermRange.v1(), primaryTermRange.v2()).boxed().collect(Collectors.toSet());
protected List<String> getMetadataFilesToBeDeleted(List<String> metadataFiles) {
return getMetadataFilesToBeDeleted(metadataFiles, metadataFilePinnedTimestampMap, logger);
}

// Visible for testing
protected List<String> getMetadataFilesToBeDeleted(List<String> metadataFiles) {
protected static List<String> getMetadataFilesToBeDeleted(
List<String> metadataFiles,
Map<Long, String> metadataFilePinnedTimestampMap,
Logger logger
) {
Tuple<Long, Set<Long>> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps();

// Keep files since last successful run of scheduler
Expand Down Expand Up @@ -404,8 +372,94 @@ protected Tuple<Long, Long> getMinMaxTranslogGenerationFromMetadataFile(
}
}

protected Tuple<Long, Long> getMinMaxPrimaryTermFromMetadataFile(String metadataFile, TranslogTransferManager translogTransferManager)
throws IOException {
private void deleteStaleRemotePrimaryTerms(List<String> metadataFiles) {
deleteStaleRemotePrimaryTerms(
metadataFiles,
translogTransferManager,
oldFormatMetadataFilePrimaryTermMap,
minPrimaryTermInRemote,
logger
);
}

/**
* This method must be called only after there are valid generations to delete in trimUnreferencedReaders as it ensures
* implicitly that minimum primary term in latest translog metadata in remote store is the current primary term.
* <br>
* This will also delete all stale translog metadata files from remote except the latest basis the metadata file comparator.
*/
private static void deleteStaleRemotePrimaryTerms(
List<String> metadataFiles,
TranslogTransferManager translogTransferManager,
Map<String, Tuple<Long, Long>> oldFormatMetadataFilePrimaryTermMap,
AtomicLong minPrimaryTermInRemote,
Logger logger
) {
// The deletion of older translog files in remote store is on best-effort basis, there is a possibility that there
// are older files that are no longer needed and should be cleaned up. In here, we delete all files that are part
// of older primary term.
if (metadataFiles.isEmpty()) {
logger.trace("No metadata is uploaded yet, returning from deleteStaleRemotePrimaryTerms");
return;
}
Optional<Long> minPrimaryTermFromMetadataFiles = metadataFiles.stream().map(file -> {
try {
return getMinMaxPrimaryTermFromMetadataFile(file, translogTransferManager, oldFormatMetadataFilePrimaryTermMap).v1();
} catch (IOException e) {
return Long.MAX_VALUE;
}
}).min(Long::compareTo);
// First we delete all stale primary terms folders from remote store
long minimumReferencedPrimaryTerm = minPrimaryTermFromMetadataFiles.get() - 1;
Long minPrimaryTerm = getMinPrimaryTermInRemote(minPrimaryTermInRemote, translogTransferManager, logger);
if (minimumReferencedPrimaryTerm > minPrimaryTerm) {
translogTransferManager.deletePrimaryTermsAsync(minimumReferencedPrimaryTerm);
minPrimaryTermInRemote.set(minimumReferencedPrimaryTerm);
} else {
logger.debug(
"Skipping primary term cleanup. minimumReferencedPrimaryTerm = {}, minPrimaryTermInRemote = {}",
minimumReferencedPrimaryTerm,
minPrimaryTermInRemote
);
}
}

private static Long getMinPrimaryTermInRemote(
AtomicLong minPrimaryTermInRemote,
TranslogTransferManager translogTransferManager,
Logger logger
) {
if (minPrimaryTermInRemote.get() == Long.MAX_VALUE) {
CountDownLatch latch = new CountDownLatch(1);
translogTransferManager.listPrimaryTermsInRemoteAsync(new LatchedActionListener<>(new ActionListener<>() {
@Override
public void onResponse(Set<Long> primaryTermsInRemote) {
Optional<Long> minPrimaryTerm = primaryTermsInRemote.stream().min(Long::compareTo);
minPrimaryTerm.ifPresent(minPrimaryTermInRemote::set);
}

@Override
public void onFailure(Exception e) {
logger.error("Exception while fetching min primary term from remote translog", e);
}
}, latch));

try {
if (latch.await(5, TimeUnit.MINUTES) == false) {
logger.error("Timeout while fetching min primary term from remote translog");
}
} catch (InterruptedException e) {
logger.error("Exception while fetching min primary term from remote translog", e);
}
}
return minPrimaryTermInRemote.get();
}

protected static Tuple<Long, Long> getMinMaxPrimaryTermFromMetadataFile(
String metadataFile,
TranslogTransferManager translogTransferManager,
Map<String, Tuple<Long, Long>> oldFormatMetadataFilePrimaryTermMap
) throws IOException {
Tuple<Long, Long> minMaxPrimaryTermFromFileName = TranslogTransferMetadata.getMinMaxPrimaryTermFromFilename(metadataFile);
if (minMaxPrimaryTermFromFileName != null) {
return minMaxPrimaryTermFromFileName;
Expand Down Expand Up @@ -434,27 +488,51 @@ protected Tuple<Long, Long> getMinMaxPrimaryTermFromMetadataFile(String metadata
}
}

/**
* This method must be called only after there are valid generations to delete in trimUnreferencedReaders as it ensures
* implicitly that minimum primary term in latest translog metadata in remote store is the current primary term.
* <br>
* This will also delete all stale translog metadata files from remote except the latest basis the metadata file comparator.
*/
private void deleteStaleRemotePrimaryTerms(List<String> metadataFiles) {
// The deletion of older translog files in remote store is on best-effort basis, there is a possibility that there
// are older files that are no longer needed and should be cleaned up. In here, we delete all files that are part
// of older primary term.
if (olderPrimaryCleaned.trySet(Boolean.TRUE)) {
if (metadataFiles.isEmpty()) {
logger.trace("No metadata is uploaded yet, returning from deleteStaleRemotePrimaryTerms");
return;
public static void cleanup(TranslogTransferManager translogTransferManager) throws IOException {
ActionListener<List<BlobMetadata>> listMetadataFilesListener = new ActionListener<>() {
@Override
public void onResponse(List<BlobMetadata> blobMetadata) {
List<String> metadataFiles = blobMetadata.stream().map(BlobMetadata::name).collect(Collectors.toList());

try {
if (metadataFiles.isEmpty()) {
staticLogger.debug("No stale translog metadata files found");
return;
}
List<String> metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(metadataFiles, new HashMap<>(), staticLogger);
if (metadataFilesToBeDeleted.isEmpty()) {
staticLogger.debug("No metadata files to delete");
return;
}
staticLogger.debug(() -> "metadataFilesToBeDeleted = " + metadataFilesToBeDeleted);

// For all the files that we are keeping, fetch min and max generations
List<String> metadataFilesNotToBeDeleted = new ArrayList<>(metadataFiles);
metadataFilesNotToBeDeleted.removeAll(metadataFilesToBeDeleted);
staticLogger.debug(() -> "metadataFilesNotToBeDeleted = " + metadataFilesNotToBeDeleted);

// Delete stale metadata files
translogTransferManager.deleteMetadataFilesAsync(
metadataFilesToBeDeleted,
// Delete stale primary terms
() -> deleteStaleRemotePrimaryTerms(
metadataFilesNotToBeDeleted,
translogTransferManager,
new HashMap<>(),
new AtomicLong(Long.MAX_VALUE),
staticLogger
)
);
} catch (Exception e) {
staticLogger.error("Exception while cleaning up metadata and primary terms", e);
}
}
Optional<Long> minPrimaryTerm = metadataFiles.stream()
.map(file -> RemoteStoreUtils.invertLong(file.split(METADATA_SEPARATOR)[1]))
.min(Long::compareTo);
// First we delete all stale primary terms folders from remote store
long minimumReferencedPrimaryTerm = minPrimaryTerm.get() - 1;
translogTransferManager.deletePrimaryTermsAsync(minimumReferencedPrimaryTerm);
}

@Override
public void onFailure(Exception e) {
staticLogger.error("Exception while cleaning up metadata and primary terms", e);
}
};
translogTransferManager.listTranslogMetadataFilesAsync(listMetadataFilesListener);
}
}
Loading

0 comments on commit edb726c

Please sign in to comment.