-
Notifications
You must be signed in to change notification settings - Fork 24.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Restore from Individual Shard Snapshot Files in Parallel #48110
Changes from all commits
3639a6f
acacf4f
0c4a128
a4beb48
1fd86af
5fb8169
eb6f102
9deb3cd
711c43a
af00465
b4d6d88
19c8f35
3d16462
13bf860
17cf524
eaf9761
51e7c34
4b28df3
22b2306
852efe3
77b6ab2
f5e18c7
cea3e85
087aeeb
3bb0683
ea7f6ce
6ce0808
4613489
1cf0f13
7e4406e
9707a4b
c48183f
f273373
fc4694b
a21ad9b
94feeca
8f8f747
253e198
889f334
0971b46
80bf095
6082460
3ef5686
86db9c7
a8989cc
b781b5b
c880b39
215609e
6630105
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1195,11 +1195,7 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s | |
final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT); | ||
// Start as many workers as fit into the snapshot pool at once at the most | ||
final int workers = Math.min(threadPool.info(ThreadPool.Names.SNAPSHOT).getMax(), indexIncrementalFileCount); | ||
final ActionListener<Void> filesListener = ActionListener.delegateResponse( | ||
new GroupedActionListener<>(allFilesUploadedListener, workers), (l, e) -> { | ||
filesToSnapshot.clear(); // Stop uploading the remaining files if we run into any exception | ||
l.onFailure(e); | ||
}); | ||
final ActionListener<Void> filesListener = fileQueueListener(filesToSnapshot, workers, allFilesUploadedListener); | ||
for (int i = 0; i < workers; ++i) { | ||
executor.execute(ActionRunnable.run(filesListener, () -> { | ||
BlobStoreIndexShardSnapshot.FileInfo snapshotFileInfo = filesToSnapshot.poll(0L, TimeUnit.MILLISECONDS); | ||
|
@@ -1223,19 +1219,42 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s | |
|
||
@Override | ||
public void restoreShard(Store store, SnapshotId snapshotId, IndexId indexId, ShardId snapshotShardId, | ||
RecoveryState recoveryState) { | ||
ShardId shardId = store.shardId(); | ||
try { | ||
final BlobContainer container = shardContainer(indexId, snapshotShardId); | ||
BlobStoreIndexShardSnapshot snapshot = loadShardSnapshot(container, snapshotId); | ||
SnapshotFiles snapshotFiles = new SnapshotFiles(snapshot.snapshot(), snapshot.indexFiles()); | ||
RecoveryState recoveryState, ActionListener<Void> listener) { | ||
final ShardId shardId = store.shardId(); | ||
final ActionListener<Void> restoreListener = ActionListener.delegateResponse(listener, | ||
(l, e) -> l.onFailure(new IndexShardRestoreFailedException(shardId, "failed to restore snapshot [" + snapshotId + "]", e))); | ||
final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT); | ||
final BlobContainer container = shardContainer(indexId, snapshotShardId); | ||
executor.execute(ActionRunnable.wrap(restoreListener, l -> { | ||
final BlobStoreIndexShardSnapshot snapshot = loadShardSnapshot(container, snapshotId); | ||
final SnapshotFiles snapshotFiles = new SnapshotFiles(snapshot.snapshot(), snapshot.indexFiles()); | ||
new FileRestoreContext(metadata.name(), shardId, snapshotId, recoveryState) { | ||
@Override | ||
protected void restoreFiles(List<BlobStoreIndexShardSnapshot.FileInfo> filesToRecover, Store store) throws IOException { | ||
// restore the files from the snapshot to the Lucene store | ||
for (final BlobStoreIndexShardSnapshot.FileInfo fileToRecover : filesToRecover) { | ||
logger.trace("[{}] [{}] restoring file [{}]", shardId, snapshotId, fileToRecover.name()); | ||
restoreFile(fileToRecover, store); | ||
protected void restoreFiles(List<BlobStoreIndexShardSnapshot.FileInfo> filesToRecover, Store store, | ||
ActionListener<Void> listener) { | ||
if (filesToRecover.isEmpty()) { | ||
listener.onResponse(null); | ||
} else { | ||
// Start as many workers as fit into the snapshot pool at once at the most | ||
final int workers = | ||
Math.min(threadPool.info(ThreadPool.Names.SNAPSHOT).getMax(), snapshotFiles.indexFiles().size()); | ||
final BlockingQueue<BlobStoreIndexShardSnapshot.FileInfo> files = new LinkedBlockingQueue<>(filesToRecover); | ||
final ActionListener<Void> allFilesListener = | ||
fileQueueListener(files, workers, ActionListener.map(listener, v -> null)); | ||
// restore the files from the snapshot to the Lucene store | ||
for (int i = 0; i < workers; ++i) { | ||
executor.execute(ActionRunnable.run(allFilesListener, () -> { | ||
store.incRef(); | ||
try { | ||
BlobStoreIndexShardSnapshot.FileInfo fileToRecover; | ||
while ((fileToRecover = files.poll(0L, TimeUnit.MILLISECONDS)) != null) { | ||
restoreFile(fileToRecover, store); | ||
} | ||
} finally { | ||
store.decRef(); | ||
} | ||
})); | ||
} | ||
} | ||
} | ||
|
||
|
@@ -1275,10 +1294,16 @@ protected InputStream openSlice(long slice) throws IOException { | |
} | ||
} | ||
} | ||
}.restore(snapshotFiles, store); | ||
} catch (Exception e) { | ||
throw new IndexShardRestoreFailedException(shardId, "failed to restore snapshot [" + snapshotId + "]", e); | ||
} | ||
}.restore(snapshotFiles, store, l); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

I understand the changes around

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

I found some significant simplifications here in #48173 when investigating this for this PR, but that was pretty much all the simplification I could find (it does remove incorrectly shared code between the blobstore and CCR repositories). Actually, with that change the separation between CCR and blobstore code should be "correct" in theory, as all the code in the base class is now used across both implementations. I think I'd "simply" make the CCR logic properly async in a follow-up, as that would simplify it overall as well, but I figured that for now, keeping the CCR code as-is and blocking around the changed API was the way to go, so as not to mix things into this PR. |
||
})); | ||
} | ||
|
||
private static ActionListener<Void> fileQueueListener(BlockingQueue<BlobStoreIndexShardSnapshot.FileInfo> files, int workers, | ||
ActionListener<Collection<Void>> listener) { | ||
return ActionListener.delegateResponse(new GroupedActionListener<>(listener, workers), (l, e) -> { | ||
files.clear(); // Stop uploading the remaining files if we run into any exception | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

For extra safety, can we call:

    try {
        files.clear(); // Stop uploading the remaining files if we run into any exception
    } finally {
        l.onFailure(e);
    }

There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

It shouldn't be necessary, I think. If we run into an exception/throwable here, the node is probably done for anyway? (The only way I see us getting here is some OOM; otherwise
||
l.onFailure(e); | ||
}); | ||
} | ||
|
||
private static InputStream maybeRateLimit(InputStream stream, @Nullable RateLimiter rateLimiter, CounterMetric metric) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not very fair to other actions running on the node. This is essentially blocking the snapshot threadpool until all files have been restored for the shard. Imagine a concurrent snapshot (in the future). We should avoid blocking threadpools for very long actions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fair point, but to some degree, blocking the threadpool here is by design, so that we restore shard-by-shard as efficiently as possible. Maybe it would be better to move this action onto the generic pool in a follow-up and allow configuring the restore parallelism independently of the snapshot parallelism?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess the question when it comes to parallel operations is more general anyway. Doesn't it boil down to what trade-offs we want to make about the point in time of creating index commits?
If we don't do these long running operations per-shard then we'll have a better chance of quickly having a thread available for taking index-commits but the individual commits will live for longer. If we absolutely don't care about quickly taking index commits I think the current approach is best. If we strongly care, we could move taking index commits to the generic pool (but even if we did that we'd have to bound their number ...).
-> Until we decide on a strategy for scheduling the steps in parallel snapshot creation, this seems like the best approach, since it's fastest end-to-end, isn't it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok