-
Notifications
You must be signed in to change notification settings - Fork 24.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Keep commits and translog up to the global checkpoint #27606
Changes from 1 commit
ad052ec
97e6cff
48381a2
4b9ebb6
7f32e22
94b735f
ab262b1
38310cd
72a4190
1e3d96b
6b2646b
b7177cd
12cae92
dbdacc4
d07af94
639cac6
d146583
2a6198e
b819a8b
b29623d
60a204f
6b74ab1
d8705b4
7ec417c
6733b75
3a67d4d
53702f9
5644456
dad3a1b
b18725c
09b0f91
ce8b52d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
We need to keep index commits and translog operations up to the current global checkpoint to allow us to throw away unsafe operations and increase the operation-based recovery chance. This is achieved by a new index deletion policy.
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.common.lucene.index; | ||
|
||
import org.apache.lucene.index.IndexCommit; | ||
import org.apache.lucene.index.IndexDeletionPolicy; | ||
|
||
import java.io.IOException; | ||
import java.util.List; | ||
|
||
/** | ||
* An {@link IndexDeletionPolicy} that deletes unneeded index commits, and returns index commits are not deleted by this policy. | ||
*/ | ||
public interface ESIndexDeletionPolicy { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we need a separate interface? If we want to know which commits can be kept, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This interface is to avoid keeping translog of snapshotted index commits. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I should have documented its purpose. |
||
|
||
/** | ||
* Similar to {@link IndexDeletionPolicy#onInit(List)} but returns a list of kept index commits. | ||
* | ||
* @param commits A list of index commits sorted by age (the 0th one is the oldest commit). | ||
* @return a list of index commits that are not deleted by this policy. | ||
*/ | ||
List<IndexCommit> onInit(List<? extends IndexCommit> commits) throws IOException; | ||
|
||
/** | ||
* Similar to {@link IndexDeletionPolicy#onCommit(List)} but returns a list of kept index commits. | ||
* | ||
* @param commits A list of index commits sorted by age (the 0th one is the oldest commit). | ||
* @return a list of index commits that are not deleted by this policy. | ||
*/ | ||
List<IndexCommit> onCommit(List<? extends IndexCommit> commits) throws IOException; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This interface clearly came from |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -128,7 +128,7 @@ public class InternalEngine extends Engine { | |
|
||
private final String uidField; | ||
|
||
private final CombinedDeletionPolicy deletionPolicy; | ||
private final SnapshotDeletionPolicy snapshotDeletionPolicy; | ||
|
||
// How many callers are currently requesting index throttling. Currently there are only two situations where we do this: when merges | ||
// are falling behind and when writing indexing buffer to disk is too slow. When this is 0, there is no throttling, else we throttling | ||
|
@@ -167,8 +167,6 @@ public InternalEngine(EngineConfig engineConfig) { | |
engineConfig.getIndexSettings().getTranslogRetentionSize().getBytes(), | ||
engineConfig.getIndexSettings().getTranslogRetentionAge().getMillis() | ||
); | ||
this.deletionPolicy = new CombinedDeletionPolicy( | ||
new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy()), translogDeletionPolicy, openMode); | ||
store.incRef(); | ||
IndexWriter writer = null; | ||
Translog translog = null; | ||
|
@@ -182,29 +180,13 @@ public InternalEngine(EngineConfig engineConfig) { | |
mergeScheduler = scheduler = new EngineMergeScheduler(engineConfig.getShardId(), engineConfig.getIndexSettings()); | ||
throttle = new IndexThrottle(); | ||
try { | ||
final SeqNoStats seqNoStats; | ||
switch (openMode) { | ||
case OPEN_INDEX_AND_TRANSLOG: | ||
writer = createWriter(false); | ||
final long globalCheckpoint = Translog.readGlobalCheckpoint(engineConfig.getTranslogConfig().getTranslogPath()); | ||
seqNoStats = store.loadSeqNoStats(globalCheckpoint); | ||
break; | ||
case OPEN_INDEX_CREATE_TRANSLOG: | ||
writer = createWriter(false); | ||
seqNoStats = store.loadSeqNoStats(SequenceNumbers.UNASSIGNED_SEQ_NO); | ||
break; | ||
case CREATE_INDEX_AND_TRANSLOG: | ||
writer = createWriter(true); | ||
seqNoStats = new SeqNoStats( | ||
SequenceNumbers.NO_OPS_PERFORMED, | ||
SequenceNumbers.NO_OPS_PERFORMED, | ||
SequenceNumbers.UNASSIGNED_SEQ_NO); | ||
break; | ||
default: | ||
throw new IllegalArgumentException(openMode.toString()); | ||
} | ||
final SeqNoStats seqNoStats = loadSeqNoStats(openMode); | ||
logger.trace("recovered [{}]", seqNoStats); | ||
seqNoService = seqNoServiceSupplier.apply(engineConfig, seqNoStats); | ||
this.seqNoService = seqNoServiceSupplier.apply(engineConfig, seqNoStats); | ||
this.snapshotDeletionPolicy = new SnapshotDeletionPolicy(new CombinedDeletionPolicy( | ||
new KeepUntilGlobalCheckpointDeletionPolicy(seqNoService::getGlobalCheckpoint), translogDeletionPolicy, openMode) | ||
); | ||
writer = createWriter(openMode == EngineConfig.OpenMode.CREATE_INDEX_AND_TRANSLOG); | ||
updateMaxUnsafeAutoIdTimestampFromWriter(writer); | ||
historyUUID = loadOrGenerateHistoryUUID(writer, engineConfig.getForceNewHistoryUUID()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we assert that if we need to generate a new history uuid we always use |
||
Objects.requireNonNull(historyUUID, "history uuid should not be null"); | ||
|
@@ -377,6 +359,28 @@ static SequenceNumbersService sequenceNumberService( | |
seqNoStats.getGlobalCheckpoint()); | ||
} | ||
|
||
private SeqNoStats loadSeqNoStats(EngineConfig.OpenMode openMode) throws IOException { | ||
final SeqNoStats seqNoStats; | ||
switch (openMode) { | ||
case OPEN_INDEX_AND_TRANSLOG: | ||
final long globalCheckpoint = Translog.readGlobalCheckpoint(engineConfig.getTranslogConfig().getTranslogPath()); | ||
seqNoStats = store.loadSeqNoStats(globalCheckpoint); | ||
break; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd just return There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
case OPEN_INDEX_CREATE_TRANSLOG: | ||
seqNoStats = store.loadSeqNoStats(SequenceNumbers.UNASSIGNED_SEQ_NO); | ||
break; | ||
case CREATE_INDEX_AND_TRANSLOG: | ||
seqNoStats = new SeqNoStats( | ||
SequenceNumbers.NO_OPS_PERFORMED, | ||
SequenceNumbers.NO_OPS_PERFORMED, | ||
SequenceNumbers.UNASSIGNED_SEQ_NO); | ||
break; | ||
default: | ||
throw new IllegalArgumentException(openMode.toString()); | ||
} | ||
return seqNoStats; | ||
} | ||
|
||
@Override | ||
public InternalEngine recoverFromTranslog() throws IOException { | ||
flushLock.lock(); | ||
|
@@ -1642,7 +1646,7 @@ public IndexCommitRef acquireIndexCommit(final boolean flushFirst) throws Engine | |
} | ||
try (ReleasableLock lock = readLock.acquire()) { | ||
logger.trace("pulling snapshot"); | ||
return new IndexCommitRef(deletionPolicy.getIndexDeletionPolicy()); | ||
return new IndexCommitRef(snapshotDeletionPolicy); | ||
} catch (IOException e) { | ||
throw new SnapshotFailedEngineException(shardId, e); | ||
} | ||
|
@@ -1823,7 +1827,7 @@ private IndexWriterConfig getIndexWriterConfig(boolean create) { | |
final IndexWriterConfig iwc = new IndexWriterConfig(engineConfig.getAnalyzer()); | ||
iwc.setCommitOnClose(false); // we by default don't commit on close | ||
iwc.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND); | ||
iwc.setIndexDeletionPolicy(deletionPolicy); | ||
iwc.setIndexDeletionPolicy(snapshotDeletionPolicy); | ||
// with tests.verbose, lucene sets this up: plumb to align with filesystem stream | ||
boolean verbose = false; | ||
try { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.index.engine; | ||
|
||
import org.apache.lucene.index.IndexCommit; | ||
import org.elasticsearch.common.lucene.index.ESIndexDeletionPolicy; | ||
import org.elasticsearch.index.seqno.SequenceNumbers; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.function.LongSupplier; | ||
|
||
/** | ||
* An {@link ESIndexDeletionPolicy} that deletes index commits that are not required for recovery. | ||
* In particular, this policy will delete index commits whose max sequence number is smaller (or equal) than | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
* the current global checkpoint except the index commit which has the highest max sequence number among those. | ||
*/ | ||
final class KeepUntilGlobalCheckpointDeletionPolicy implements ESIndexDeletionPolicy { | ||
private final LongSupplier globalCheckpointSupplier; | ||
|
||
KeepUntilGlobalCheckpointDeletionPolicy(LongSupplier globalCheckpointSupplier) { | ||
this.globalCheckpointSupplier = globalCheckpointSupplier; | ||
} | ||
|
||
@Override | ||
public List<IndexCommit> onInit(List<? extends IndexCommit> commits) throws IOException { | ||
if (commits.isEmpty()) { | ||
return Collections.emptyList(); | ||
} | ||
return onCommit(commits); | ||
} | ||
|
||
@Override | ||
public List<IndexCommit> onCommit(List<? extends IndexCommit> commits) throws IOException { | ||
assert commits.isEmpty() == false : "onCommit() must be called with a non-empty list of commits"; | ||
|
||
final List<IndexCommit> keptCommits = new ArrayList<>(); | ||
final int keptPosition = indexOfKeptCommits(commits); | ||
final List<Integer> duplicateIndexes = indexesOfDuplicateCommits(commits); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We only call this collection's There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We expect to have zero or one element in a list, there should be no difference. However, using Set makes more sense here; I updated. Thank you. |
||
|
||
for (int i = 0; i < commits.size() - 1; i++) { | ||
final IndexCommit commit = commits.get(i); | ||
if (i < keptPosition || duplicateIndexes.contains(i)) { | ||
commit.delete(); | ||
} else { | ||
keptCommits.add(commit); | ||
} | ||
} | ||
keptCommits.add(commits.get(commits.size() - 1)); // Always keep the last commit. | ||
|
||
assert keptCommits.stream().allMatch(c -> c.isDeleted() == false) : "All kept commits must not be deleted"; | ||
return keptCommits; | ||
} | ||
|
||
/** | ||
* Find the index position of a safe index commit whose max sequence number is not greater than the global checkpoint. | ||
*/ | ||
private int indexOfKeptCommits(List<? extends IndexCommit> commits) throws IOException { | ||
final long currentGlobalCheckpoint = globalCheckpointSupplier.getAsLong(); | ||
|
||
// Commits are sorted by age (the 0th one is the oldest commit). | ||
for (int i = commits.size() - 1; i >= 0; i--) { | ||
final Map<String, String> commitUserData = commits.get(i).getUserData(); | ||
// 5.x commits do not contain MAX_SEQ_NO. | ||
if (commitUserData.containsKey(SequenceNumbers.MAX_SEQ_NO) == false) { | ||
return i; | ||
} | ||
final long maxSeqNoFromCommit = Long.parseLong(commitUserData.get(SequenceNumbers.MAX_SEQ_NO)); | ||
if (maxSeqNoFromCommit <= currentGlobalCheckpoint) { | ||
return i; | ||
} | ||
} | ||
/* | ||
* We may reach to this point in these cases: | ||
* 1. In the previous 6.x, we keep only the last commit - which is likely not a safe commit if writes are in progress. | ||
* Thus, after upgrading, we may not find a safe commit until we can reserve one. | ||
* 2. In peer-recovery, if the file-based happens, a replica will be received the latest commit from a primary. | ||
* However, that commit may not be a safe commit if writes are in progress in the primary. | ||
*/ | ||
return -1; | ||
} | ||
|
||
/** | ||
* In some cases, we may have more than one index commits with the same max sequence number. | ||
* We better scan and delete these duplicate index commits as soon as possible. | ||
* | ||
* @return index positions of duplicate commits. | ||
*/ | ||
private List<Integer> indexesOfDuplicateCommits(List<? extends IndexCommit> commits) throws IOException { | ||
final List<Integer> duplicateEntries = new ArrayList<>(); | ||
long lastMaxSeqNo = Long.MIN_VALUE; | ||
for (int i = commits.size() - 1; i >= 0; i--) { | ||
final Map<String, String> commitUserData = commits.get(i).getUserData(); | ||
// 5.x commits do not contain MAX_SEQ_NO. | ||
if (commitUserData.containsKey(SequenceNumbers.MAX_SEQ_NO)) { | ||
final long maxSeqNoFromCommit = Long.parseLong(commitUserData.get(SequenceNumbers.MAX_SEQ_NO)); | ||
if (lastMaxSeqNo == maxSeqNoFromCommit) { | ||
duplicateEntries.add(i); | ||
} | ||
lastMaxSeqNo = maxSeqNoFromCommit; | ||
} | ||
} | ||
return duplicateEntries; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not an
IndexDeletionPolicy
any more!