Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Break point estimate when threshold exceeded #13199

Merged
merged 7 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ Optimizations
(Ben Trent)
* GITHUB#13184: Make the HitQueue size more appropriate for KNN exact search (Pan Guixin)

* GITHUB#13199: Speed up dynamic pruning by breaking point estimation when threshold gets exceeded. (Guo Feng)

Bug Fixes
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -473,11 +473,10 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
};

final long currentQueryCost = Math.min(leadCost, it.cost());
final long threshold = currentQueryCost >>> 3;
long estimatedNumberOfMatches =
pointValues.estimatePointCount(visitor); // runs in O(log(numPoints))
// TODO: what is the right factor compared to the current disi? Is 8 optimal?
if (estimatedNumberOfMatches >= threshold) {
final long threshold = currentQueryCost >>> 3;
if (PointValues.isEstimatedPointCountGreaterThanOrEqualTo(
visitor, pointValues.getPointTree(), threshold)) {
// the new range is not selective enough to be worth materializing
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -436,11 +436,10 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
};

final long currentQueryCost = Math.min(leadCost, it.cost());
final long threshold = currentQueryCost >>> 3;
long estimatedNumberOfMatches =
pointValues.estimatePointCount(visitor); // runs in O(log(numPoints))
// TODO: what is the right factor compared to the current disi? Is 8 optimal?
if (estimatedNumberOfMatches >= threshold) {
final long threshold = currentQueryCost >>> 3;
if (PointValues.isEstimatedPointCountGreaterThanOrEqualTo(
visitor, pointValues.getPointTree(), threshold)) {
// the new range is not selective enough to be worth materializing
return;
}
Expand Down
28 changes: 23 additions & 5 deletions lucene/core/src/java/org/apache/lucene/index/PointValues.java
Original file line number Diff line number Diff line change
Expand Up @@ -375,16 +375,34 @@ private void intersect(IntersectVisitor visitor, PointTree pointTree) throws IOE
public final long estimatePointCount(IntersectVisitor visitor) {
try {
final PointTree pointTree = getPointTree();
final long count = estimatePointCount(visitor, pointTree);
final long count = estimatePointCount(visitor, pointTree, Long.MAX_VALUE);
assert pointTree.moveToParent() == false;
return count;
} catch (IOException ioe) {
throw new UncheckedIOException(ioe);
}
}

private long estimatePointCount(IntersectVisitor visitor, PointTree pointTree)
throws IOException {
/**
* Estimate if the point count that would be matched by {@link #intersect} with the given {@link
* IntersectVisitor} is greater than or equal to the upperBound.
*
* <p>The {@code upperBound} is passed through to the underlying estimation as its limit, and the
* value that estimation returns is then compared against the same {@code upperBound}.
*
* @param visitor the visitor whose matches are being estimated
* @param pointTree the tree handed to the estimation as its starting position
* @param upperBound the threshold the estimated point count is compared against
* @return {@code true} if the estimated point count is greater than or equal to {@code
*     upperBound}, {@code false} otherwise
* @throws IOException if the underlying estimation throws one
* @lucene.internal
*/
public static boolean isEstimatedPointCountGreaterThanOrEqualTo(
IntersectVisitor visitor, PointTree pointTree, long upperBound) throws IOException {
// The estimate itself is discarded; callers only need to know which side of the bound it is on.
return estimatePointCount(visitor, pointTree, upperBound) >= upperBound;
}

/**
* Estimate the number of documents that would be matched by {@link #intersect} with the given
* {@link IntersectVisitor}. The estimation will terminate when the point count gets greater than
* or equal to the upper bound.
*
* <p>TODO: will breadth-first help estimation terminate earlier?
*/
private static long estimatePointCount(
IntersectVisitor visitor, PointTree pointTree, long upperBound) throws IOException {
Relation r = visitor.compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue());
switch (r) {
case CELL_OUTSIDE_QUERY:
Expand All @@ -398,8 +416,8 @@ private long estimatePointCount(IntersectVisitor visitor, PointTree pointTree)
if (pointTree.moveToChild()) {
long cost = 0;
do {
cost += estimatePointCount(visitor, pointTree);
} while (pointTree.moveToSibling());
cost += estimatePointCount(visitor, pointTree, upperBound - cost);
} while (cost < upperBound && pointTree.moveToSibling());
pointTree.moveToParent();
return cost;
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ public abstract class NumericLeafComparator implements LeafFieldComparator {
private final LeafReaderContext context;
protected final NumericDocValues docValues;
private final PointValues pointValues;
private final PointValues.PointTree pointTree;
// if skipping functionality should be enabled on this segment
private final boolean enableSkipping;
private final int maxDoc;
Expand Down Expand Up @@ -129,10 +130,12 @@ public NumericLeafComparator(LeafReaderContext context) throws IOException {
+ " expected "
+ bytesCount);
}
this.pointTree = pointValues.getPointTree();
this.enableSkipping = true; // skipping is enabled when points are available
this.maxDoc = context.reader().maxDoc();
this.competitiveIterator = DocIdSetIterator.all(maxDoc);
} else {
this.pointTree = null;
this.enableSkipping = false;
this.maxDoc = 0;
}
Expand Down Expand Up @@ -282,9 +285,8 @@ public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue
}
};
final long threshold = iteratorCost >>> 3;
long estimatedNumberOfMatches =
pointValues.estimatePointCount(visitor); // runs in O(log(numPoints))
if (estimatedNumberOfMatches >= threshold) {

if (PointValues.isEstimatedPointCountGreaterThanOrEqualTo(visitor, pointTree, threshold)) {
// the new range is not selective enough to be worth materializing, it doesn't reduce number
// of docs at least 8x
updateSkipInterval(false);
Expand Down