Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Bulk Scorer For ToParentBlockJoinQuery #13697

Merged
merged 47 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from 42 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
6e398eb
Added BlockJoinBulkScorer
Mikep86 Aug 26, 2024
79ca811
BlockJoinBulkScorer development
Mikep86 Aug 26, 2024
fbd6ca5
Fix assertion failures
Mikep86 Aug 27, 2024
5d55d19
Added TestBlockJoinBulkScorer
Mikep86 Aug 27, 2024
1f39771
Test development
Mikep86 Aug 27, 2024
7e648ad
Compute expected scores and compare them to actual scores
Mikep86 Aug 28, 2024
8bba0db
Randomize score mode. Score multiple random indices.
Mikep86 Aug 28, 2024
57cb4c0
Filter out empty child docs
Mikep86 Aug 28, 2024
c1dce29
Randomize search score mode
Mikep86 Aug 28, 2024
9be3a20
Updated approach to handle when scoring in multiple batches
Mikep86 Aug 28, 2024
e15105f
Increase test iterations, fix assertion error
Mikep86 Aug 28, 2024
3e57ff9
fix assertion error
Mikep86 Aug 28, 2024
3a1859f
Handle when score supplier is null
Mikep86 Aug 28, 2024
25079b7
Fix min score computation
Mikep86 Aug 28, 2024
8f5a0b0
Add license
Mikep86 Aug 28, 2024
0f93801
Change batching approach
Mikep86 Aug 29, 2024
ec3d967
Remove unnecessary null check
Mikep86 Aug 30, 2024
cfd780e
Scoring computation adjustments
Mikep86 Aug 30, 2024
ac14952
Remove unnecessary scorer null checks
Mikep86 Aug 30, 2024
8786e58
Check that there are no matches when score supplier is null
Mikep86 Aug 30, 2024
4768012
Stop scoring once we've scored the last parent
Mikep86 Aug 30, 2024
2ce93ca
Calculate scores using doubles
Mikep86 Aug 30, 2024
b0dd8cd
Remove currentMin
Mikep86 Aug 30, 2024
bb72ece
Fix test failure
Mikep86 Aug 30, 2024
50e0db7
Simplify scoring reset
Mikep86 Sep 3, 2024
58b9868
Increased bulk scorer test iterations and parent doc count
Mikep86 Sep 3, 2024
ef49457
Updated bulk join scorer to use common score accumulator class
Mikep86 Sep 3, 2024
f26cb70
End scoring early if we've scored the last parent in the bit set
Mikep86 Sep 5, 2024
ab1fe69
Encapsulate ScoreMode.None handling
Mikep86 Sep 5, 2024
e58cb26
Delegate setMinCompetitiveScore call to child scorer
Mikep86 Sep 5, 2024
e83ba94
Resolve TODO
Mikep86 Sep 5, 2024
e19d594
Optimize scoring when score mode is ScoreMode.None
Mikep86 Sep 6, 2024
2ef6c37
change arg order
Mikep86 Sep 6, 2024
ca0cd07
Check if min == max
Mikep86 Sep 6, 2024
51cf4fd
Add dynamic pruning test with score mode set to Max
Mikep86 Sep 6, 2024
f558d23
Add dynamic pruning test with score mode set to None
Mikep86 Sep 6, 2024
d3b7d5c
Updated CHANGES.txt
Mikep86 Sep 6, 2024
9d4cc56
Fix test
Mikep86 Sep 9, 2024
e154b43
Scoring optimizations
Mikep86 Sep 9, 2024
f76db73
Add/improve comments
Mikep86 Sep 9, 2024
12343b7
Move error check into ParentApproximation#advance
Mikep86 Sep 10, 2024
5a993e3
ParentApproximation#advance logic adjustments
Mikep86 Sep 10, 2024
64672fa
Revert ParentApproximation#advance error-checking logic
Mikep86 Sep 11, 2024
f287a3b
Fix test
Mikep86 Sep 11, 2024
a62d87c
Merge branch 'main' into nested-query_bulk-scorer
Mikep86 Sep 11, 2024
5a41c63
Fix build error
Mikep86 Sep 11, 2024
448af12
Improve comment
Mikep86 Sep 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,10 @@ Optimizations

* GITHUB#13587: Use Max WAND optimizations with ToParentBlockJoinQuery when using ScoreMode.Max (Mike Pellegrini)

* GITHUB#13697: Add a bulk scorer to ToParentBlockJoinQuery, which delegates to the bulk scorer of the child query.
This should speed up query evaluation when the child query has a specialized bulk scorer, such as disjunctive queries.
(Mike Pellegrini)

Changes in runtime behavior
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/
package org.apache.lucene.search.join;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import static org.apache.lucene.search.ScoreMode.COMPLETE;

import java.io.IOException;
Expand All @@ -24,20 +25,25 @@
import java.util.Locale;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.FilterLeafCollector;
import org.apache.lucene.search.FilterWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesUtils;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits;

/**
* This query requires that you index children and parent docs as a single block, using the {@link
Expand Down Expand Up @@ -156,6 +162,18 @@ public Scorer get(long leadCost) throws IOException {
return new BlockJoinScorer(childScorerSupplier.get(leadCost), parents, scoreMode);
}

@Override
public BulkScorer bulkScorer() throws IOException {
if (scoreMode == ScoreMode.None) {
// BlockJoinBulkScorer evaluates all child hits exhaustively, but when scoreMode is None
// we only need to evaluate a single child doc per parent. In this case, use the default
// bulk scorer instead, which uses BlockJoinScorer to iterate over child hits.
// BlockJoinScorer is optimized to skip child hit evaluation when scoreMode is None.
return super.bulkScorer();
Mikep86 marked this conversation as resolved.
Show resolved Hide resolved
}
return new BlockJoinBulkScorer(childScorerSupplier.bulkScorer(), parents, scoreMode);
Mikep86 marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
public long cost() {
return childScorerSupplier.cost();
Expand Down Expand Up @@ -224,18 +242,30 @@ public int nextDoc() throws IOException {

@Override
public int advance(int target) throws IOException {
if (target >= parentBits.length()) {
return doc = NO_MORE_DOCS;
}
final int firstChildTarget = target == 0 ? 0 : parentBits.prevSetBit(target - 1) + 1;
final int prevParent =
target == 0 ? -1 : parentBits.prevSetBit(Math.min(target, parentBits.length()) - 1);

int childDoc = childApproximation.docID();
if (childDoc < firstChildTarget) {
childDoc = childApproximation.advance(firstChildTarget);
if (childDoc < prevParent) {
childDoc = childApproximation.advance(prevParent);
} else if (childDoc == -1) {
childDoc = childApproximation.nextDoc();
}

if (childDoc == prevParent) {
throw new IllegalStateException(
"Child query must not match same docs with parent filter. "
+ "Combine them as must clauses (+) to find a problem doc. "
+ "docId="
+ childDoc
+ ", "
+ this.getClass());
}
if (childDoc >= parentBits.length() - 1) {

if (childDoc >= parentBits.length()) {
return doc = NO_MORE_DOCS;
}
return doc = parentBits.nextSetBit(childDoc + 1);
return doc = parentBits.nextSetBit(childDoc);
Mikep86 marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
Expand Down Expand Up @@ -275,6 +305,54 @@ public float matchCost() {
}
}

private static class Score extends Scorable {
private final ScoreMode scoreMode;
private double score;
private int freq;

public Score(ScoreMode scoreMode) {
this.scoreMode = scoreMode;
this.score = 0;
this.freq = 0;
}

public void reset(Scorable firstChildScorer) throws IOException {
score = scoreMode == ScoreMode.None ? 0 : firstChildScorer.score();
freq = 1;
}

public void addChildScore(Scorable childScorer) throws IOException {
final float childScore = scoreMode == ScoreMode.None ? 0 : childScorer.score();
freq++;
switch (scoreMode) {
case Total:
case Avg:
score += childScore;
break;
case Min:
score = Math.min(score, childScore);
break;
case Max:
score = Math.max(score, childScore);
break;
case None:
break;
default:
throw new AssertionError();
}
}

@Override
public float score() {
assert freq > 0;
double score = this.score;
if (scoreMode == ScoreMode.Avg) {
score /= freq;
}
return (float) score;
}
}

static class BlockJoinScorer extends Scorer {
private final Scorer childScorer;
private final BitSet parentBits;
Expand All @@ -283,13 +361,14 @@ static class BlockJoinScorer extends Scorer {
private final TwoPhaseIterator childTwoPhase;
private final ParentApproximation parentApproximation;
private final ParentTwoPhase parentTwoPhase;
private float score;
private final Score parentScore;

public BlockJoinScorer(Scorer childScorer, BitSet parentBits, ScoreMode scoreMode) {
// System.out.println("Q.init firstChildDoc=" + firstChildDoc);
this.parentBits = parentBits;
this.childScorer = childScorer;
this.scoreMode = scoreMode;
this.parentScore = new Score(scoreMode);
childTwoPhase = childScorer.twoPhaseIterator();
if (childTwoPhase == null) {
childApproximation = childScorer.iterator();
Expand Down Expand Up @@ -329,8 +408,7 @@ public int docID() {

@Override
public float score() throws IOException {
setScoreAndFreq();
return score;
return scoreChildDocs();
}

@Override
Expand All @@ -348,48 +426,24 @@ public void setMinCompetitiveScore(float minScore) throws IOException {
}
}

private void setScoreAndFreq() throws IOException {
private float scoreChildDocs() throws IOException {
if (childApproximation.docID() >= parentApproximation.docID()) {
return;
return parentScore.score();
}
double score = scoreMode == ScoreMode.None ? 0 : childScorer.score();
int freq = 1;
while (childApproximation.nextDoc() < parentApproximation.docID()) {
if (childTwoPhase == null || childTwoPhase.matches()) {
final float childScore = scoreMode == ScoreMode.None ? 0 : childScorer.score();
freq += 1;
switch (scoreMode) {
case Total:
case Avg:
score += childScore;
break;
case Min:
score = Math.min(score, childScore);
break;
case Max:
score = Math.max(score, childScore);
break;
case None:
break;
default:
throw new AssertionError();

float score = 0;
if (scoreMode != ScoreMode.None) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One issue with not advancing childApproximation is that testNextDocValidationForToParentBjq now fails. I don't see a way to fix this test without advancing the iterator past the child docs. The other option is to remove this test since we can no longer detect that particular edge case when scoreMode == ScoreMode.None. Thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll have to look at the test case a little closer, but I'd rather we didn't unnecessarily advance to make a test case happy. If there's a much sparser query clause leading a conjunctive iteration, the eager advancing of the child iterator to the next parent could add meaningful overhead. I think anyway?

I'm also not totally sure why we even have that check in the TPBJQ logic. It seems like a slightly odd place to do that check? Maybe there's a different place we could do that check if necessary? Maybe we could do a check in ParentApproximation#advance to validate that the child iterator doesn't provide a doc that's present in the parent bitset? Another alternative might be an assert that eagerly advances to the next parent (like you had)? With an assert, we could bypass the potential performance drain while still getting some validation coverage?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think an assert is an option for two reasons:

  • As the tests in TestBlockJoinValidation demonstrate, the edge cases being tested can happen in production.
  • Using an assert to advance the iterator would introduce a material change to the logic that would not be active in production builds

I will investigate adding this error checking logic to ParentApproximation#advance.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have a clear understanding of why it's important to do this error-check in the first place? It's unclear to me at the moment, and I'm a little worried we're bending the production code to a specific test case without understanding whether-or-not this is important (and why). My best guess here is that the check is trying to ensure we don't incorrectly allow parent documents to contribute to the store accumulation if the docs are indexed incorrectly?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on my understanding, this error check is important to ensure that parent docs do not match child doc queries. If such a thing were to happen in production, I think (at a minimum) there would be negative scoring implications. Perhaps @jpountz could provide some more background on why these checks were added.

Based on the tests that are written to exercise this error check, I do not believe we are bending production code to fit a test case. From what I see, it's the opposite: the tests are written to simulate a scenario that could happen in production.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did a deep dive into how to avoid advancing childApproximation and the approach as of 5a993e3 is so close to working, except it doesn't when the child approximation isn't exact and a two-phase iterator is required. I will continue investigating how to apply my solution with a two-phase iterator, but given the deadline with FF I think we should prepare for the outcome where we roll back to advancing childApproximation in scoreChildDocs.

I think this is a really interesting optimization and if we run out of time to implement it for Lucene 9.12 & 10, we can do it for a future release.

Thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe that we have these checks because Lucene could otherwise fail with confusing errors if the child query matched parent documents, which would suggest a bug in Lucene rather than a problem in the application code.

It's ok to remove some of these checks if they cause issues (these checks are best effort anyway), but I'd err on the side of keeping them when they're cheap and e.g. don't impact the set of docs that we're evaluating.

Keeping the removal of this call to advance() works for me, it's somewhat orthogonal to this PR. I agree that removing this wasteful call to advance() would be nice.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @Mikep86 & @jpountz!

parentScore.reset(childScorer);
while (childApproximation.nextDoc() < parentApproximation.docID()) {
if (childTwoPhase == null || childTwoPhase.matches()) {
parentScore.addChildScore(childScorer);
}
}

score = parentScore.score();
}
if (childApproximation.docID() == parentApproximation.docID()
&& (childTwoPhase == null || childTwoPhase.matches())) {
throw new IllegalStateException(
"Child query must not match same docs with parent filter. "
+ "Combine them as must clauses (+) to find a problem doc. "
+ "docId="
+ parentApproximation.docID()
+ ", "
+ childScorer.getClass());
}
if (scoreMode == ScoreMode.Avg) {
score /= freq;
}
this.score = (float) score;

return score;
}

/*
Expand Down Expand Up @@ -440,6 +494,120 @@ private String formatScoreExplanation(int matches, int start, int end, ScoreMode
}
}

private abstract static class BatchAwareLeafCollector extends FilterLeafCollector {
public BatchAwareLeafCollector(LeafCollector in) {
super(in);
}

public void endBatch() throws IOException {}
}

private static class BlockJoinBulkScorer extends BulkScorer {
private final BulkScorer childBulkScorer;
private final ScoreMode scoreMode;
private final BitSet parents;
private final int parentsLength;

public BlockJoinBulkScorer(BulkScorer childBulkScorer, BitSet parents, ScoreMode scoreMode) {
this.childBulkScorer = childBulkScorer;
this.scoreMode = scoreMode;
this.parents = parents;
this.parentsLength = parents.length();
}

@Override
public int score(LeafCollector collector, Bits acceptDocs, int min, int max)
throws IOException {
if (min == max) {
return scoringCompleteCheck(max, max);
}

// Subtract one because max is exclusive w.r.t. score but inclusive w.r.t prevSetBit
int lastParent = parents.prevSetBit(Math.min(parentsLength, max) - 1);
Mikep86 marked this conversation as resolved.
Show resolved Hide resolved
int prevParent = min == 0 ? -1 : parents.prevSetBit(min - 1);
if (lastParent == prevParent) {
// No parent docs in this range.
return scoringCompleteCheck(max, max);
}
Mikep86 marked this conversation as resolved.
Show resolved Hide resolved

BatchAwareLeafCollector wrappedCollector = wrapCollector(collector);
childBulkScorer.score(wrappedCollector, acceptDocs, prevParent + 1, lastParent + 1);
Mikep86 marked this conversation as resolved.
Show resolved Hide resolved
wrappedCollector.endBatch();

return scoringCompleteCheck(lastParent + 1, max);
}

private int scoringCompleteCheck(int innerMax, int returnedMax) {
// If we've scored the last parent in the bit set, return NO_MORE_DOCS to indicate we are done
// scoring
return innerMax >= parentsLength ? NO_MORE_DOCS : returnedMax;
}

@Override
public long cost() {
return childBulkScorer.cost();
}

private BatchAwareLeafCollector wrapCollector(LeafCollector collector) {
return new BatchAwareLeafCollector(collector) {
private final Score currentParentScore = new Score(scoreMode);
Mikep86 marked this conversation as resolved.
Show resolved Hide resolved
private int currentParent = -1;
private Scorable scorer = null;

@Override
public void setScorer(Scorable scorer) throws IOException {
assert scorer != null;
this.scorer = scorer;

super.setScorer(
new Scorable() {
@Override
public float score() {
return currentParentScore.score();
}

@Override
public void setMinCompetitiveScore(float minScore) throws IOException {
if (scoreMode == ScoreMode.None || scoreMode == ScoreMode.Max) {
scorer.setMinCompetitiveScore(minScore);
}
}
});
Mikep86 marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
public void collect(int doc) throws IOException {
if (doc > currentParent) {
// Emit the current parent and setup scoring for the next parent
if (currentParent >= 0) {
in.collect(currentParent);
}

currentParent = parents.nextSetBit(doc);
currentParentScore.reset(scorer);
} else if (doc == currentParent) {
throw new IllegalStateException(
"Child query must not match same docs with parent filter. "
+ "Combine them as must clauses (+) to find a problem doc. "
+ "docId="
+ doc
+ ", "
+ childBulkScorer.getClass());
} else {
currentParentScore.addChildScore(scorer);
}
}

@Override
public void endBatch() throws IOException {
if (currentParent >= 0) {
in.collect(currentParent);
}
}
};
}
}

@Override
public Query rewrite(IndexSearcher indexSearcher) throws IOException {
final Query childRewrite = childQuery.rewrite(indexSearcher);
Expand Down
Loading
Loading