LUCENE-10311: remove complex cost estimation and abstraction leakage around it #709
Changes from all commits: d668ca2, 4c6b436, ce172ec, c64ee66, ceb5e30, 5d1fbf4
The diff below is against org.apache.lucene.util.DocIdSetBuilder.

@@ -19,8 +19,6 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import org.apache.lucene.index.PointValues;
-import org.apache.lucene.index.Terms;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.packed.PackedInts;
@@ -100,52 +98,17 @@ public void add(int doc) {
   private final int maxDoc;
   private final int threshold;
-  // pkg-private for testing
-  final boolean multivalued;
-  final double numValuesPerDoc;

   private List<Buffer> buffers = new ArrayList<>();
   private int totalAllocated; // accumulated size of the allocated buffers

   private FixedBitSet bitSet;

-  private long counter = -1;
   private BulkAdder adder;

   /** Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}. */
   public DocIdSetBuilder(int maxDoc) {
-    this(maxDoc, -1, -1);
-  }
-
-  /**
-   * Create a {@link DocIdSetBuilder} instance that is optimized for accumulating docs that match
-   * the given {@link Terms}.
-   */
-  public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException {
-    this(maxDoc, terms.getDocCount(), terms.getSumDocFreq());
-  }
-
-  /**
-   * Create a {@link DocIdSetBuilder} instance that is optimized for accumulating docs that match
-   * the given {@link PointValues}.
-   */
-  public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOException {
-    this(maxDoc, values.getDocCount(), values.size());
-  }
-
-  DocIdSetBuilder(int maxDoc, int docCount, long valueCount) {
     this.maxDoc = maxDoc;
-    this.multivalued = docCount < 0 || docCount != valueCount;
-    if (docCount <= 0 || valueCount < 0) {
-      // assume one value per doc, this means the cost will be overestimated
-      // if the docs are actually multi-valued
-      this.numValuesPerDoc = 1;
-    } else {
-      // otherwise compute from index stats
-      this.numValuesPerDoc = (double) valueCount / docCount;
-    }
-
-    assert numValuesPerDoc >= 1 : "valueCount=" + valueCount + " docCount=" + docCount;

     // For ridiculously small sets, we'll just use a sorted int[]
     // maxDoc >>> 7 is a good value if you want to save memory, lower values
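For reference, the cost model being deleted worked as follows: the per-document value count came from index statistics, and build() later divided the raw per-value counter by it to approximate a document-level cost. A minimal reconstruction of that arithmetic from the removed lines (wrapped in a helper method for readability; the names come from the deleted code):

```java
// Reconstruction of the removed cost model: index stats say docCount documents
// contain valueCount values for this field, and `counter` was incremented once
// per *value* added to the builder.
static long estimatedCost(int docCount, long valueCount, long counter) {
  double numValuesPerDoc =
      (docCount <= 0 || valueCount < 0)
          ? 1.0                             // stats unavailable: assume single-valued
          : (double) valueCount / docCount; // e.g. 2.0 if every doc has two points
  // Dividing the per-value counter by the ratio converts it into an
  // estimated number of matching *documents*.
  return Math.round(counter / numValuesPerDoc);
}
```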
@@ -190,10 +153,8 @@ public BulkAdder grow(int numDocs) {
         ensureBufferCapacity(numDocs);
       } else {
         upgradeToBitSet();
-        counter += numDocs;
       }
-    } else {
-      counter += numDocs;
     }
     return adder;
   }
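With the counter gone, grow() is purely a storage concern: it either reserves buffer capacity or upgrades to a bitset. For context, a minimal usage sketch of the builder's public API (`maxDoc` and `docs` are assumed inputs, not part of the PR):

```java
// Typical DocIdSetBuilder usage; `maxDoc` and `docs` are assumed inputs.
DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc);
DocIdSetBuilder.BulkAdder adder = builder.grow(docs.length); // reserve space first
for (int doc : docs) {
  adder.add(doc); // unsorted input and duplicates are fine; build() sorts and dedups
}
DocIdSet result = builder.build(); // sorted int[] for tiny sets, FixedBitSet otherwise
```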
@@ -247,17 +208,14 @@ private void growBuffer(Buffer buffer, int additionalCapacity) {
   private void upgradeToBitSet() {
     assert bitSet == null;
     FixedBitSet bitSet = new FixedBitSet(maxDoc);
-    long counter = 0;
     for (Buffer buffer : buffers) {
       int[] array = buffer.array;
       int length = buffer.length;
-      counter += length;
       for (int i = 0; i < length; ++i) {
         bitSet.set(array[i]);
       }
     }
     this.bitSet = bitSet;
-    this.counter = counter;
     this.buffers = null;
     this.adder = new FixedBitSetAdder(bitSet);
   }
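The upgrade now only copies buffered docs into the bitset. Assuming the threshold really is the `maxDoc >>> 7` mentioned in the comment above, the memory trade-off works out as follows (illustrative numbers, not from the PR):

```java
// Back-of-the-envelope for the maxDoc >>> 7 upgrade threshold (illustrative only).
int maxDoc = 1 << 20;                  // say, ~1M docs in the segment
int threshold = maxDoc >>> 7;          // 8192 docs buffered before upgrading
long bufferBits = (long) threshold * Integer.SIZE; // int[] storage: 262,144 bits
long bitSetBits = maxDoc;              // FixedBitSet always costs maxDoc bits
// bufferBits == bitSetBits / 4, so buffers never grow past a quarter of the
// bitset's footprint before upgradeToBitSet() kicks in.
assert bufferBits == bitSetBits / 4;
```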
@@ -266,20 +224,12 @@ private void upgradeToBitSet() {
   public DocIdSet build() {
     try {
       if (bitSet != null) {
-        assert counter >= 0;
-        final long cost = Math.round(counter / numValuesPerDoc);
-        return new BitDocIdSet(bitSet, cost);
+        return new BitDocIdSet(bitSet);
       } else {
         Buffer concatenated = concat(buffers);
         LSBRadixSorter sorter = new LSBRadixSorter();
         sorter.sort(PackedInts.bitsRequired(maxDoc - 1), concatenated.array, concatenated.length);
-        final int l;
-        if (multivalued) {
-          l = dedup(concatenated.array, concatenated.length);
-        } else {
-          assert noDups(concatenated.array, concatenated.length);
-          l = concatenated.length;
-        }
+        final int l = dedup(concatenated.array, concatenated.length);
         assert l <= concatenated.length;
         concatenated.array[l] = DocIdSetIterator.NO_MORE_DOCS;
         return new IntArrayDocIdSet(concatenated.array, l);

Inline review thread on the dedup change:

Contributor: Do we really want to throw away this optimisation? We normally know whether our data is single- or multi-valued, so it seems wasteful not to exploit it.

Member (Author): This optimization doesn't make sense to me. Buffers should only be used for tiny sets (they are very memory-expensive).

Contributor: Ok, I am convinced. Thanks!
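As the thread above notes, the now-unconditional dedup is a single linear pass over an already-sorted array, so it stays cheap at the sizes buffers are meant to hold. A sketch of that pass (modeled on, not copied from, the private dedup method whose tail appears below):

```java
// In-place dedup of a sorted int[] prefix; returns the unique length.
// Modeled on DocIdSetBuilder's private dedup(int[], int), not copied from it.
static int dedup(int[] arr, int length) {
  if (length == 0) {
    return 0;
  }
  int l = 1; // arr[0] always survives
  for (int i = 1; i < length; ++i) {
    if (arr[i] != arr[i - 1]) {
      arr[l++] = arr[i]; // keep the first element of each run of duplicates
    }
  }
  return l;
}
```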
@@ -336,11 +286,4 @@ private static int dedup(int[] arr, int length) {
     }
     return l;
   }
-
-  private static boolean noDups(int[] a, int len) {
-    for (int i = 1; i < len; ++i) {
-      assert a[i - 1] < a[i];
-    }
-    return true;
-  }
 }
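Dropping the explicit cost argument works because BitDocIdSet also has a one-argument constructor that derives the cost from the set itself. As far as I recall it delegates roughly like this (paraphrased from memory, worth double-checking against the actual class):

```java
// Paraphrase of Lucene's BitDocIdSet one-arg constructor (verify against source):
public BitDocIdSet(BitSet set) {
  this(set, set.approximateCardinality()); // cost derived from the set itself
}
```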
Contributor: We still need to implement the method estimateCardinality, which is the hard bit.

Member (Author): I don't think it is difficult, it just requires a little work. I can get to it soon; seems like it should be fun. Ultimately I think it will give us better estimations than what we have today, without all the tangled APIs and abstraction leakage.

Contributor: I like the idea of sampling, thanks.
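The sampling idea under discussion could look roughly like the following: popcount a random sample of the bitset's 64-bit words and extrapolate the density to the whole set. This is a hypothetical sketch, not the PR's code; the SampledCardinality class and the estimateCardinality signature here are invented for illustration:

```java
import java.util.Random;

// Hypothetical sketch of sampling-based cardinality estimation for a bit set
// stored as 64-bit words (as FixedBitSet stores it). Invented for illustration.
public final class SampledCardinality {

  static long estimateCardinality(long[] words, int sampleSize, long seed) {
    if (sampleSize >= words.length) {
      long exact = 0;
      for (long word : words) {
        exact += Long.bitCount(word); // small set: just count exactly
      }
      return exact;
    }
    Random random = new Random(seed);
    long sampledBits = 0;
    for (int i = 0; i < sampleSize; i++) {
      // Sample words uniformly (with replacement) and popcount them.
      sampledBits += Long.bitCount(words[random.nextInt(words.length)]);
    }
    // Extrapolate the sampled bit density to the whole word array.
    return Math.round((double) sampledBits / sampleSize * words.length);
  }
}
```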