@@ -148,7 +148,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
         LatLonPoint.checkCompatible(fieldInfo);

         // matching docids
-        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
+        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
         final IntersectVisitor visitor = getIntersectVisitor(result);

         final Weight weight = this;
@@ -538,7 +538,7 @@ public long cost() {
     } else {
       return new ScorerSupplier() {

-        final DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
+        final DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
         final IntersectVisitor visitor = getIntersectVisitor(result);
         long cost = -1;
@@ -187,7 +187,7 @@ && hasAnyHits(spatialVisitor, queryRelation, values) == false) {
       return null;
     }
     // walk the tree to get matching documents
-    return new RelationScorerSupplier(values, spatialVisitor, queryRelation, field) {
+    return new RelationScorerSupplier(values, spatialVisitor, queryRelation) {
       @Override
       public Scorer get(long leadCost) throws IOException {
         return getScorer(reader, weight, score(), scoreMode);
@@ -252,18 +252,15 @@ private abstract static class RelationScorerSupplier extends ScorerSupplier {
     private final PointValues values;
     private final SpatialVisitor spatialVisitor;
     private final QueryRelation queryRelation;
-    private final String field;
     private long cost = -1;

     RelationScorerSupplier(
         final PointValues values,
         SpatialVisitor spatialVisitor,
-        final QueryRelation queryRelation,
-        final String field) {
+        final QueryRelation queryRelation) {
       this.values = values;
       this.spatialVisitor = spatialVisitor;
       this.queryRelation = queryRelation;
-      this.field = field;
     }

     protected Scorer getScorer(
@@ -311,7 +308,7 @@ && cost() > reader.maxDoc() / 2) {
           cost[0] == 0 ? DocIdSetIterator.empty() : new BitSetIterator(result, cost[0]);
       return new ConstantScoreScorer(weight, boost, scoreMode, iterator);
     } else {
-      final DocIdSetBuilder docIdSetBuilder = new DocIdSetBuilder(reader.maxDoc(), values, field);
+      final DocIdSetBuilder docIdSetBuilder = new DocIdSetBuilder(reader.maxDoc());
       values.intersect(getSparseVisitor(spatialVisitor, queryRelation, docIdSetBuilder));
       final DocIdSetIterator iterator = docIdSetBuilder.build().iterator();
       return new ConstantScoreScorer(weight, boost, scoreMode, iterator);
@@ -149,7 +149,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
     return new ScorerSupplier() {

       long cost = -1;
-      DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
+      DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
       final IntersectVisitor visitor = getIntersectVisitor(result, tree);

       @Override
@@ -180,7 +180,7 @@ private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
       }

       // Too many terms: go back to the terms we already collected and start building the bit set
-      DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms);
+      DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
       if (collectedTerms.isEmpty() == false) {
         TermsEnum termsEnum2 = terms.iterator();
         for (TermAndState t : collectedTerms) {
@@ -173,7 +173,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
                 + bytesPerDim);
       }

-      DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
+      DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());

       if (numDims == 1) {

@@ -331,7 +331,7 @@ public long cost() {
     } else {
       return new ScorerSupplier() {

-        final DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
+        final DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
         final IntersectVisitor visitor = getIntersectVisitor(result);
         long cost = -1;
@@ -288,7 +288,7 @@ private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
           matchingTerms.add(new TermAndState(field, termsEnum));
         } else {
           assert matchingTerms.size() == threshold;
-          builder = new DocIdSetBuilder(reader.maxDoc(), terms);
+          builder = new DocIdSetBuilder(reader.maxDoc());
           docs = termsEnum.postings(docs, PostingsEnum.NONE);
           builder.add(docs);
           for (TermAndState t : matchingTerms) {
lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java (61 changes: 2 additions & 59 deletions)
@@ -19,8 +19,6 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import org.apache.lucene.index.PointValues;
-import org.apache.lucene.index.Terms;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.packed.PackedInts;
@@ -100,52 +98,17 @@ public void add(int doc) {

   private final int maxDoc;
   private final int threshold;
-  // pkg-private for testing
-  final boolean multivalued;
-  final double numValuesPerDoc;

   private List<Buffer> buffers = new ArrayList<>();
   private int totalAllocated; // accumulated size of the allocated buffers

   private FixedBitSet bitSet;

-  private long counter = -1;
   private BulkAdder adder;

   /** Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}. */
   public DocIdSetBuilder(int maxDoc) {
-    this(maxDoc, -1, -1);
-  }
-
-  /**
-   * Create a {@link DocIdSetBuilder} instance that is optimized for accumulating docs that match
-   * the given {@link Terms}.
-   */
-  public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException {
-    this(maxDoc, terms.getDocCount(), terms.getSumDocFreq());
-  }
-
-  /**
-   * Create a {@link DocIdSetBuilder} instance that is optimized for accumulating docs that match
-   * the given {@link PointValues}.
-   */
-  public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOException {
-    this(maxDoc, values.getDocCount(), values.size());
-  }
-
-  DocIdSetBuilder(int maxDoc, int docCount, long valueCount) {
     this.maxDoc = maxDoc;
-    this.multivalued = docCount < 0 || docCount != valueCount;
-    if (docCount <= 0 || valueCount < 0) {
-      // assume one value per doc, this means the cost will be overestimated
-      // if the docs are actually multi-valued
-      this.numValuesPerDoc = 1;
-    } else {
-      // otherwise compute from index stats
-      this.numValuesPerDoc = (double) valueCount / docCount;
-    }
-
-    assert numValuesPerDoc >= 1 : "valueCount=" + valueCount + " docCount=" + docCount;

     // For ridiculously small sets, we'll just use a sorted int[]
     // maxDoc >>> 7 is a good value if you want to save memory, lower values
@@ -190,10 +153,8 @@ public BulkAdder grow(int numDocs) {
         ensureBufferCapacity(numDocs);
       } else {
         upgradeToBitSet();
-        counter += numDocs;
       }
     } else {
-      counter += numDocs;
     }
     return adder;
   }
@@ -247,17 +208,14 @@ private void growBuffer(Buffer buffer, int additionalCapacity) {
   private void upgradeToBitSet() {
     assert bitSet == null;
     FixedBitSet bitSet = new FixedBitSet(maxDoc);
-    long counter = 0;
     for (Buffer buffer : buffers) {
       int[] array = buffer.array;
       int length = buffer.length;
-      counter += length;
       for (int i = 0; i < length; ++i) {
         bitSet.set(array[i]);
       }
     }
     this.bitSet = bitSet;
-    this.counter = counter;
     this.buffers = null;
     this.adder = new FixedBitSetAdder(bitSet);
   }
@@ -266,20 +224,12 @@ private void upgradeToBitSet() {
   public DocIdSet build() {
     try {
       if (bitSet != null) {
-        assert counter >= 0;
-        final long cost = Math.round(counter / numValuesPerDoc);
-        return new BitDocIdSet(bitSet, cost);
+        return new BitDocIdSet(bitSet);

Contributor: We still need to implement the method estimateCardinality, which is the hard bit.

Member (Author): I don't think it is difficult, it just requires a little work. I can get to it soon; it seems like it should be fun. Ultimately I think it will give us better estimations than what we have today, without all the tangled APIs and abstraction leakage.

Contributor: I like the idea of sampling, thanks.

(A sampling-based estimate of this kind is sketched after this hunk.)

       } else {
         Buffer concatenated = concat(buffers);
         LSBRadixSorter sorter = new LSBRadixSorter();
         sorter.sort(PackedInts.bitsRequired(maxDoc - 1), concatenated.array, concatenated.length);
-        final int l;
-        if (multivalued) {
-          l = dedup(concatenated.array, concatenated.length);

Contributor: Do we really want to throw away this optimisation? We normally know whether our data is single- or multi-valued, so it seems wasteful not to exploit it.

Member (Author): This optimization doesn't make sense to me. Buffers should only be used for tiny sets (they are very memory expensive).

Contributor: Ok, I am convinced. Thanks!

(An in-place dedup pass over a sorted buffer is sketched at the end of this file's diff.)

-        } else {
-          assert noDups(concatenated.array, concatenated.length);
-          l = concatenated.length;
-        }
+        final int l = dedup(concatenated.array, concatenated.length);
         assert l <= concatenated.length;
         concatenated.array[l] = DocIdSetIterator.NO_MORE_DOCS;
         return new IntArrayDocIdSet(concatenated.array, l);
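
For context on the sampling idea raised in the first review thread: a minimal sketch of a density-sampling estimate over a FixedBitSet is shown below. The class name, window size, and window count are hypothetical choices for illustration; this is not the estimateCardinality implementation discussed in the thread, it only shows how a sampled bit density can be extrapolated to a cardinality estimate.

import org.apache.lucene.util.FixedBitSet;

// Hypothetical helper, for illustration only: estimate the number of set bits in a
// FixedBitSet by sampling evenly spaced windows and extrapolating the observed density.
final class CardinalitySketch {

  static long estimateCardinality(FixedBitSet bitSet, int maxDoc) {
    final int windowSize = 1024; // illustrative, not a tuned value
    final int numWindows = 16;   // illustrative, not a tuned value
    if ((long) windowSize * numWindows >= maxDoc) {
      return bitSet.cardinality(); // small set: just count exactly
    }
    long sampledBits = 0;
    long sampledRange = 0;
    for (int i = 0; i < numWindows; i++) {
      // spread the windows evenly over the doc id space
      int from = (int) ((long) i * maxDoc / numWindows);
      int to = Math.min(from + windowSize, maxDoc);
      for (int doc = from; doc < to; doc++) {
        if (bitSet.get(doc)) {
          sampledBits++;
        }
      }
      sampledRange += to - from;
    }
    // extrapolate the sampled density to the full doc id range
    return Math.round((double) sampledBits / sampledRange * maxDoc);
  }
}
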
@@ -336,11 +286,4 @@ private static int dedup(int[] arr, int length) {
     }
     return l;
   }
-
-  private static boolean noDups(int[] a, int len) {
-    for (int i = 1; i < len; ++i) {
-      assert a[i - 1] < a[i];
-    }
-    return true;
-  }
 }
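
For context on the dedup discussion above: deduplicating an already-sorted buffer is a single linear pass, which is why doing it unconditionally stays cheap for the small buffers this builder keeps before upgrading to a bit set. A minimal sketch under that assumption follows; the class and method names are hypothetical, and this mirrors, but is not, the private dedup helper in DocIdSetBuilder.

// Illustration only: in-place dedup of an already-sorted int[] prefix.
final class DedupSketch {
  static int dedupSorted(int[] docs, int length) {
    if (length == 0) {
      return 0;
    }
    int unique = 1;
    for (int i = 1; i < length; i++) {
      if (docs[i] != docs[unique - 1]) {
        docs[unique++] = docs[i];
      }
    }
    return unique; // distinct doc ids are now packed at the front of the array
  }
}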