diff --git a/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceFeatureQuery.java b/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceFeatureQuery.java index 8508cb137611..6661b0503fb4 100644 --- a/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceFeatureQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceFeatureQuery.java @@ -38,8 +38,8 @@ import org.apache.lucene.search.Scorer; import org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.Weight; -import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.PointsDocIdSetBuilder; import org.apache.lucene.util.SloppyMath; final class LatLonPointDistanceFeatureQuery extends Query { @@ -398,12 +398,12 @@ public void setMinCompetitiveScore(float minScore) throws IOException { NumericUtils.intToSortableBytes(GeoEncodingUtils.encodeLongitude(box.minLon), minLon, 0); NumericUtils.intToSortableBytes(GeoEncodingUtils.encodeLongitude(box.maxLon), maxLon, 0); - DocIdSetBuilder result = new DocIdSetBuilder(maxDoc); + PointsDocIdSetBuilder result = new PointsDocIdSetBuilder(maxDoc, pointValues); final int doc = docID(); IntersectVisitor visitor = new IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; + PointsDocIdSetBuilder.BulkAdder adder; @Override public void grow(int count) { diff --git a/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java b/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java index 4adf74529131..3d372774c9bb 100644 --- a/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java @@ -44,9 +44,9 @@ import org.apache.lucene.search.Weight; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitSetIterator; -import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.PointsDocIdSetBuilder; /** Distance query for {@link LatLonPoint}. */ final class LatLonPointDistanceQuery extends Query { @@ -148,7 +148,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti LatLonPoint.checkCompatible(fieldInfo); // matching docids - DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); + PointsDocIdSetBuilder result = new PointsDocIdSetBuilder(reader.maxDoc(), values); final IntersectVisitor visitor = getIntersectVisitor(result); final Weight weight = this; @@ -241,10 +241,10 @@ private Relation relate(byte[] minPackedValue, byte[] maxPackedValue) { } /** Create a visitor that collects documents matching the range. 
*/ - private IntersectVisitor getIntersectVisitor(DocIdSetBuilder result) { + private IntersectVisitor getIntersectVisitor(PointsDocIdSetBuilder result) { return new IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; + PointsDocIdSetBuilder.BulkAdder adder; @Override public void grow(int count) { diff --git a/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java b/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java index c9e3a6ec6260..e66f020349ce 100644 --- a/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java @@ -35,7 +35,7 @@ import org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.Weight; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.PointsDocIdSetBuilder; final class LongDistanceFeatureQuery extends Query { @@ -384,12 +384,12 @@ public void setMinCompetitiveScore(float minScore) throws IOException { final byte[] maxValueAsBytes = new byte[Long.BYTES]; LongPoint.encodeDimension(maxValue, maxValueAsBytes, 0); - DocIdSetBuilder result = new DocIdSetBuilder(maxDoc); + PointsDocIdSetBuilder result = new PointsDocIdSetBuilder(maxDoc, pointValues); final int doc = docID(); IntersectVisitor visitor = new IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; + PointsDocIdSetBuilder.BulkAdder adder; @Override public void grow(int count) { diff --git a/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java b/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java index ab83969de6f7..58c880dde8bc 100644 --- a/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java @@ -35,7 +35,7 @@ import org.apache.lucene.search.Scorer; import org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.Weight; -import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.PointsDocIdSetBuilder; /** * Query class for searching {@code RangeField} types by a defined {@link Relation}. 
@@ -452,9 +452,9 @@ public final Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, fl throws IOException { return new ConstantScoreWeight(this, boost) { - private IntersectVisitor getIntersectVisitor(DocIdSetBuilder result) { + private IntersectVisitor getIntersectVisitor(PointsDocIdSetBuilder result) { return new IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; + PointsDocIdSetBuilder.BulkAdder adder; @Override public void grow(int count) { @@ -538,7 +538,7 @@ public long cost() { } else { return new ScorerSupplier() { - final DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); + final PointsDocIdSetBuilder result = new PointsDocIdSetBuilder(reader.maxDoc(), values); final IntersectVisitor visitor = getIntersectVisitor(result); long cost = -1; diff --git a/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java b/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java index c6170ecad626..de4b6ef0157c 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java @@ -41,8 +41,8 @@ import org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.Weight; import org.apache.lucene.util.BitSetIterator; -import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.PointsDocIdSetBuilder; /** * Base query class for all spatial geometries: {@link LatLonShape}, {@link LatLonPoint} and {@link @@ -187,7 +187,7 @@ && hasAnyHits(spatialVisitor, queryRelation, values) == false) { return null; } // walk the tree to get matching documents - return new RelationScorerSupplier(values, spatialVisitor, queryRelation, field) { + return new RelationScorerSupplier(values, spatialVisitor, queryRelation) { @Override public Scorer get(long leadCost) throws IOException { return getScorer(reader, weight, score(), scoreMode); @@ -252,18 +252,15 @@ private abstract static class RelationScorerSupplier extends ScorerSupplier { private final PointValues values; private final SpatialVisitor spatialVisitor; private final QueryRelation queryRelation; - private final String field; private long cost = -1; RelationScorerSupplier( final PointValues values, SpatialVisitor spatialVisitor, - final QueryRelation queryRelation, - final String field) { + final QueryRelation queryRelation) { this.values = values; this.spatialVisitor = spatialVisitor; this.queryRelation = queryRelation; - this.field = field; } protected Scorer getScorer( @@ -311,7 +308,8 @@ && cost() > reader.maxDoc() / 2) { cost[0] == 0 ? 
DocIdSetIterator.empty() : new BitSetIterator(result, cost[0]); return new ConstantScoreScorer(weight, boost, scoreMode, iterator); } else { - final DocIdSetBuilder docIdSetBuilder = new DocIdSetBuilder(reader.maxDoc(), values, field); + final PointsDocIdSetBuilder docIdSetBuilder = + new PointsDocIdSetBuilder(reader.maxDoc(), values); values.intersect(getSparseVisitor(spatialVisitor, queryRelation, docIdSetBuilder)); final DocIdSetIterator iterator = docIdSetBuilder.build().iterator(); return new ConstantScoreScorer(weight, boost, scoreMode, iterator); @@ -405,12 +403,12 @@ public Relation compare(byte[] minTriangle, byte[] maxTriangle) { private static IntersectVisitor getSparseVisitor( final SpatialVisitor spatialVisitor, QueryRelation queryRelation, - final DocIdSetBuilder result) { + final PointsDocIdSetBuilder result) { final BiFunction innerFunction = spatialVisitor.getInnerFunction(queryRelation); final Predicate leafPredicate = spatialVisitor.getLeafPredicate(queryRelation); return new IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; + PointsDocIdSetBuilder.BulkAdder adder; @Override public void grow(int count) { diff --git a/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java b/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java index 1533463a2734..84abe8db0d47 100644 --- a/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java @@ -37,7 +37,7 @@ import org.apache.lucene.search.Scorer; import org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.Weight; -import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.PointsDocIdSetBuilder; /** * Finds all previously indexed points that fall within the specified XY geometries. 
@@ -71,9 +71,9 @@ public void visit(QueryVisitor visitor) { } } - private IntersectVisitor getIntersectVisitor(DocIdSetBuilder result, Component2D tree) { + private IntersectVisitor getIntersectVisitor(PointsDocIdSetBuilder result, Component2D tree) { return new IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; + PointsDocIdSetBuilder.BulkAdder adder; @Override public void grow(int count) { @@ -149,7 +149,7 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti return new ScorerSupplier() { long cost = -1; - DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); + PointsDocIdSetBuilder result = new PointsDocIdSetBuilder(reader.maxDoc(), values); final IntersectVisitor visitor = getIntersectVisitor(result, tree); @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java index a49072794dda..a27bced38e28 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java @@ -36,7 +36,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefIterator; -import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.PointsDocIdSetBuilder; import org.apache.lucene.util.RamUsageEstimator; /** @@ -173,7 +173,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { + bytesPerDim); } - DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); + PointsDocIdSetBuilder result = new PointsDocIdSetBuilder(reader.maxDoc(), values); if (numDims == 1) { @@ -211,14 +211,14 @@ public boolean isCacheable(LeafReaderContext ctx) { */ private class MergePointVisitor implements IntersectVisitor { - private final DocIdSetBuilder result; + private final PointsDocIdSetBuilder result; private TermIterator iterator; private BytesRef nextQueryPoint; private final BytesRef scratch = new BytesRef(); private final PrefixCodedTerms sortedPackedPoints; - private DocIdSetBuilder.BulkAdder adder; + private PointsDocIdSetBuilder.BulkAdder adder; - public MergePointVisitor(PrefixCodedTerms sortedPackedPoints, DocIdSetBuilder result) + public MergePointVisitor(PrefixCodedTerms sortedPackedPoints, PointsDocIdSetBuilder result) throws IOException { this.result = result; this.sortedPackedPoints = sortedPackedPoints; @@ -315,11 +315,11 @@ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { private class SinglePointVisitor implements IntersectVisitor { private final ByteArrayComparator comparator; - private final DocIdSetBuilder result; + private final PointsDocIdSetBuilder result; private final byte[] pointBytes; - private DocIdSetBuilder.BulkAdder adder; + private PointsDocIdSetBuilder.BulkAdder adder; - public SinglePointVisitor(DocIdSetBuilder result) { + public SinglePointVisitor(PointsDocIdSetBuilder result) { this.comparator = ArrayUtil.getUnsignedComparator(bytesPerDim); this.result = result; this.pointBytes = new byte[bytesPerDim * numDims]; diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java index 567104a50e39..dc5db35cafe2 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java @@ -30,8 +30,8 @@ import org.apache.lucene.util.ArrayUtil; import 
org.apache.lucene.util.ArrayUtil.ByteArrayComparator; import org.apache.lucene.util.BitSetIterator; -import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.PointsDocIdSetBuilder; /** * Abstract class for range queries against single or multidimensional points such as {@link @@ -165,10 +165,10 @@ private Relation relate(byte[] minPackedValue, byte[] maxPackedValue) { } } - private IntersectVisitor getIntersectVisitor(DocIdSetBuilder result) { + private IntersectVisitor getIntersectVisitor(PointsDocIdSetBuilder result) { return new IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; + PointsDocIdSetBuilder.BulkAdder adder; @Override public void grow(int count) { @@ -331,7 +331,7 @@ public long cost() { } else { return new ScorerSupplier() { - final DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); + final PointsDocIdSetBuilder result = new PointsDocIdSetBuilder(reader.maxDoc(), values); final IntersectVisitor visitor = getIntersectVisitor(result); long cost = -1; diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java index a44583fcebac..3d93a9863d65 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java @@ -30,7 +30,7 @@ import org.apache.lucene.search.Scorer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil.ByteArrayComparator; -import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.PointsDocIdSetBuilder; /** * Abstract numeric comparator for comparing numeric values. This comparator provides a skipping @@ -204,10 +204,10 @@ private void updateCompetitiveIterator() throws IOException { } } - DocIdSetBuilder result = new DocIdSetBuilder(maxDoc); + PointsDocIdSetBuilder result = new PointsDocIdSetBuilder(maxDoc, pointValues); PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; + PointsDocIdSetBuilder.BulkAdder adder; @Override public void grow(int count) { diff --git a/lucene/core/src/java/org/apache/lucene/util/Buffers.java b/lucene/core/src/java/org/apache/lucene/util/Buffers.java new file mode 100644 index 000000000000..efac0a3f44a4 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/Buffers.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.util; + +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.packed.PackedInts; + +/** Sparse structure to collect docIds. The structure will grow until the provided threshold. */ +class Buffers { + + private static class Buffer { + int[] array; + int length; + + Buffer(int length) { + this.array = new int[length]; + this.length = 0; + } + + Buffer(int[] array, int length) { + this.array = array; + this.length = length; + } + } + + private final List buffers; + private final int threshold; + private final int maxDoc; + private int totalAllocated; // accumulated size of the allocated buffers + private Buffer current; + // pkg-private for testing + final boolean multiValued; + + Buffers(int maxDoc, boolean multiValued) { + this.maxDoc = maxDoc; + this.multiValued = multiValued; + this.buffers = new ArrayList<>(); + // For ridiculously small sets, we'll just use a sorted int[] + // maxDoc >>> 7 is a good value if you want to save memory, lower values + // such as maxDoc >>> 11 should provide faster building but at the expense + // of using a full bitset even for quite sparse data + this.threshold = maxDoc >>> 7; + } + + /** add doc to the buffer structure */ + public void addDoc(int doc) { + current.array[current.length++] = doc; + } + + /** return true if the buffer can allocate numDocs, otherwise false */ + public boolean ensureBufferCapacity(int numDocs) { + if ((long) totalAllocated + numDocs > threshold) { + return false; + } + if (buffers.isEmpty()) { + addBuffer(additionalCapacity(numDocs)); + } else if (current.array.length - current.length >= numDocs) { + // current buffer is large enough + } else if (current.length < current.array.length - (current.array.length >>> 3)) { + // current buffer is less than 7/8 full, resize rather than waste space + growBuffer(current, additionalCapacity(numDocs)); + } else { + addBuffer(additionalCapacity(numDocs)); + } + return true; + } + + /** + * Fill the provided BitSet with the current dics. Returns the total number of docs this structure + * is holding. + */ + public long toBitSet(BitSet bitSet) { + long counter = 0; + for (Buffer buffer : buffers) { + int[] array = buffer.array; + int length = buffer.length; + counter += length; + for (int i = 0; i < length; ++i) { + bitSet.set(array[i]); + } + } + return counter; + } + + /** Build a {@link DocIdSet} with the documents in the data structure. 
*/ + public DocIdSet toDocIdSet() { + Buffer concatenated = concat(buffers); + LSBRadixSorter sorter = new LSBRadixSorter(); + sorter.sort(PackedInts.bitsRequired(maxDoc - 1), concatenated.array, concatenated.length); + final int l; + if (multiValued) { + l = dedup(concatenated.array, concatenated.length); + } else { + assert noDups(concatenated.array, concatenated.length); + l = concatenated.length; + } + assert l <= concatenated.length; + concatenated.array[l] = DocIdSetIterator.NO_MORE_DOCS; + return new IntArrayDocIdSet(concatenated.array, l); + } + + /** + * Concatenate the buffers in any order, leaving at least one empty slot in the end NOTE: this + * method might reuse one of the arrays + */ + private static Buffer concat(List buffers) { + int totalLength = 0; + Buffer largestBuffer = null; + for (Buffer buffer : buffers) { + totalLength += buffer.length; + if (largestBuffer == null || buffer.array.length > largestBuffer.array.length) { + largestBuffer = buffer; + } + } + if (largestBuffer == null) { + return new Buffer(1); + } + int[] docs = largestBuffer.array; + if (docs.length < totalLength + 1) { + docs = ArrayUtil.growExact(docs, totalLength + 1); + } + totalLength = largestBuffer.length; + for (Buffer buffer : buffers) { + if (buffer != largestBuffer) { + System.arraycopy(buffer.array, 0, docs, totalLength, buffer.length); + totalLength += buffer.length; + } + } + return new Buffer(docs, totalLength); + } + + private static boolean noDups(int[] a, int len) { + for (int i = 1; i < len; ++i) { + assert a[i - 1] < a[i]; + } + return true; + } + + private Buffer addBuffer(int len) { + Buffer buffer = new Buffer(len); + buffers.add(buffer); + this.current = buffer; + totalAllocated += buffer.length; + return buffer; + } + + private void growBuffer(Buffer buffer, int additionalCapacity) { + buffer.array = ArrayUtil.growExact(buffer.array, buffer.length + additionalCapacity); + totalAllocated += additionalCapacity; + } + + private int additionalCapacity(int numDocs) { + // exponential growth: the new array has a size equal to the sum of what + // has been allocated so far + int c = totalAllocated; + // but is also >= numDocs + 1 so that we can store the next batch of docs + // (plus an empty slot so that we are more likely to reuse the array in build()) + c = Math.max(numDocs + 1, c); + // avoid cold starts + c = Math.max(32, c); + // do not go beyond the threshold + c = Math.min(threshold - totalAllocated, c); + return c; + } + + private static int dedup(int[] arr, int length) { + if (length == 0) { + return 0; + } + int l = 1; + int previous = arr[0]; + for (int i = 1; i < length; ++i) { + final int value = arr[i]; + assert value >= previous; + if (value != previous) { + arr[l++] = value; + previous = value; + } + } + return l; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java index 67b3dde9f200..6757bca7e247 100644 --- a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java @@ -17,100 +17,32 @@ package org.apache.lucene.util; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +import java.util.function.IntConsumer; import org.apache.lucene.index.PointValues; import org.apache.lucene.index.Terms; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.packed.PackedInts; /** - * A builder of {@link 
DocIdSet}s. At first it uses a sparse structure to gather documents, and then - * upgrades to a non-sparse bit set once enough hits match. + * A builder of {@link DocIdSet}s for Terms. At first it uses a sparse structure to gather + * documents, and then upgrades to a non-sparse bit set once enough hits match. + * - *

To add documents, you first need to call {@link #grow} in order to reserve space, and then - * call {@link BulkAdder#add(int)} on the returned {@link BulkAdder}. + *

Documents are added in bulk via {@link #add(DocIdSetIterator)} or via {@link #add(int)} for + * individual documents. + * + *

See {@link PointsDocIdSetBuilder} if you are not working with {@link PointValues} * * @lucene.internal */ public final class DocIdSetBuilder { - /** - * Utility class to efficiently add many docs in one go. - * - * @see DocIdSetBuilder#grow - */ - public abstract static class BulkAdder { - public abstract void add(int doc); - - public void add(DocIdSetIterator iterator) throws IOException { - int docID; - while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - add(docID); - } - } - } - - private static class FixedBitSetAdder extends BulkAdder { - final FixedBitSet bitSet; - - FixedBitSetAdder(FixedBitSet bitSet) { - this.bitSet = bitSet; - } - - @Override - public void add(int doc) { - bitSet.set(doc); - } - - @Override - public void add(DocIdSetIterator iterator) throws IOException { - bitSet.or(iterator); - } - } - - private static class Buffer { - int[] array; - int length; - - Buffer(int length) { - this.array = new int[length]; - this.length = 0; - } - - Buffer(int[] array, int length) { - this.array = array; - this.length = length; - } - } - - private static class BufferAdder extends BulkAdder { - final Buffer buffer; - - BufferAdder(Buffer buffer) { - this.buffer = buffer; - } - - @Override - public void add(int doc) { - buffer.array[buffer.length++] = doc; - } - } - private final int maxDoc; - private final int threshold; // pkg-private for testing - final boolean multivalued; final double numValuesPerDoc; - - private List buffers = new ArrayList<>(); - private int totalAllocated; // accumulated size of the allocated buffers - + Buffers buffers; private FixedBitSet bitSet; - + private IntConsumer adder; private long counter = -1; - private BulkAdder adder; /** Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}. */ public DocIdSetBuilder(int maxDoc) { @@ -125,17 +57,9 @@ public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException { this(maxDoc, terms.getDocCount(), terms.getSumDocFreq()); } - /** - * Create a {@link DocIdSetBuilder} instance that is optimized for accumulating docs that match - * the given {@link PointValues}. 
- */ - public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOException { - this(maxDoc, values.getDocCount(), values.size()); - } - - DocIdSetBuilder(int maxDoc, int docCount, long valueCount) { + private DocIdSetBuilder(int maxDoc, int docCount, long valueCount) { this.maxDoc = maxDoc; - this.multivalued = docCount < 0 || docCount != valueCount; + final boolean multivalued = docCount < 0 || docCount != valueCount; if (docCount <= 0 || valueCount < 0) { // assume one value per doc, this means the cost will be overestimated // if the docs are actually multi-valued @@ -144,16 +68,10 @@ public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOEx // otherwise compute from index stats this.numValuesPerDoc = (double) valueCount / docCount; } - - assert numValuesPerDoc >= 1 : "valueCount=" + valueCount + " docCount=" + docCount; - - // For ridiculously small sets, we'll just use a sorted int[] - // maxDoc >>> 7 is a good value if you want to save memory, lower values - // such as maxDoc >>> 11 should provide faster building but at the expense - // of using a full bitset even for quite sparse data - this.threshold = maxDoc >>> 7; - + assert numValuesPerDoc >= 1 : "valueCount=" + docCount + " docCount=" + valueCount; + this.buffers = new Buffers(maxDoc, multivalued); this.bitSet = null; + this.adder = doc -> buffers.addDoc(doc); } /** @@ -167,99 +85,44 @@ public void add(DocIdSetIterator iter) throws IOException { return; } int cost = (int) Math.min(Integer.MAX_VALUE, iter.cost()); - BulkAdder adder = grow(cost); + grow(cost); for (int i = 0; i < cost; ++i) { int doc = iter.nextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { return; } - adder.add(doc); + adder.accept(doc); } for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { - grow(1).add(doc); + add(doc); } } - /** - * Reserve space and return a {@link BulkAdder} object that can be used to add up to {@code - * numDocs} documents. - */ - public BulkAdder grow(int numDocs) { + /** Add a single document to this builder. */ + public void add(int doc) { + grow(1); + adder.accept(doc); + } + + /** Reserve space and up to {@code numDocs} documents. 
*/ + private void grow(int numDocs) { if (bitSet == null) { - if ((long) totalAllocated + numDocs <= threshold) { - ensureBufferCapacity(numDocs); - } else { + if (buffers.ensureBufferCapacity(numDocs) == false) { upgradeToBitSet(); counter += numDocs; } } else { counter += numDocs; } - return adder; - } - - private void ensureBufferCapacity(int numDocs) { - if (buffers.isEmpty()) { - addBuffer(additionalCapacity(numDocs)); - return; - } - - Buffer current = buffers.get(buffers.size() - 1); - if (current.array.length - current.length >= numDocs) { - // current buffer is large enough - return; - } - if (current.length < current.array.length - (current.array.length >>> 3)) { - // current buffer is less than 7/8 full, resize rather than waste space - growBuffer(current, additionalCapacity(numDocs)); - } else { - addBuffer(additionalCapacity(numDocs)); - } - } - - private int additionalCapacity(int numDocs) { - // exponential growth: the new array has a size equal to the sum of what - // has been allocated so far - int c = totalAllocated; - // but is also >= numDocs + 1 so that we can store the next batch of docs - // (plus an empty slot so that we are more likely to reuse the array in build()) - c = Math.max(numDocs + 1, c); - // avoid cold starts - c = Math.max(32, c); - // do not go beyond the threshold - c = Math.min(threshold - totalAllocated, c); - return c; - } - - private Buffer addBuffer(int len) { - Buffer buffer = new Buffer(len); - buffers.add(buffer); - adder = new BufferAdder(buffer); - totalAllocated += buffer.array.length; - return buffer; - } - - private void growBuffer(Buffer buffer, int additionalCapacity) { - buffer.array = ArrayUtil.growExact(buffer.array, buffer.array.length + additionalCapacity); - totalAllocated += additionalCapacity; } private void upgradeToBitSet() { assert bitSet == null; FixedBitSet bitSet = new FixedBitSet(maxDoc); - long counter = 0; - for (Buffer buffer : buffers) { - int[] array = buffer.array; - int length = buffer.length; - counter += length; - for (int i = 0; i < length; ++i) { - bitSet.set(array[i]); - } - } + this.counter = buffers.toBitSet(bitSet); this.bitSet = bitSet; - this.counter = counter; + adder = doc -> bitSet.set(doc); this.buffers = null; - this.adder = new FixedBitSetAdder(bitSet); } /** Build a {@link DocIdSet} from the accumulated doc IDs. 
*/ @@ -270,77 +133,11 @@ public DocIdSet build() { final long cost = Math.round(counter / numValuesPerDoc); return new BitDocIdSet(bitSet, cost); } else { - Buffer concatenated = concat(buffers); - LSBRadixSorter sorter = new LSBRadixSorter(); - sorter.sort(PackedInts.bitsRequired(maxDoc - 1), concatenated.array, concatenated.length); - final int l; - if (multivalued) { - l = dedup(concatenated.array, concatenated.length); - } else { - assert noDups(concatenated.array, concatenated.length); - l = concatenated.length; - } - assert l <= concatenated.length; - concatenated.array[l] = DocIdSetIterator.NO_MORE_DOCS; - return new IntArrayDocIdSet(concatenated.array, l); + return buffers.toDocIdSet(); } } finally { this.buffers = null; this.bitSet = null; } } - - /** - * Concatenate the buffers in any order, leaving at least one empty slot in the end NOTE: this - * method might reuse one of the arrays - */ - private static Buffer concat(List buffers) { - int totalLength = 0; - Buffer largestBuffer = null; - for (Buffer buffer : buffers) { - totalLength += buffer.length; - if (largestBuffer == null || buffer.array.length > largestBuffer.array.length) { - largestBuffer = buffer; - } - } - if (largestBuffer == null) { - return new Buffer(1); - } - int[] docs = largestBuffer.array; - if (docs.length < totalLength + 1) { - docs = ArrayUtil.growExact(docs, totalLength + 1); - } - totalLength = largestBuffer.length; - for (Buffer buffer : buffers) { - if (buffer != largestBuffer) { - System.arraycopy(buffer.array, 0, docs, totalLength, buffer.length); - totalLength += buffer.length; - } - } - return new Buffer(docs, totalLength); - } - - private static int dedup(int[] arr, int length) { - if (length == 0) { - return 0; - } - int l = 1; - int previous = arr[0]; - for (int i = 1; i < length; ++i) { - final int value = arr[i]; - assert value >= previous; - if (value != previous) { - arr[l++] = value; - previous = value; - } - } - return l; - } - - private static boolean noDups(int[] a, int len) { - for (int i = 1; i < len; ++i) { - assert a[i - 1] < a[i]; - } - return true; - } } diff --git a/lucene/core/src/java/org/apache/lucene/util/PointsDocIdSetBuilder.java b/lucene/core/src/java/org/apache/lucene/util/PointsDocIdSetBuilder.java new file mode 100644 index 000000000000..e267b6d95c45 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/PointsDocIdSetBuilder.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util; + +import java.io.IOException; +import org.apache.lucene.index.PointValues; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; + +/** + * A builder of {@link DocIdSet}s for {@link PointValues}. 
At first it uses a sparse structure to + * gather documents, and then upgrades to a non-sparse bit set once enough hits match. + * + *

To add documents, you first need to call {@link #grow} with the number of points that will be + * visited in order to reserve space, and then call {@link BulkAdder#add(int)} on the returned + * {@link BulkAdder}. + * + *

See {@link DocIdSetBuilder} if you are not working with {@link PointValues} + * + * @lucene.internal + */ +public final class PointsDocIdSetBuilder { + + /** + * Utility class to efficiently add many docs in one go. + * + * @see PointsDocIdSetBuilder#grow + */ + public abstract static class BulkAdder { + public abstract void add(int doc); + + public void add(DocIdSetIterator iterator) throws IOException { + int docID; + while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + add(docID); + } + } + } + + private static class FixedBitSetAdder extends BulkAdder { + final FixedBitSet bitSet; + + FixedBitSetAdder(FixedBitSet bitSet) { + this.bitSet = bitSet; + } + + @Override + public void add(int doc) { + bitSet.set(doc); + } + + @Override + public void add(DocIdSetIterator iterator) throws IOException { + bitSet.or(iterator); + } + } + + private static class BufferAdder extends BulkAdder { + final Buffers buffer; + + BufferAdder(Buffers buffer) { + this.buffer = buffer; + } + + @Override + public void add(int doc) { + buffer.addDoc(doc); + } + } + + private final int maxDoc; + // pkg-private for testing + final double numValuesPerDoc; + Buffers buffers; + private FixedBitSet bitSet; + private long counter = -1; + private BulkAdder adder; + + /** + * Create a {@link PointsDocIdSetBuilder} instance that is optimized for accumulating docs that + * match the given {@link PointValues}. + */ + public PointsDocIdSetBuilder(int maxDoc, PointValues values) { + this.maxDoc = maxDoc; + final long valueCount = values.size(); + final int docCount = values.getDocCount(); + final boolean multivalued = docCount != valueCount; + if (docCount <= 0 || valueCount < 0) { + // assume one value per doc, this means the cost will be overestimated + // if the docs are actually multi-valued + this.numValuesPerDoc = 1; + } else { + // otherwise compute from index stats + this.numValuesPerDoc = (double) valueCount / docCount; + } + assert numValuesPerDoc >= 1 : "valueCount=" + valueCount + " docCount=" + docCount; + this.buffers = new Buffers(maxDoc, multivalued); + this.adder = new BufferAdder(buffers); + this.bitSet = null; + } + + /** + * Reserve space and return a {@link BulkAdder} object that can be used to visit up to {@code + * numPoints} points. + */ + public BulkAdder grow(long numPoints) { + if (bitSet == null) { + final int numDocs = (int) Math.min(Integer.MAX_VALUE, numPoints); + if (buffers.ensureBufferCapacity(numDocs) == false) { + upgradeToBitSet(); + counter += numPoints; + } + } else { + counter += numPoints; + } + return adder; + } + + private void upgradeToBitSet() { + assert bitSet == null; + FixedBitSet bitSet = new FixedBitSet(maxDoc); + this.counter = buffers.toBitSet(bitSet); + this.bitSet = bitSet; + this.buffers = null; + this.adder = new FixedBitSetAdder(bitSet); + } + + /** Build a {@link DocIdSet} from the accumulated doc IDs. 
*/ + public DocIdSet build() { + try { + if (bitSet != null) { + assert counter >= 0; + final long cost = Math.round(counter / numValuesPerDoc); + return new BitDocIdSet(bitSet, cost); + } else { + return buffers.toDocIdSet(); + } + } finally { + this.buffers = null; + this.bitSet = null; + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/TestReqExclBulkScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestReqExclBulkScorer.java index 4efda423c2ec..13141d6d2adb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestReqExclBulkScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestReqExclBulkScorer.java @@ -38,13 +38,11 @@ public void doTestRandom() throws IOException { DocIdSetBuilder exclBuilder = new DocIdSetBuilder(maxDoc); final int numIncludedDocs = TestUtil.nextInt(random(), 1, maxDoc); final int numExcludedDocs = TestUtil.nextInt(random(), 1, maxDoc); - DocIdSetBuilder.BulkAdder reqAdder = reqBuilder.grow(numIncludedDocs); for (int i = 0; i < numIncludedDocs; ++i) { - reqAdder.add(random().nextInt(maxDoc)); + reqBuilder.add(random().nextInt(maxDoc)); } - DocIdSetBuilder.BulkAdder exclAdder = exclBuilder.grow(numExcludedDocs); for (int i = 0; i < numExcludedDocs; ++i) { - exclAdder.add(random().nextInt(maxDoc)); + exclBuilder.add(random().nextInt(maxDoc)); } final DocIdSet req = reqBuilder.build(); diff --git a/lucene/core/src/test/org/apache/lucene/util/TestBuffers.java b/lucene/core/src/test/org/apache/lucene/util/TestBuffers.java new file mode 100644 index 000000000000..80d5abad6809 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/TestBuffers.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.util; + +import java.io.IOException; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestBuffers extends LuceneTestCase { + + public void testSingleValue() throws IOException { + int maxDoc = random().nextInt(10000) + 10000; + int currentDoc = 0; + Buffers buffers = new Buffers(maxDoc, false); + while (true) { + int batchsize = 1 + random().nextInt(10); + if (buffers.ensureBufferCapacity(batchsize) == false) { + break; + } + for (int i = 0; i < batchsize; i++) { + buffers.addDoc(currentDoc++); + } + } + FixedBitSet fixedBitSet = new FixedBitSet(maxDoc); + long counter = buffers.toBitSet(fixedBitSet); + DocIdSetIterator docIdSetIterator = buffers.toDocIdSet().iterator(); + assertEquals(counter, fixedBitSet.cardinality()); + assertEquals(currentDoc, counter); + for (int i = 0; i < currentDoc; i++) { + int doc = docIdSetIterator.nextDoc(); + assertEquals(i, doc); + assertTrue(fixedBitSet.get(doc)); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, docIdSetIterator.nextDoc()); + } + + public void testMultiValue() throws IOException { + int maxDoc = random().nextInt(10000) + 10000; + int currentDoc = 0; + int totalDocs = 0; + Buffers buffers = new Buffers(maxDoc, true); + while (true) { + int batchsize = 1 + random().nextInt(10); + if (buffers.ensureBufferCapacity(batchsize) == false) { + break; + } + for (int i = 0; i < batchsize; i++) { + buffers.addDoc(currentDoc); + totalDocs++; + } + currentDoc++; + } + FixedBitSet fixedBitSet = new FixedBitSet(maxDoc); + long counter = buffers.toBitSet(fixedBitSet); + assertEquals(totalDocs, counter); + DocIdSetIterator docIdSetIterator = buffers.toDocIdSet().iterator(); + for (int i = 0; i < currentDoc; i++) { + int doc = docIdSetIterator.nextDoc(); + assertEquals(i, doc); + assertTrue(fixedBitSet.get(doc)); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, docIdSetIterator.nextDoc()); + } + + public void testRandomValues() throws IOException { + int maxDoc = random().nextInt(10000) + 10000; + int totalDocs = 0; + Buffers buffers = new Buffers(maxDoc, true); + while (true) { + int batchsize = 1 + random().nextInt(10); + if (buffers.ensureBufferCapacity(batchsize) == false) { + break; + } + for (int i = 0; i < batchsize; i++) { + int doc = random().nextInt(maxDoc); + buffers.addDoc(doc); + maxDoc = Math.max(maxDoc, doc); + totalDocs++; + } + } + FixedBitSet fixedBitSet = new FixedBitSet(maxDoc + 1); + long counter = buffers.toBitSet(fixedBitSet); + DocIdSetIterator docIdSetIterator = buffers.toDocIdSet().iterator(); + assertEquals(totalDocs, counter); + int iteratedDocs = 0; + int doc; + while ((doc = docIdSetIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + assertTrue(fixedBitSet.get(doc)); + iteratedDocs++; + } + assertEquals(iteratedDocs, fixedBitSet.cardinality()); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java index 2fa146581c68..347faafe322a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java @@ -17,7 +17,7 @@ package org.apache.lucene.util; import java.io.IOException; -import org.apache.lucene.index.PointValues; +import java.util.Arrays; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSet; @@ -125,18 +125,22 @@ public void testRandom() throws IOException { 
array[k] = tmp; } - // add docs out of order DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc); for (j = 0; j < array.length; ) { final int l = TestUtil.nextInt(random(), 1, array.length - j); - DocIdSetBuilder.BulkAdder adder = null; - for (int k = 0, budget = 0; k < l; ++k) { - if (budget == 0 || rarely()) { - budget = TestUtil.nextInt(random(), 1, l - k + 5); - adder = builder.grow(budget); + if (random().nextBoolean()) { + // add docs one by one + for (int k = 0; k < l; ++k) { + builder.add(array[j++]); } - adder.add(array[j++]); - budget--; + } else { + // add docs as an iterator + int[] set = new int[l + 1]; + System.arraycopy(array, j, set, 0, l); + set[l] = DocIdSetIterator.NO_MORE_DOCS; + Arrays.sort(set); + builder.add(new IntArrayDocIdSet(set, l).iterator()); + j += l; } } @@ -148,7 +152,7 @@ public void testRandom() throws IOException { public void testMisleadingDISICost() throws IOException { final int maxDoc = TestUtil.nextInt(random(), 1000, 10000); - DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc); + DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc, new DummyTerms(0, 0)); FixedBitSet expected = new FixedBitSet(maxDoc); for (int i = 0; i < 10; ++i) { @@ -165,57 +169,23 @@ public void testMisleadingDISICost() throws IOException { assertEquals(new BitDocIdSet(expected), builder.build()); } - public void testEmptyPoints() throws IOException { - PointValues values = new DummyPointValues(0, 0); - DocIdSetBuilder builder = new DocIdSetBuilder(1, values, "foo"); + public void testEmptyTerms() throws IOException { + DummyTerms terms = new DummyTerms(0, 0); + DocIdSetBuilder builder = new DocIdSetBuilder(1, terms); assertEquals(1d, builder.numValuesPerDoc, 0d); } public void testLeverageStats() throws IOException { - // single-valued points - PointValues values = new DummyPointValues(42, 42); - DocIdSetBuilder builder = new DocIdSetBuilder(100, values, "foo"); - assertEquals(1d, builder.numValuesPerDoc, 0d); - assertFalse(builder.multivalued); - DocIdSetBuilder.BulkAdder adder = builder.grow(2); - adder.add(5); - adder.add(7); - DocIdSet set = builder.build(); - assertTrue(set instanceof BitDocIdSet); - assertEquals(2, set.iterator().cost()); - - // multi-valued points - values = new DummyPointValues(42, 63); - builder = new DocIdSetBuilder(100, values, "foo"); - assertEquals(1.5, builder.numValuesPerDoc, 0d); - assertTrue(builder.multivalued); - adder = builder.grow(2); - adder.add(5); - adder.add(7); - set = builder.build(); - assertTrue(set instanceof BitDocIdSet); - assertEquals(1, set.iterator().cost()); // it thinks the same doc was added twice - - // incomplete stats - values = new DummyPointValues(42, -1); - builder = new DocIdSetBuilder(100, values, "foo"); - assertEquals(1d, builder.numValuesPerDoc, 0d); - assertTrue(builder.multivalued); - - values = new DummyPointValues(-1, 84); - builder = new DocIdSetBuilder(100, values, "foo"); - assertEquals(1d, builder.numValuesPerDoc, 0d); - assertTrue(builder.multivalued); - // single-valued terms Terms terms = new DummyTerms(42, 42); - builder = new DocIdSetBuilder(100, terms); + DocIdSetBuilder builder = new DocIdSetBuilder(100, terms); assertEquals(1d, builder.numValuesPerDoc, 0d); - assertFalse(builder.multivalued); - adder = builder.grow(2); - adder.add(5); - adder.add(7); - set = builder.build(); + assertFalse(builder.buffers.multiValued); + FixedBitSet bitSet = new FixedBitSet(8); + bitSet.set(5); + bitSet.set(7); + builder.add(new BitSetIterator(bitSet, 2)); + DocIdSet set = builder.build(); assertTrue(set 
instanceof BitDocIdSet); assertEquals(2, set.iterator().cost()); @@ -223,10 +193,8 @@ public void testLeverageStats() throws IOException { terms = new DummyTerms(42, 63); builder = new DocIdSetBuilder(100, terms); assertEquals(1.5, builder.numValuesPerDoc, 0d); - assertTrue(builder.multivalued); - adder = builder.grow(2); - adder.add(5); - adder.add(7); + assertTrue(builder.buffers.multiValued); + builder.add(new BitSetIterator(bitSet, 2)); set = builder.build(); assertTrue(set instanceof BitDocIdSet); assertEquals(1, set.iterator().cost()); // it thinks the same doc was added twice @@ -235,12 +203,12 @@ public void testLeverageStats() throws IOException { terms = new DummyTerms(42, -1); builder = new DocIdSetBuilder(100, terms); assertEquals(1d, builder.numValuesPerDoc, 0d); - assertTrue(builder.multivalued); + assertTrue(builder.buffers.multiValued); terms = new DummyTerms(-1, 84); builder = new DocIdSetBuilder(100, terms); assertEquals(1d, builder.numValuesPerDoc, 0d); - assertTrue(builder.multivalued); + assertTrue(builder.buffers.multiValued); } private static class DummyTerms extends Terms { @@ -298,55 +266,4 @@ public boolean hasPayloads() { throw new UnsupportedOperationException(); } } - - private static class DummyPointValues extends PointValues { - - private final int docCount; - private final long numPoints; - - DummyPointValues(int docCount, long numPoints) { - this.docCount = docCount; - this.numPoints = numPoints; - } - - @Override - public PointTree getPointTree() { - throw new UnsupportedOperationException(); - } - - @Override - public byte[] getMinPackedValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public byte[] getMaxPackedValue() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int getNumDimensions() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int getNumIndexDimensions() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int getBytesPerDimension() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long size() { - return numPoints; - } - - @Override - public int getDocCount() { - return docCount; - } - } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestPointsDocIdSetBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestPointsDocIdSetBuilder.java new file mode 100644 index 000000000000..4f7b92ca694d --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/TestPointsDocIdSetBuilder.java @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.util; + +import java.io.IOException; +import org.apache.lucene.index.PointValues; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; + +public class TestPointsDocIdSetBuilder extends LuceneTestCase { + + public void testEmpty() throws IOException { + assertEquals( + null, + new PointsDocIdSetBuilder(1 + random().nextInt(1000), new DummyPointValues(0, 0)).build()); + } + + private void assertEquals(DocIdSet d1, DocIdSet d2) throws IOException { + if (d1 == null) { + if (d2 != null) { + assertEquals(DocIdSetIterator.NO_MORE_DOCS, d2.iterator().nextDoc()); + } + } else if (d2 == null) { + assertEquals(DocIdSetIterator.NO_MORE_DOCS, d1.iterator().nextDoc()); + } else { + DocIdSetIterator i1 = d1.iterator(); + DocIdSetIterator i2 = d2.iterator(); + for (int doc = i1.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = i1.nextDoc()) { + assertEquals(doc, i2.nextDoc()); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, i2.nextDoc()); + } + } + + public void testSparse() throws IOException { + final int maxDoc = 1000000 + random().nextInt(1000000); + PointsDocIdSetBuilder builder = new PointsDocIdSetBuilder(maxDoc, new DummyPointValues(0, 0)); + final int numIterators = 1 + random().nextInt(10); + final FixedBitSet ref = new FixedBitSet(maxDoc); + for (int i = 0; i < numIterators; ++i) { + final int baseInc = 200000 + random().nextInt(10000); + RoaringDocIdSet.Builder b = new RoaringDocIdSet.Builder(maxDoc); + for (int doc = random().nextInt(100); + doc < maxDoc; + doc += baseInc + random().nextInt(10000)) { + b.add(doc); + ref.set(doc); + } + DocIdSetIterator iterator = b.build().iterator(); + builder.grow(iterator.cost()).add(iterator); + } + DocIdSet result = builder.build(); + assertTrue(result instanceof IntArrayDocIdSet); + assertEquals(new BitDocIdSet(ref), result); + } + + public void testDense() throws IOException { + final int maxDoc = 1000000 + random().nextInt(1000000); + PointsDocIdSetBuilder builder = new PointsDocIdSetBuilder(maxDoc, new DummyPointValues(0, 0)); + final int numIterators = 1 + random().nextInt(10); + final FixedBitSet ref = new FixedBitSet(maxDoc); + for (int i = 0; i < numIterators; ++i) { + RoaringDocIdSet.Builder b = new RoaringDocIdSet.Builder(maxDoc); + for (int doc = random().nextInt(1000); doc < maxDoc; doc += 1 + random().nextInt(100)) { + b.add(doc); + ref.set(doc); + } + DocIdSetIterator iterator = b.build().iterator(); + builder.grow(iterator.cost()).add(iterator); + } + DocIdSet result = builder.build(); + assertTrue(result instanceof BitDocIdSet); + assertEquals(new BitDocIdSet(ref), result); + } + + public void testRandom() throws IOException { + final int maxDoc = + TEST_NIGHTLY + ? 
TestUtil.nextInt(random(), 1, 10000000) + : TestUtil.nextInt(random(), 1, 100000); + for (int i = 1; i < maxDoc / 2; i <<= 1) { + final int numDocs = TestUtil.nextInt(random(), 1, i); + final FixedBitSet docs = new FixedBitSet(maxDoc); + int c = 0; + while (c < numDocs) { + final int d = random().nextInt(maxDoc); + if (docs.get(d) == false) { + docs.set(d); + c += 1; + } + } + + final int[] array = new int[numDocs + random().nextInt(100)]; + DocIdSetIterator it = new BitSetIterator(docs, 0L); + int j = 0; + for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { + array[j++] = doc; + } + assertEquals(numDocs, j); + + // add some duplicates + while (j < array.length) { + array[j++] = array[random().nextInt(numDocs)]; + } + + // shuffle + for (j = array.length - 1; j >= 1; --j) { + final int k = random().nextInt(j); + int tmp = array[j]; + array[j] = array[k]; + array[k] = tmp; + } + + // add docs out of order + PointsDocIdSetBuilder builder = + new PointsDocIdSetBuilder(maxDoc, new DummyPointValues(maxDoc, maxDoc + 1)); + for (j = 0; j < array.length; ) { + final int l = TestUtil.nextInt(random(), 1, array.length - j); + PointsDocIdSetBuilder.BulkAdder adder = null; + for (int k = 0, budget = 0; k < l; ++k) { + if (budget == 0 || rarely()) { + budget = TestUtil.nextInt(random(), 1, l - k + 5); + adder = builder.grow(budget); + } + adder.add(array[j++]); + budget--; + } + } + + final DocIdSet expected = new BitDocIdSet(docs); + final DocIdSet actual = builder.build(); + assertEquals(expected, actual); + } + } + + public void testEmptyPoints() throws IOException { + PointValues values = new DummyPointValues(0, 0); + PointsDocIdSetBuilder builder = new PointsDocIdSetBuilder(1, values); + assertEquals(1d, builder.numValuesPerDoc, 0d); + } + + public void testLeverageStats() throws IOException { + // single-valued points + PointValues values = new DummyPointValues(42, 42); + PointsDocIdSetBuilder builder = new PointsDocIdSetBuilder(100, values); + assertEquals(1d, builder.numValuesPerDoc, 0d); + assertFalse(builder.buffers.multiValued); + PointsDocIdSetBuilder.BulkAdder adder = builder.grow(2); + adder.add(5); + adder.add(7); + DocIdSet set = builder.build(); + assertTrue(set instanceof BitDocIdSet); + assertEquals(2, set.iterator().cost()); + + // multi-valued points + values = new DummyPointValues(42, 63); + builder = new PointsDocIdSetBuilder(100, values); + assertEquals(1.5, builder.numValuesPerDoc, 0d); + assertTrue(builder.buffers.multiValued); + adder = builder.grow(2); + adder.add(5); + adder.add(7); + set = builder.build(); + assertTrue(set instanceof BitDocIdSet); + assertEquals(1, set.iterator().cost()); // it thinks the same doc was added twice + + // incomplete stats + values = new DummyPointValues(42, -1); + builder = new PointsDocIdSetBuilder(100, values); + assertEquals(1d, builder.numValuesPerDoc, 0d); + assertTrue(builder.buffers.multiValued); + + values = new DummyPointValues(-1, 84); + builder = new PointsDocIdSetBuilder(100, values); + assertEquals(1d, builder.numValuesPerDoc, 0d); + assertTrue(builder.buffers.multiValued); + } + + private static class DummyPointValues extends PointValues { + + private final int docCount; + private final long numPoints; + + DummyPointValues(int docCount, long numPoints) { + this.docCount = docCount; + this.numPoints = numPoints; + } + + @Override + public PointTree getPointTree() { + throw new UnsupportedOperationException(); + } + + @Override + public byte[] getMinPackedValue() throws IOException { + throw 
new UnsupportedOperationException(); + } + + @Override + public byte[] getMaxPackedValue() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int getNumDimensions() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int getNumIndexDimensions() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int getBytesPerDimension() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long size() { + return numPoints; + } + + @Override + public int getDocCount() { + return docCount; + } + } +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollector.java b/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollector.java index 93ef7e87fc27..0f5e7ca5cb3e 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollector.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/FacetsCollector.java @@ -115,7 +115,7 @@ public List getMatchingDocs() { @Override public final void collect(int doc) throws IOException { - docsBuilder.grow(1).add(doc); + docsBuilder.add(doc); if (keepScores) { if (totalHits >= scores.length) { float[] newScores = new float[ArrayUtil.oversize(totalHits + 1, 4)]; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiRangeQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiRangeQuery.java index ae07d34d706d..bc5eab04c5dc 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiRangeQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiRangeQuery.java @@ -37,7 +37,7 @@ import org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.Weight; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.PointsDocIdSetBuilder; /** * Abstract class for range queries involving multiple ranges against physical points such as {@code @@ -178,10 +178,10 @@ public final Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, fl return new ConstantScoreWeight(this, boost) { private PointValues.IntersectVisitor getIntersectVisitor( - DocIdSetBuilder result, Relatable range) { + PointsDocIdSetBuilder result, Relatable range) { return new PointValues.IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; + PointsDocIdSetBuilder.BulkAdder adder; @Override public void grow(int count) { @@ -277,7 +277,7 @@ public long cost() { } else { return new ScorerSupplier() { - final DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); + final PointsDocIdSetBuilder result = new PointsDocIdSetBuilder(reader.maxDoc(), values); final PointValues.IntersectVisitor visitor = getIntersectVisitor(result, range); long cost = -1; diff --git a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInGeo3DShapeQuery.java b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInGeo3DShapeQuery.java index 283cb5144105..912fed153ec0 100644 --- a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInGeo3DShapeQuery.java +++ b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInGeo3DShapeQuery.java @@ -31,7 +31,7 @@ import org.apache.lucene.spatial3d.geom.GeoShape; import org.apache.lucene.spatial3d.geom.XYZBounds; import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.PointsDocIdSetBuilder; import org.apache.lucene.util.RamUsageEstimator; /** @@ -103,7 
+103,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { assert xyzSolid.getRelationship(shape) == GeoArea.WITHIN || xyzSolid.getRelationship(shape) == GeoArea.OVERLAPS: "expected WITHIN (1) or OVERLAPS (2) but got " + xyzSolid.getRelationship(shape) + "; shape="+shape+"; XYZSolid="+xyzSolid; */ - DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); + PointsDocIdSetBuilder result = new PointsDocIdSetBuilder(reader.maxDoc(), values); values.intersect(new PointInShapeIntersectVisitor(result, shape, shapeBounds)); diff --git a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java index 8883fef22409..404f87b0a5c6 100644 --- a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java +++ b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java @@ -26,11 +26,11 @@ import org.apache.lucene.spatial3d.geom.GeoShape; import org.apache.lucene.spatial3d.geom.PlanetModel.DocValueEncoder; import org.apache.lucene.spatial3d.geom.XYZBounds; -import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.PointsDocIdSetBuilder; class PointInShapeIntersectVisitor implements IntersectVisitor { - private final DocIdSetBuilder hits; + private final PointsDocIdSetBuilder hits; private final GeoShape shape; private final double minimumX; private final double maximumX; @@ -38,9 +38,10 @@ class PointInShapeIntersectVisitor implements IntersectVisitor { private final double maximumY; private final double minimumZ; private final double maximumZ; - private DocIdSetBuilder.BulkAdder adder; + private PointsDocIdSetBuilder.BulkAdder adder; - public PointInShapeIntersectVisitor(DocIdSetBuilder hits, GeoShape shape, XYZBounds bounds) { + public PointInShapeIntersectVisitor( + PointsDocIdSetBuilder hits, GeoShape shape, XYZBounds bounds) { this.hits = hits; this.shape = shape; DocValueEncoder docValueEncoder = shape.getPlanetModel().getDocValueEncoder(); diff --git a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java index 923181ef3b27..0d33411a14ea 100644 --- a/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java +++ b/lucene/spatial3d/src/test/org/apache/lucene/spatial3d/TestGeo3DPoint.java @@ -47,6 +47,7 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiDocValues; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PointValues; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; import org.apache.lucene.index.ReaderUtil; @@ -77,10 +78,10 @@ import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; -import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.PointsDocIdSetBuilder; public class TestGeo3DPoint extends LuceneTestCase { @@ -1954,11 +1955,12 @@ public static String explain( // First find the leaf reader that owns this doc: int subIndex = ReaderUtil.subIndex(docID, reader.leaves()); LeafReader leafReader = 
reader.leaves().get(subIndex).reader(); + PointValues pointValues = leafReader.getPointValues(fieldName); StringBuilder b = new StringBuilder(); b.append("target is in leaf " + leafReader + " of full reader " + reader + "\n"); - DocIdSetBuilder hits = new DocIdSetBuilder(leafReader.maxDoc()); + PointsDocIdSetBuilder hits = new PointsDocIdSetBuilder(leafReader.maxDoc(), pointValues); ExplainingVisitor visitor = new ExplainingVisitor( shape, @@ -1975,7 +1977,7 @@ public static String explain( // Do second phase, where we we see how the wrapped visitor responded along that path: visitor.startSecondPhase(); - leafReader.getPointValues(fieldName).intersect(visitor); + pointValues.intersect(visitor); return b.toString(); }
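
For reference, a small usage sketch (not part of the patch) of how callers interact with the two builders this change leaves behind: DocIdSetBuilder now accepts individual docs or iterators directly (no BulkAdder), while PointsDocIdSetBuilder keeps the grow()/BulkAdder contract for point visitors. The class name BuilderUsageSketch, the reader/field setup, and the always-CELL_CROSSES_QUERY visitor are illustrative assumptions, not code from the diff.

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.PointsDocIdSetBuilder;

class BuilderUsageSketch {

  // Term-oriented collection: docs are added one by one (or via add(DocIdSetIterator)).
  static DocIdSet collectDocs(int maxDoc, int[] docs) {
    DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc);
    for (int doc : docs) {
      builder.add(doc); // replaces the old builder.grow(1).add(doc) pattern
    }
    return builder.build();
  }

  // Point-oriented collection: grow(long) reserves space for the points about to be
  // visited and returns a BulkAdder, mirroring the previous DocIdSetBuilder contract.
  static DocIdSet collectAllPoints(LeafReader reader, String field) throws IOException {
    PointValues values = reader.getPointValues(field); // assumes the field indexes points
    PointsDocIdSetBuilder builder = new PointsDocIdSetBuilder(reader.maxDoc(), values);
    values.intersect(
        new IntersectVisitor() {
          PointsDocIdSetBuilder.BulkAdder adder;

          @Override
          public void grow(int count) {
            adder = builder.grow(count);
          }

          @Override
          public void visit(int docID) {
            adder.add(docID);
          }

          @Override
          public void visit(int docID, byte[] packedValue) {
            adder.add(docID); // a real query would test packedValue before adding
          }

          @Override
          public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
            return Relation.CELL_CROSSES_QUERY; // always visit; real queries relate the cell
          }
        });
    return builder.build();
  }
}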