Skip to content

Commit a0b0121

Browse files
ajleong623Peter Alfonsi
authored andcommitted
Rare terms aggregation precomputation (opensearch-project#18978)
--------- Signed-off-by: Anthony Leong <aj.leong623@gmail.com>
1 parent 2c4d7c4 commit a0b0121

File tree

8 files changed

+475
-9
lines changed

8 files changed

+475
-9
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
3434
- Implement GRPC MatchPhrase, MultiMatch queries ([#19449](https://github.com/opensearch-project/OpenSearch/pull/19449))
3535
- Optimize gRPC transport thread management for improved throughput ([#19278](https://github.com/opensearch-project/OpenSearch/pull/19278))
3636
- Implement GRPC Boolean query and inject registry for all internal query converters ([#19391](https://github.com/opensearch-project/OpenSearch/pull/19391))
37+
- Added precomputation for rare terms aggregation ([##18978](https://github.com/opensearch-project/OpenSearch/pull/18978))
3738
- Implement GRPC Script query ([#19455](https://github.com/opensearch-project/OpenSearch/pull/19455))
3839
- [Search Stats] Add search & star-tree search query failure count metrics ([#19210](https://github.com/opensearch-project/OpenSearch/issues/19210))
3940
- [Star-tree] Support for multi-terms aggregation ([#18398](https://github.com/opensearch-project/OpenSearch/issues/18398))

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ public Aggregator build(
9797
SearchContext context,
9898
Aggregator parent,
9999
CardinalityUpperBound cardinality,
100-
Map<String, Object> metadata
100+
Map<String, Object> metadata,
101+
ValuesSourceConfig config
101102
) throws IOException {
102103

103104
ExecutionMode execution = ExecutionMode.MAP; // TODO global ords not implemented yet, only supports "map"
@@ -123,7 +124,8 @@ public Aggregator build(
123124
metadata,
124125
maxDocCount,
125126
precision,
126-
cardinality
127+
cardinality,
128+
config
127129
);
128130

129131
}
@@ -148,7 +150,8 @@ public Aggregator build(
148150
SearchContext context,
149151
Aggregator parent,
150152
CardinalityUpperBound cardinality,
151-
Map<String, Object> metadata
153+
Map<String, Object> metadata,
154+
ValuesSourceConfig config
152155
) throws IOException {
153156

154157
if ((includeExclude != null) && (includeExclude.isRegexBased())) {
@@ -233,7 +236,8 @@ protected Aggregator doCreateInternal(
233236
searchContext,
234237
parent,
235238
cardinality,
236-
metadata
239+
metadata,
240+
config
237241
);
238242
}
239243

@@ -263,7 +267,8 @@ Aggregator create(
263267
Map<String, Object> metadata,
264268
long maxDocCount,
265269
double precision,
266-
CardinalityUpperBound cardinality
270+
CardinalityUpperBound cardinality,
271+
ValuesSourceConfig config
267272
) throws IOException {
268273
int maxRegexLength = context.getQueryShardContext().getIndexSettings().getMaxRegexLength();
269274
final IncludeExclude.StringFilter filter = includeExclude == null
@@ -280,7 +285,8 @@ Aggregator create(
280285
metadata,
281286
maxDocCount,
282287
precision,
283-
cardinality
288+
cardinality,
289+
config
284290
);
285291
}
286292

@@ -317,7 +323,8 @@ abstract Aggregator create(
317323
Map<String, Object> metadata,
318324
long maxDocCount,
319325
double precision,
320-
CardinalityUpperBound cardinality
326+
CardinalityUpperBound cardinality,
327+
ValuesSourceConfig config
321328
) throws IOException;
322329

323330
abstract boolean needsGlobalOrdinals();

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/RareTermsAggregatorSupplier.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import org.opensearch.search.aggregations.AggregatorFactories;
3737
import org.opensearch.search.aggregations.CardinalityUpperBound;
3838
import org.opensearch.search.aggregations.support.ValuesSource;
39+
import org.opensearch.search.aggregations.support.ValuesSourceConfig;
3940
import org.opensearch.search.internal.SearchContext;
4041

4142
import java.io.IOException;
@@ -58,6 +59,7 @@ Aggregator build(
5859
SearchContext context,
5960
Aggregator parent,
6061
CardinalityUpperBound carinality,
61-
Map<String, Object> metadata
62+
Map<String, Object> metadata,
63+
ValuesSourceConfig config
6264
) throws IOException;
6365
}

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,19 @@
3131

3232
package org.opensearch.search.aggregations.bucket.terms;
3333

34+
import org.apache.lucene.index.DocValues;
3435
import org.apache.lucene.index.LeafReaderContext;
36+
import org.apache.lucene.index.NumericDocValues;
37+
import org.apache.lucene.index.Terms;
38+
import org.apache.lucene.index.TermsEnum;
39+
import org.apache.lucene.search.Weight;
3540
import org.apache.lucene.util.BytesRef;
3641
import org.apache.lucene.util.BytesRefBuilder;
3742
import org.opensearch.common.lease.Releasables;
3843
import org.opensearch.common.util.BytesRefHash;
3944
import org.opensearch.common.util.SetBackedScalingCuckooFilter;
4045
import org.opensearch.index.fielddata.SortedBinaryDocValues;
46+
import org.opensearch.index.mapper.DocCountFieldMapper;
4147
import org.opensearch.search.DocValueFormat;
4248
import org.opensearch.search.aggregations.Aggregator;
4349
import org.opensearch.search.aggregations.AggregatorFactories;
@@ -46,6 +52,7 @@
4652
import org.opensearch.search.aggregations.LeafBucketCollector;
4753
import org.opensearch.search.aggregations.LeafBucketCollectorBase;
4854
import org.opensearch.search.aggregations.support.ValuesSource;
55+
import org.opensearch.search.aggregations.support.ValuesSourceConfig;
4956
import org.opensearch.search.internal.SearchContext;
5057

5158
import java.io.IOException;
@@ -55,6 +62,7 @@
5562
import java.util.Map;
5663

5764
import static java.util.Collections.emptyList;
65+
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
5866

5967
/**
6068
* An aggregator that finds "rare" string values (e.g. terms agg that orders ascending)
@@ -64,7 +72,10 @@
6472
public class StringRareTermsAggregator extends AbstractRareTermsAggregator {
6573
private final ValuesSource.Bytes valuesSource;
6674
private final IncludeExclude.StringFilter filter;
75+
private Weight weight;
6776
private final BytesKeyedBucketOrds bucketOrds;
77+
protected final String fieldName;
78+
private final ValuesSourceConfig config;
6879

6980
StringRareTermsAggregator(
7081
String name,
@@ -77,12 +88,19 @@ public class StringRareTermsAggregator extends AbstractRareTermsAggregator {
7788
Map<String, Object> metadata,
7889
long maxDocCount,
7990
double precision,
80-
CardinalityUpperBound cardinality
91+
CardinalityUpperBound cardinality,
92+
ValuesSourceConfig config
8193
) throws IOException {
8294
super(name, factories, context, parent, metadata, maxDocCount, precision, format);
8395
this.valuesSource = valuesSource;
8496
this.filter = filter;
8597
this.bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), cardinality);
98+
this.fieldName = valuesSource.getIndexFieldName();
99+
this.config = config;
100+
}
101+
102+
public void setWeight(Weight weight) {
103+
this.weight = weight;
86104
}
87105

88106
@Override
@@ -122,6 +140,68 @@ public void collect(int docId, long owningBucketOrd) throws IOException {
122140
};
123141
}
124142

143+
@Override
144+
protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException {
145+
if (weight == null) {
146+
return false;
147+
} else {
148+
// The optimization could only be used if there are no deleted documents and the top-level
149+
// query matches all documents in the segment.
150+
if (weight.count(ctx) == 0) {
151+
return true;
152+
} else if (weight.count(ctx) != ctx.reader().maxDoc()) {
153+
return false;
154+
}
155+
}
156+
157+
if (subAggregators.length > 0) {
158+
// The optimization does not work when there are subaggregations.
159+
// The query has to be a match all, otherwise
160+
return false;
161+
}
162+
163+
Terms stringTerms = ctx.reader().terms(fieldName);
164+
if (stringTerms == null) {
165+
// Field is not indexed.
166+
return false;
167+
}
168+
169+
NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
170+
if (docCountValues.nextDoc() != NO_MORE_DOCS) {
171+
// This segment has at least one document with the _doc_count field.
172+
return false;
173+
}
174+
175+
TermsEnum stringTermsEnum = stringTerms.iterator();
176+
BytesRef stringTerm = stringTermsEnum.next();
177+
178+
// Here, we are accounting for the case that there might be missing values for the field name
179+
if (config != null && config.missing() != null) {
180+
String missingField = (String) config.missing();
181+
BytesRef missingFieldTerm = new BytesRef(missingField);
182+
int missingCount = weight.count(ctx) - ctx.reader().getDocCount(fieldName);
183+
if (missingCount > 0) {
184+
// Since the bucket name for the missing documents is not indexed as a potential value for that field,
185+
// We will not have to worry about adding to a bucket that was already seen.
186+
long bucketOrdinal = bucketOrds.add(0L, missingFieldTerm);
187+
incrementBucketDocCount(bucketOrdinal, missingCount);
188+
}
189+
}
190+
191+
// Here, we will iterate over all the terms in the segment and add the counts into the bucket.
192+
while (stringTerm != null) {
193+
if (filter == null || filter.accept(stringTerm)) {
194+
long bucketOrdinal = bucketOrds.add(0L, stringTerm);
195+
if (bucketOrdinal < 0) { // already seen
196+
bucketOrdinal = -1 - bucketOrdinal;
197+
}
198+
incrementBucketDocCount(bucketOrdinal, stringTermsEnum.docFreq());
199+
}
200+
stringTerm = stringTermsEnum.next();
201+
}
202+
return true;
203+
}
204+
125205
@Override
126206
public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
127207
/*

server/src/main/java/org/opensearch/search/aggregations/support/MissingValues.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,11 @@ public String toString() {
269269
return "anon ValuesSource.Bytes.WithOrdinals of [" + super.toString() + "]";
270270
}
271271

272+
@Override
273+
public String getIndexFieldName() {
274+
return valuesSource.getIndexFieldName();
275+
}
276+
272277
};
273278
}
274279

server/src/main/java/org/opensearch/search/aggregations/support/ValuesSource.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@ public boolean hasGlobalOrdinals() {
113113
return false;
114114
}
115115

116+
public String getIndexFieldName() {
117+
return null;
118+
}
119+
116120
/**
117121
* Range type
118122
*
@@ -249,6 +253,7 @@ public FieldData(IndexOrdinalsFieldData indexFieldData) {
249253
this.indexFieldData = indexFieldData;
250254
}
251255

256+
@Override
252257
public String getIndexFieldName() {
253258
return this.indexFieldData.getFieldName();
254259
}
@@ -309,6 +314,11 @@ public SortedBinaryDocValues bytesValues(LeafReaderContext context) {
309314
return indexFieldData.load(context).getBytesValues();
310315
}
311316

317+
@Override
318+
public String getIndexFieldName() {
319+
return this.indexFieldData.getFieldName();
320+
}
321+
312322
}
313323

314324
/**
@@ -631,6 +641,7 @@ public SortedNumericDoubleValues doubleValues(LeafReaderContext context) {
631641
return indexFieldData.load(context).getDoubleValues();
632642
}
633643

644+
@Override
634645
public String getIndexFieldName() {
635646
return indexFieldData.getFieldName();
636647
}

0 commit comments

Comments
 (0)