Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,24 @@ setup:
- is_false: aggregations.str_terms.buckets.1.key_as_string
- match: { aggregations.str_terms.buckets.1.doc_count: 1 }

---
# Runs a terms aggregation on "str" with the global_ordinals execution hint and a script that
# appends "1" to the first value of each document. Expects two buckets: "abc1" with 2 docs and
# "bcd1" with 1 doc, neither carrying a key_as_string.
"Global String Value Script with doc notation":

- do:
search:
rest_total_hits_as_int: true
body: { "size" : 0, "aggs" : { "str_terms" : { "terms" : { "field" : "str","execution_hint": "global_ordinals", "script": { "source": "return doc.str[0] + \"1\""} } } } }

- match: { hits.total: 3 }

- length: { aggregations.str_terms.buckets: 2 }
- match: { aggregations.str_terms.buckets.0.key: "abc1" }
- is_false: aggregations.str_terms.buckets.0.key_as_string
- match: { aggregations.str_terms.buckets.0.doc_count: 2 }
- match: { aggregations.str_terms.buckets.1.key: "bcd1" }
- is_false: aggregations.str_terms.buckets.1.key_as_string
- match: { aggregations.str_terms.buckets.1.doc_count: 1 }

---
"Long Value Script with doc notation":

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,13 @@

package org.opensearch.search.aggregations.bucket.missing;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.Weight;
import org.opensearch.index.fielddata.DocValueBits;
import org.opensearch.index.mapper.DocCountFieldMapper;
import org.opensearch.search.aggregations.Aggregator;
import org.opensearch.search.aggregations.AggregatorFactories;
import org.opensearch.search.aggregations.CardinalityUpperBound;
Expand All @@ -46,7 +51,11 @@
import org.opensearch.search.internal.SearchContext;

import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_DOCS;

/**
* Aggregate all docs that are missing a value.
Expand All @@ -55,7 +64,10 @@
*/
public class MissingAggregator extends BucketsAggregator implements SingleBucketAggregator {

private Weight weight;
private final ValuesSource valuesSource;
protected final String fieldName;
private final ValuesSourceConfig valuesSourceConfig;

public MissingAggregator(
String name,
Expand All @@ -69,6 +81,16 @@
super(name, factories, aggregationContext, parent, cardinality, metadata);
// TODO: Stop using nulls here
this.valuesSource = valuesSourceConfig.hasValues() ? valuesSourceConfig.getValuesSource() : null;
if (this.valuesSource != null) {
this.fieldName = valuesSource.getIndexFieldName();
} else {
this.fieldName = null;
}
this.valuesSourceConfig = valuesSourceConfig;
}

/**
 * Stores the {@link Weight} that {@code tryPrecomputeAggregationForLeaf} consults to count the
 * documents matched in a segment. While no weight has been set (the field is {@code null}),
 * the precomputation is not attempted.
 *
 * @param weight the weight to keep; presumably that of the top-level query (confirm against caller)
 */
public void setWeight(Weight weight) {
this.weight = weight;
}

@Override
Expand All @@ -94,6 +116,58 @@
};
}

/**
 * Tries to compute the missing-value count for one segment from index statistics instead of
 * visiting every document.
 *
 * @param ctx the segment to precompute
 * @return {@code true} when the segment needs no further collection (its count was added here,
 *         or it contributes nothing); {@code false} when the precomputation cannot be applied.
 *         Presumably the caller then falls back to per-document collection (confirm in the superclass).
 * @throws IOException if reading the segment fails
 */
@Override
protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException {
// The optimization does not work when there are subaggregations.
if (subAggregators.length > 0) {
return false;
}

// Without a field name or a weight there is nothing to precompute from.
if (fieldName == null || weight == null) {
return false;
}

// When a "missing" value is configured, the missing aggregation does not collect any
// documents, so there is nothing to add for this segment.
if (valuesSourceConfig != null && valuesSourceConfig.missing() != null) {
return true;
}

// The optimization could only be used if there are no deleted documents and the top-level
// query matches all documents in the segment.
// NOTE(review): weight.count(ctx) is evaluated twice below. Lucene documents that count() may
// return -1 when the count cannot be computed cheaply; that value also ends up in the
// "return false" branch -- confirm this is the intended handling.
if (weight.count(ctx) == 0) {
return true;

Check warning on line 140 in server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java#L140

Added line #L140 was not covered by tests
} else if (weight.count(ctx) != ctx.reader().maxDoc()) {
return false;

Check warning on line 142 in server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/missing/MissingAggregator.java#L142

Added line #L142 was not covered by tests
}

Set<String> indexedFields = new HashSet<>(FieldInfos.getIndexedFields(ctx.reader()));

// This only works if the field is indexed, because otherwise the reader does not keep track
// of the doc count for the field. A field that has no FieldInfo at all in this segment is
// deliberately let through: its count can still be calculated (presumably getDocCount
// reports 0 for it, so every document counts as missing -- confirm).
if (indexedFields.contains(fieldName) == false && ctx.reader().getFieldInfos().fieldInfo(fieldName) != null) {
return false;
}

NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
if (docCountValues.nextDoc() != NO_MORE_DOCS) {
// This segment has at least one document with the _doc_count field.
return false;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if you separate out the test cases as I commented in test files - that can give you a good code coverage as well.

}

long docCountWithFieldName = ctx.reader().getDocCount(fieldName);
int totalDocCount = ctx.reader().maxDoc();

// The missing aggregation bucket counts the documents where the field is either null or
// not present. That is the total number of documents in the segment minus the documents
// that have a value for the field.
incrementBucketDocCount(0, totalDocCount - docCountWithFieldName);

return true;
}

@Override
public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
return buildAggregationsForSingleBucket(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
import org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
import org.opensearch.search.aggregations.support.ValuesSource;
import org.opensearch.search.aggregations.support.ValuesSourceConfig;
import org.opensearch.search.internal.SearchContext;
import org.opensearch.search.startree.StarTreeQueryHelper;
import org.opensearch.search.startree.filter.DimensionFilter;
Expand Down Expand Up @@ -107,6 +108,18 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
private final SetOnce<SortedSetDocValues> dvs = new SetOnce<>();
protected int segmentsWithSingleValuedOrds = 0;
protected int segmentsWithMultiValuedOrds = 0;
LongUnaryOperator globalOperator;
private final ValuesSourceConfig config;

/**
 * Looks up the term bytes that a global ordinal stands for.
 * <p>
 * The interface declares exactly one abstract method, so implementations may be supplied as a
 * lambda or a method reference.
 *
 * @opensearch.internal
 */
@FunctionalInterface
public interface GlobalOrdLookupFunction {
    /**
     * Resolves a single ordinal.
     *
     * @param ord the ordinal to resolve
     * @return the term bytes associated with {@code ord}
     * @throws IOException if the lookup cannot be completed
     */
    BytesRef apply(long ord) throws IOException;
}

protected CardinalityUpperBound cardinalityUpperBound;

public GlobalOrdinalsStringTermsAggregator(
Expand All @@ -124,7 +137,8 @@ public GlobalOrdinalsStringTermsAggregator(
SubAggCollectionMode collectionMode,
boolean showTermDocCountError,
CardinalityUpperBound cardinality,
Map<String, Object> metadata
Map<String, Object> metadata,
ValuesSourceConfig config
) throws IOException {
super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
this.cardinalityUpperBound = cardinality;
Expand All @@ -146,9 +160,8 @@ public GlobalOrdinalsStringTermsAggregator(
return new DenseGlobalOrds();
});
}
this.fieldName = (valuesSource instanceof ValuesSource.Bytes.WithOrdinals.FieldData)
? ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).getIndexFieldName()
: null;
this.fieldName = valuesSource.getIndexFieldName();
this.config = config;
}

String descriptCollectionStrategy() {
Expand Down Expand Up @@ -185,6 +198,14 @@ boolean tryCollectFromTermFrequencies(LeafReaderContext ctx, BiConsumer<Long, In
}
}

// If the missing property is specified in the builder and some documents in the segment do
// not have the field, we might not be able to use the index: the term frequencies do not
// tell us which ordinal the substituted missing value belongs to.
// TODO: find a way to resolve the ordinal of the missing value so this case can be optimized too.
if (config != null && config.missing() != null && ((weight.count(ctx) == ctx.reader().getDocCount(fieldName)) == false)) {
return false;
}

Terms segmentTerms = ctx.reader().terms(this.fieldName);
if (segmentTerms == null) {
// Field is not indexed.
Expand Down Expand Up @@ -482,7 +503,8 @@ static class LowCardinality extends GlobalOrdinalsStringTermsAggregator {
boolean remapGlobalOrds,
SubAggCollectionMode collectionMode,
boolean showTermDocCountError,
Map<String, Object> metadata
Map<String, Object> metadata,
ValuesSourceConfig config
) throws IOException {
super(
name,
Expand All @@ -499,7 +521,8 @@ static class LowCardinality extends GlobalOrdinalsStringTermsAggregator {
collectionMode,
showTermDocCountError,
CardinalityUpperBound.ONE,
metadata
metadata,
config
);
assert factories == null || factories.countAggregators() == 0;
this.segmentDocCounts = context.bigArrays().newLongArray(1, true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,21 @@

package org.opensearch.search.aggregations.bucket.terms;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.PriorityQueue;
import org.opensearch.common.lease.Releasable;
import org.opensearch.common.lease.Releasables;
import org.opensearch.common.util.LongArray;
import org.opensearch.index.fielddata.SortedBinaryDocValues;
import org.opensearch.index.mapper.DocCountFieldMapper;
import org.opensearch.search.DocValueFormat;
import org.opensearch.search.aggregations.Aggregator;
import org.opensearch.search.aggregations.AggregatorFactories;
Expand All @@ -54,6 +60,7 @@
import org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
import org.opensearch.search.aggregations.support.ValuesSource;
import org.opensearch.search.aggregations.support.ValuesSourceConfig;
import org.opensearch.search.internal.SearchContext;

import java.io.IOException;
Expand All @@ -65,6 +72,8 @@
import java.util.function.Supplier;

import static org.opensearch.search.aggregations.InternalOrder.isKeyOrder;
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_DOCS;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

/**
* An aggregator of string values that hashes the strings on the fly rather
Expand All @@ -75,8 +84,11 @@
public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
private final CollectorSource collectorSource;
private final ResultStrategy<?, ?> resultStrategy;
private Weight weight;
private final BytesKeyedBucketOrds bucketOrds;
private final IncludeExclude.StringFilter includeExclude;
protected final String fieldName;
private final ValuesSourceConfig config;

public MapStringTermsAggregator(
String name,
Expand All @@ -92,13 +104,25 @@
SubAggCollectionMode collectionMode,
boolean showTermDocCountError,
CardinalityUpperBound cardinality,
Map<String, Object> metadata
Map<String, Object> metadata,
ValuesSourceConfig config
) throws IOException {
super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
this.collectorSource = collectorSource;
this.resultStrategy = resultStrategy.apply(this); // ResultStrategy needs a reference to the Aggregator to do its job.
this.includeExclude = includeExclude;
bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), cardinality);
if (collectorSource instanceof ValuesSourceCollectorSource) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like the idea of being uncertain about where the fieldName is going to come from, basically either from constructor above or fetching from value source. Let's be deterministic on where we are going to fetch the field name.


Also, you can probably use pattern matching for instanceof:

if (collectorSource instanceof ValuesSourceCollectorSource valuesCollectorSource) {
    this.fieldName = valuesCollectorSource.getValuesSource().getIndexFieldName();
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I will just stick with fetching from the value source. Since I made the modification to add the field name to the constructor, previous implementations should not be affected.

ValuesSource valuesCollectorSource = ((ValuesSourceCollectorSource) collectorSource).getValuesSource();
this.fieldName = valuesCollectorSource.getIndexFieldName();
} else {
this.fieldName = null;
}
this.config = config;
}

/**
 * Stores the {@link Weight} that {@code tryPrecomputeAggregationForLeaf} consults to count the
 * documents matched in a segment. While no weight has been set (the field is {@code null}),
 * the precomputation is not attempted.
 *
 * @param weight the weight to keep; presumably that of the top-level query (confirm against caller)
 */
public void setWeight(Weight weight) {
this.weight = weight;
}

@Override
Expand Down Expand Up @@ -130,6 +154,69 @@
);
}

/**
 * Tries to fill the term buckets for one segment from the terms index (one bucket per term,
 * sized by the term's document frequency) instead of visiting every document.
 *
 * @param ctx the segment to precompute
 * @return {@code true} when the segment needs no further collection (its buckets were filled
 *         here, or no document matches); {@code false} when the precomputation cannot be applied.
 *         Presumably the caller then falls back to per-document collection (confirm in the superclass).
 * @throws IOException if reading the segment fails
 */
@Override
protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException {
// TODO: In scripted aggregations, the way of collecting from buckets is determined by the
// script aggregator. For now, scripted aggregations are not supported here.

// The optimization does not work when there are subaggregations or an include/exclude filter,
// nor when the field name or the weight is not available.
if (subAggregators.length > 0 || includeExclude != null || fieldName == null || weight == null) {
return false;
}

// The optimization could only be used if there are no deleted documents and the top-level
// query matches all documents in the segment.
// NOTE(review): weight.count(ctx) is evaluated up to three times in this method. Lucene
// documents that count() may return -1 when the count cannot be computed cheaply; that value
// ends up in the "return false" branch below -- confirm this is the intended handling.
if (weight.count(ctx) == 0) {
return true;
} else if (weight.count(ctx) != ctx.reader().maxDoc()) {
return false;
}

// If the missing property is specified in the builder and some documents do not have the
// field, we might not be able to use the index: the term frequencies do not tell us which
// bucket the substituted missing value belongs to (no way of resolving that is implemented yet).
// Custom scripts cannot be supported because when the aggregation is returned, parts of the custom
// script are not included. See test 'org.opensearch.painless.\
// LangPainlessClientYamlTestSuiteIT.test {yaml=painless/100_terms_agg/String Value Script with doc notation}'
// for more details on why it cannot be supported.
if ((config != null)
&& ((config.missing() != null && ((weight.count(ctx) != ctx.reader().getDocCount(fieldName)))) || (config.script() != null))) {
return false;
}

Terms stringTerms = ctx.reader().terms(fieldName);
if (stringTerms == null) {
// Field is not indexed.
return false;
}

NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
if (docCountValues.nextDoc() != NO_MORE_DOCS) {
// This segment has at least one document with the _doc_count field.
return false;
}

TermsEnum stringTermsEnum = stringTerms.iterator();
BytesRef stringTerm = stringTermsEnum.next();

// Here, we will iterate over all the terms in the segment and add the counts into the bucket.
while (stringTerm != null) {
// A negative return value means the term already has a bucket; decode its ordinal.
long bucketOrdinal = bucketOrds.add(0L, stringTerm);
if (bucketOrdinal < 0) { // already seen
bucketOrdinal = -1 - bucketOrdinal;

Check warning on line 208 in server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java#L208

Added line #L208 was not covered by tests
}
int docCount = stringTermsEnum.docFreq();
// NOTE(review): the subset size grows by docFreq for every term, so a document holding
// several terms is added more than once -- confirm this matches the per-document path.
if (resultStrategy instanceof SignificantTermsResults sigTermsResultStrategy) {
sigTermsResultStrategy.updateSubsetSizes(0L, docCount);

Check warning on line 212 in server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java#L212

Added line #L212 was not covered by tests
}
incrementBucketDocCount(bucketOrdinal, docCount);
stringTerm = stringTermsEnum.next();
}
return true;
}

@Override
public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
return resultStrategy.buildAggregations(owningBucketOrds);
Expand Down Expand Up @@ -196,6 +283,10 @@
return valuesSource.needsScores();
}

/**
 * Returns the {@link ValuesSource} held by this collector source.
 *
 * @return the values source, exactly as stored in the {@code valuesSource} field
 */
public ValuesSource getValuesSource() {
return valuesSource;
}

@Override
public LeafBucketCollector getLeafCollector(
IncludeExclude.StringFilter includeExclude,
Expand Down Expand Up @@ -502,6 +593,11 @@
return "significant_terms";
}

/**
 * Adds {@code amount} to the subset size recorded for {@code owningBucketOrd}. The backing
 * array is grown first so that the slot for that ordinal exists.
 *
 * @param owningBucketOrd ordinal of the owning bucket whose subset size is updated
 * @param amount          value added to the current subset size
 */
public void updateSubsetSizes(long owningBucketOrd, int amount) {
    // The array must hold at least owningBucketOrd + 1 slots before the slot can be incremented.
    final long requiredSize = owningBucketOrd + 1;
    subsetSizes = context.bigArrays().grow(subsetSizes, requiredSize);
    subsetSizes.increment(owningBucketOrd, amount);
}

Check warning on line 599 in server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java#L597-L599

Added lines #L597 - L599 were not covered by tests

@Override
LeafBucketCollector wrapCollector(LeafBucketCollector primary) {
return new LeafBucketCollectorBase(primary, null) {
Expand Down
Loading
Loading