Added aggregation precomputation for rare terms #18106
Changes from all commits: 11ecaf3, 65e20b8, d51c2a0, ab13378, b5e08d8, ebca7e1, 9d73b57, 66171ca, b4a4128, b60c221, 0375104, 86a23cb
File: server/src/main/java/org/opensearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java
@@ -31,15 +31,21 @@
```java
package org.opensearch.search.aggregations.bucket.terms;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.PriorityQueue;
import org.opensearch.common.lease.Releasable;
import org.opensearch.common.lease.Releasables;
import org.opensearch.common.util.LongArray;
import org.opensearch.index.fielddata.SortedBinaryDocValues;
import org.opensearch.index.mapper.DocCountFieldMapper;
import org.opensearch.search.DocValueFormat;
import org.opensearch.search.aggregations.Aggregator;
import org.opensearch.search.aggregations.AggregatorFactories;
```
@@ -54,6 +60,7 @@
```java
import org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
import org.opensearch.search.aggregations.support.ValuesSource;
import org.opensearch.search.aggregations.support.ValuesSourceConfig;
import org.opensearch.search.internal.SearchContext;

import java.io.IOException;
```
@@ -65,6 +72,8 @@
```java
import java.util.function.Supplier;

import static org.opensearch.search.aggregations.InternalOrder.isKeyOrder;
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_DOCS;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

/**
 * An aggregator of string values that hashes the strings on the fly rather
```
@@ -75,8 +84,11 @@
```java
public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
    private final CollectorSource collectorSource;
    private final ResultStrategy<?, ?> resultStrategy;
    private Weight weight;
    private final BytesKeyedBucketOrds bucketOrds;
    private final IncludeExclude.StringFilter includeExclude;
    protected final String fieldName;
    private final ValuesSourceConfig config;

    public MapStringTermsAggregator(
        String name,
```
@@ -92,13 +104,25 @@
```java
        SubAggCollectionMode collectionMode,
        boolean showTermDocCountError,
        CardinalityUpperBound cardinality,
        Map<String, Object> metadata,
        ValuesSourceConfig config
    ) throws IOException {
        super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
        this.collectorSource = collectorSource;
        this.resultStrategy = resultStrategy.apply(this); // ResultStrategy needs a reference to the Aggregator to do its job.
        this.includeExclude = includeExclude;
        bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), cardinality);
        if (collectorSource instanceof ValuesSourceCollectorSource) {
```
Member:

I don't like the idea of being uncertain about where the field name comes from. Also, you can probably use pattern matching for instanceof:
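The reviewer's code suggestion did not survive the page rendering. As a sketch of what the pattern-matching form would likely look like, reusing the names from the constructor above (this is not the reviewer's original snippet):

```java
// Sketch only: Java 16+ pattern matching for instanceof binds the cast
// result to a variable in a single step, avoiding the explicit cast below.
if (collectorSource instanceof ValuesSourceCollectorSource valuesCollectorSource) {
    this.fieldName = valuesCollectorSource.getValuesSource().getIndexFieldName();
} else {
    this.fieldName = null;
}
```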
Contributor (author):

Good point. I will just stick with fetching from the value source. Since I made the modification to add the field name to the constructor, previous implementations should not be affected.
```java
            ValuesSource valuesCollectorSource = ((ValuesSourceCollectorSource) collectorSource).getValuesSource();
            this.fieldName = valuesCollectorSource.getIndexFieldName();
        } else {
            this.fieldName = null;
        }
        this.config = config;
    }

    public void setWeight(Weight weight) {
        this.weight = weight;
    }

    @Override
```
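The `setWeight` hook matters because the precompute path below relies on Lucene's `Weight#count(LeafReaderContext)`, which can report how many documents in a segment match the query without visiting them, or -1 when the count cannot be computed cheaply. A standalone sketch of that API, independent of OpenSearch (the class name, field, and index contents here are invented for illustration):

```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.apache.lucene.store.ByteBuffersDirectory;

public class WeightCountSketch {
    public static void main(String[] args) throws Exception {
        try (ByteBuffersDirectory dir = new ByteBuffersDirectory()) {
            try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
                for (String v : new String[] { "a", "b", "a" }) {
                    Document doc = new Document();
                    doc.add(new StringField("k", v, Field.Store.NO));
                    writer.addDocument(doc);
                }
            }
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                IndexSearcher searcher = new IndexSearcher(reader);
                Weight weight = searcher.createWeight(
                    searcher.rewrite(new MatchAllDocsQuery()), ScoreMode.COMPLETE_NO_SCORES, 1f);
                for (LeafReaderContext ctx : reader.leaves()) {
                    // For a match-all query on a segment with no deletions, count(ctx)
                    // equals maxDoc() - exactly the precondition the aggregator checks.
                    System.out.println(weight.count(ctx) + " of " + ctx.reader().maxDoc());
                }
            }
        }
    }
}
```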
@@ -130,6 +154,69 @@
```java
        );
    }

    @Override
    protected boolean tryPrecomputeAggregationForLeaf(LeafReaderContext ctx) throws IOException {
        // TODO: in scripted aggregations, the way of collecting buckets is determined by the
        // script aggregator, so script aggregations cannot be supported for now.

        // The optimization does not work when there are sub-aggregations, an include/exclude
        // filter, no resolvable field name, or no top-level weight. The query also has to match
        // all documents; otherwise the index-wide term frequencies would count documents the
        // query does not match.
        if (subAggregators.length > 0 || includeExclude != null || fieldName == null || weight == null) {
            return false;
        }

        // The optimization can only be used if there are no deleted documents and the top-level
        // query matches all documents in the segment.
        if (weight.count(ctx) == 0) {
            return true;
        } else if (weight.count(ctx) != ctx.reader().maxDoc()) {
            return false;
        }

        // If the missing property is specified in the builder, and there are documents with the
        // field missing, we might not be able to use the index unless there is some way to
        // calculate which ordinal value the missing field maps to (something I am not sure how
        // to do yet).
        // Custom scripts cannot be supported because when the aggregation is returned, parts of
        // the custom script are not included. See the test 'org.opensearch.painless.
        // LangPainlessClientYamlTestSuiteIT.test {yaml=painless/100_terms_agg/String Value Script with doc notation}'
        // for more details on why it cannot be supported.
        if ((config != null)
            && ((config.missing() != null && (weight.count(ctx) != ctx.reader().getDocCount(fieldName))) || (config.script() != null))) {
            return false;
        }

        Terms stringTerms = ctx.reader().terms(fieldName);
        if (stringTerms == null) {
            // Field is not indexed.
            return false;
        }

        NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
        if (docCountValues.nextDoc() != NO_MORE_DOCS) {
            // This segment has at least one document with the _doc_count field.
            return false;
        }

        TermsEnum stringTermsEnum = stringTerms.iterator();
        BytesRef stringTerm = stringTermsEnum.next();

        // Iterate over all the terms in the segment and add the counts into their buckets.
        while (stringTerm != null) {
            long bucketOrdinal = bucketOrds.add(0L, stringTerm);
            if (bucketOrdinal < 0) { // already seen
                bucketOrdinal = -1 - bucketOrdinal;
            }
            int docCount = stringTermsEnum.docFreq();
            if (resultStrategy instanceof SignificantTermsResults sigTermsResultStrategy) {
                sigTermsResultStrategy.updateSubsetSizes(0L, docCount);
            }
            incrementBucketDocCount(bucketOrdinal, docCount);
            stringTerm = stringTermsEnum.next();
        }
        return true;
    }
```
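The loop above is the core of the precomputation: instead of visiting documents, it walks the segment's terms dictionary and uses each term's `docFreq()` as the bucket count, which is valid only because the earlier checks guarantee every live document matches the query and no document overrides `_doc_count`. A standalone, hedged illustration of the same idea against a plain Lucene index (class name, field, and values invented for the example):

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.BytesRef;

public class TermsDictionaryCounts {
    public static void main(String[] args) throws Exception {
        try (ByteBuffersDirectory dir = new ByteBuffersDirectory()) {
            try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
                for (String city : new String[] { "oslo", "lima", "oslo", "kiev", "oslo" }) {
                    Document doc = new Document();
                    doc.add(new StringField("city", city, Field.Store.NO));
                    writer.addDocument(doc);
                }
            }
            Map<String, Long> counts = new HashMap<>();
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                for (LeafReaderContext ctx : reader.leaves()) {
                    Terms terms = ctx.reader().terms("city");
                    if (terms == null) {
                        continue; // field not indexed in this segment
                    }
                    TermsEnum termsEnum = terms.iterator();
                    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
                        // docFreq() is read straight from the terms dictionary; no
                        // documents are visited, which is what makes this fast.
                        counts.merge(term.utf8ToString(), (long) termsEnum.docFreq(), Long::sum);
                    }
                }
            }
            System.out.println(counts); // {oslo=3, lima=1, kiev=1}
        }
    }
}
```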
```java
    @Override
    public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
        return resultStrategy.buildAggregations(owningBucketOrds);
```
@@ -196,6 +283,10 @@
```java
            return valuesSource.needsScores();
        }

        public ValuesSource getValuesSource() {
            return valuesSource;
        }

        @Override
        public LeafBucketCollector getLeafCollector(
            IncludeExclude.StringFilter includeExclude,
```
@@ -502,6 +593,11 @@
```java
            return "significant_terms";
        }

        public void updateSubsetSizes(long owningBucketOrd, int amount) {
            subsetSizes = context.bigArrays().grow(subsetSizes, owningBucketOrd + 1);
            subsetSizes.increment(owningBucketOrd, amount);
        }

        @Override
        LeafBucketCollector wrapCollector(LeafBucketCollector primary) {
            return new LeafBucketCollectorBase(primary, null) {
```
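The new `updateSubsetSizes` hook lets the precompute path keep the significant-terms subset size in step with the counts it adds. The grow-then-increment pattern it uses is easy to state on its own; here is a minimal toy sketch with a plain array (OpenSearch's `BigArrays`/`LongArray` version is paged and accounted for by circuit breakers, which this sketch ignores):

```java
import java.util.Arrays;

public class GrowThenIncrement {
    private long[] subsetSizes = new long[1];

    // Grow the backing array so 'owningBucketOrd' is addressable, then add 'amount';
    // mirrors BigArrays.grow(...) followed by LongArray.increment(...) in the diff.
    void updateSubsetSizes(long owningBucketOrd, int amount) {
        if (owningBucketOrd + 1 > subsetSizes.length) {
            subsetSizes = Arrays.copyOf(subsetSizes, Math.max(subsetSizes.length * 2, (int) owningBucketOrd + 1));
        }
        subsetSizes[(int) owningBucketOrd] += amount;
    }

    public static void main(String[] args) {
        GrowThenIncrement g = new GrowThenIncrement();
        g.updateSubsetSizes(0L, 3);
        g.updateSubsetSizes(5L, 2);
        System.out.println(Arrays.toString(g.subsetSizes)); // [3, 0, 0, 0, 0, 2]
    }
}
```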
Review comment:

I think if you separate out the test cases, as I commented in the test files, that can give you good code coverage as well.