From a99e985771f205a1df5dba9cd9f7b856060ff492 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Tue, 20 Aug 2024 17:07:51 +0000
Subject: [PATCH] Optimize global ordinal includes/excludes for prefix matching
 (#14371)

* Optimize global ordinal includes/excludes for prefix matching

If an aggregration specifies includes or excludes based on a regular
expression, and the regular expression has a finite expansion followed
by .*, then we can optimize the global ordinal filter.

Specifically, in this case, we can expand the matching prefixes, then
include/exclude the range of global ordinals that start with each
prefix.

Signed-off-by: Michael Froh <froh@amazon.com>

* Add unit test

Signed-off-by: Michael Froh <froh@amazon.com>

* Add changelog entry

Signed-off-by: Michael Froh <froh@amazon.com>

* Improve test coverage

Updated the unit test to be functionally equivalent, but it covers
more of the regex logic.

Signed-off-by: Michael Froh <froh@amazon.com>

* Improve test coverage

Signed-off-by: Michael Froh <froh@amazon.com>

* Fix bug in exclude-only case with no doc values in segment

Signed-off-by: Michael Froh <froh@amazon.com>

* Address comments from @mch2

Signed-off-by: Michael Froh <froh@amazon.com>

---------

Signed-off-by: Michael Froh <froh@amazon.com>
(cherry picked from commit 13163aba31fd3fef078897068040b0fc4c35b251)
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 CHANGELOG.md                                  |   1 +
 .../bucket/terms/IncludeExclude.java          | 189 ++++++++++++-
 .../terms}/IncludeExcludeTests.java           | 252 +++++++++++++++++-
 3 files changed, 439 insertions(+), 3 deletions(-)
 rename server/src/test/java/org/opensearch/search/aggregations/{support => bucket/terms}/IncludeExcludeTests.java (58%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9717d1cbdfe41..4a368eea4bf0d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,6 +41,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ### Changed
 - Add lower limit for primary and replica batch allocators timeout ([#14979](https://github.com/opensearch-project/OpenSearch/pull/14979))
+- Optimize regexp-based include/exclude on aggregations when pattern matches prefixes ([#14371](https://github.com/opensearch-project/OpenSearch/pull/14371))
 - Replace and block usages of org.apache.logging.log4j.util.Strings ([#15238](https://github.com/opensearch-project/OpenSearch/pull/15238))
 
 ### Deprecated
diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java
index f8061bcaca50f..20f294bc7199b 100644
--- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java
+++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java
@@ -57,7 +57,11 @@
 import org.opensearch.search.DocValueFormat;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Objects;
 import java.util.Set;
 import java.util.SortedSet;
@@ -86,6 +90,10 @@ public class IncludeExclude implements Writeable, ToXContentFragment {
      * https://github.com/opensearch-project/OpenSearch/issues/2858
      */
     private static final int DEFAULT_MAX_REGEX_LENGTH = 1000;
+    /**
+     * The maximum number of prefixes to extract from a regex in tryCreatePrefixOrdinalsFilter
+     */
+    private static final int MAX_PREFIXES = 1000;
 
     // for parsing purposes only
     // TODO: move all aggs to the same package so that this stuff could be pkg-private
@@ -393,6 +401,92 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
 
     }
 
+    /**
+     * An ordinals filter that includes/excludes all ordinals corresponding to terms starting with the given prefixes
+     */
+    static class PrefixBackedOrdinalsFilter extends OrdinalsFilter {
+
+        private final SortedSet<BytesRef> includePrefixes, excludePrefixes;
+
+        private PrefixBackedOrdinalsFilter(SortedSet<BytesRef> includePrefixes, SortedSet<BytesRef> excludePrefixes) {
+            this.includePrefixes = includePrefixes;
+            this.excludePrefixes = excludePrefixes;
+        }
+
+        private static BytesRef nextBytesRef(BytesRef bytesRef) {
+            BytesRef next = BytesRef.deepCopyOf(bytesRef);
+            int pos = next.offset + next.length - 1;
+            while (pos >= next.offset && next.bytes[pos] == -1) {
+                next.bytes[pos] = 0;
+                pos--;
+            }
+            if (pos >= next.offset) {
+                next.bytes[pos]++;
+            } else {
+                // Every byte in our prefix had value 0xFF. We must match all subsequent ordinals.
+                return null;
+            }
+            return next;
+        }
+
+        private interface LongBiConsumer {
+            void accept(long a, long b);
+        }
+
+        private static void process(SortedSetDocValues globalOrdinals, long length, SortedSet<BytesRef> prefixes, LongBiConsumer consumer)
+            throws IOException {
+            for (BytesRef prefix : prefixes) {
+                long startOrd = globalOrdinals.lookupTerm(prefix);
+                if (startOrd < 0) {
+                    // The prefix is not an exact match in the ordinals (can skip equal length below)
+                    startOrd = -1 - startOrd;
+                    // Make sure that the term at startOrd starts with prefix
+                    BytesRef startTerm = globalOrdinals.lookupOrd(startOrd);
+                    if (startTerm.length <= prefix.length
+                        || !Arrays.equals(
+                            startTerm.bytes,
+                            startTerm.offset,
+                            startTerm.offset + prefix.length,
+                            prefix.bytes,
+                            prefix.offset,
+                            prefix.offset + prefix.length
+                        )) {
+                        continue;
+                    }
+                }
+                if (startOrd >= length) {
+                    continue;
+                }
+                BytesRef next = nextBytesRef(prefix);
+                if (next == null) {
+                    consumer.accept(startOrd, length);
+                } else {
+                    long endOrd = globalOrdinals.lookupTerm(next);
+                    if (endOrd < 0) {
+                        endOrd = -1 - endOrd;
+                    }
+                    if (startOrd < endOrd) {
+                        consumer.accept(startOrd, endOrd);
+                    }
+                }
+            }
+
+        }
+
+        @Override
+        public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
+            LongBitSet accept = new LongBitSet(globalOrdinals.getValueCount());
+            if (!includePrefixes.isEmpty()) {
+                process(globalOrdinals, accept.length(), includePrefixes, accept::set);
+            } else if (accept.length() > 0) {
+                // Exclude-only
+                accept.set(0, accept.length());
+            }
+            process(globalOrdinals, accept.length(), excludePrefixes, accept::clear);
+            return accept;
+        }
+    }
+
     private final String include, exclude;
     private final SortedSet<BytesRef> includeValues, excludeValues;
     private final int incZeroBasedPartition;
@@ -709,8 +803,13 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) {
     }
 
     public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRegexLength) {
-
         if (isRegexBased()) {
+            if ((include == null || include.endsWith(".*")) && (exclude == null || exclude.endsWith(".*"))) {
+                PrefixBackedOrdinalsFilter prefixBackedOrdinalsFilter = tryCreatePrefixOrdinalsFilter(maxRegexLength);
+                if (prefixBackedOrdinalsFilter != null) {
+                    return prefixBackedOrdinalsFilter;
+                }
+            }
             return new AutomatonBackedOrdinalsFilter(toAutomaton(maxRegexLength));
         }
         if (isPartitionBased()) {
@@ -720,6 +819,94 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRege
         return new TermListBackedOrdinalsFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
     }
 
+    private static List<String> expandRegexp(RegExp regExp, int maxPrefixes) {
+        switch (regExp.kind) {
+            case REGEXP_UNION:
+                List<RegExp> alternatives = new ArrayList<>();
+                while (regExp.exp1.kind == RegExp.Kind.REGEXP_UNION) {
+                    alternatives.add(regExp.exp2);
+                    regExp = regExp.exp1;
+                }
+                alternatives.add(regExp.exp2);
+                alternatives.add(regExp.exp1);
+                List<String> output = new ArrayList<>();
+                for (RegExp leaf : alternatives) {
+                    List<String> leafExpansions = expandRegexp(leaf, maxPrefixes);
+                    if (leafExpansions == null) {
+                        return null;
+                    } else {
+                        if (output.size() + leafExpansions.size() > maxPrefixes) {
+                            return null;
+                        }
+                        output.addAll(leafExpansions);
+                    }
+                }
+                return output;
+            case REGEXP_CONCATENATION:
+                List<String> prefixes = expandRegexp(regExp.exp1, maxPrefixes);
+                if (prefixes == null) {
+                    return null;
+                }
+                List<String> suffixes = expandRegexp(regExp.exp2, maxPrefixes);
+                if (suffixes == null) {
+                    return null;
+                }
+                if (prefixes.size() * suffixes.size() > maxPrefixes) {
+                    return null;
+                }
+                List<String> out = new ArrayList<>();
+                StringBuilder stringBuilder = new StringBuilder();
+                for (String prefix : prefixes) {
+                    for (String suffix : suffixes) {
+                        stringBuilder.setLength(0);
+                        stringBuilder.append(prefix).append(suffix);
+                        out.add(stringBuilder.toString());
+                    }
+                }
+                return out;
+            case REGEXP_CHAR:
+                return List.of(Character.toString(regExp.c));
+            case REGEXP_STRING:
+                return List.of(regExp.s);
+            default:
+                return null;
+        }
+    }
+
+    static SortedSet<BytesRef> extractPrefixes(String pattern, int maxRegexLength) {
+        if (pattern == null) {
+            return Collections.emptySortedSet();
+        }
+        SortedSet<BytesRef> prefixSet = null;
+        validateRegExpStringLength(pattern, maxRegexLength);
+        RegExp regExp = new RegExp(pattern);
+        if (regExp.kind == RegExp.Kind.REGEXP_CONCATENATION && regExp.exp2.kind == RegExp.Kind.REGEXP_REPEAT) {
+            RegExp tail = regExp.exp2.exp1;
+            if (tail.kind == RegExp.Kind.REGEXP_ANYCHAR || tail.kind == RegExp.Kind.REGEXP_ANYSTRING) {
+                List<String> prefixes = expandRegexp(regExp.exp1, MAX_PREFIXES);
+                if (prefixes != null) {
+                    prefixSet = new TreeSet<>();
+                    for (String prefix : prefixes) {
+                        prefixSet.add(new BytesRef(prefix));
+                    }
+                }
+            }
+        }
+        return prefixSet;
+    }
+
+    private PrefixBackedOrdinalsFilter tryCreatePrefixOrdinalsFilter(int maxRegexLength) {
+        SortedSet<BytesRef> includeSet = extractPrefixes(include, maxRegexLength);
+        if (includeSet == null) {
+            return null;
+        }
+        SortedSet<BytesRef> excludeSet = extractPrefixes(exclude, maxRegexLength);
+        if (excludeSet == null) {
+            return null;
+        }
+        return new PrefixBackedOrdinalsFilter(includeSet, excludeSet);
+    }
+
     public LongFilter convertToLongFilter(DocValueFormat format) {
 
         if (isPartitionBased()) {
diff --git a/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java
similarity index 58%
rename from server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java
rename to server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java
index f223648de6ef4..2356e2dc6d855 100644
--- a/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java
+++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java
@@ -30,7 +30,7 @@
  * GitHub history for details.
  */
 
-package org.opensearch.search.aggregations.support;
+package org.opensearch.search.aggregations.bucket.terms;
 
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.SortedSetDocValues;
@@ -44,13 +44,14 @@
 import org.opensearch.core.xcontent.XContentParser;
 import org.opensearch.index.fielddata.AbstractSortedSetDocValues;
 import org.opensearch.search.DocValueFormat;
-import org.opensearch.search.aggregations.bucket.terms.IncludeExclude;
 import org.opensearch.search.aggregations.bucket.terms.IncludeExclude.OrdinalsFilter;
 import org.opensearch.test.OpenSearchTestCase;
 
 import java.io.IOException;
 import java.util.Collections;
+import java.util.SortedSet;
 import java.util.TreeSet;
+import java.util.concurrent.atomic.LongAdder;
 
 public class IncludeExcludeTests extends OpenSearchTestCase {
 
@@ -297,4 +298,251 @@ private IncludeExclude serializeMixedRegex(IncludeExclude incExc) throws IOExcep
         }
     }
 
+    private static BytesRef[] toBytesRefArray(String... values) {
+        BytesRef[] bytesRefs = new BytesRef[values.length];
+        for (int i = 0; i < values.length; i++) {
+            bytesRefs[i] = new BytesRef(values[i]);
+        }
+        return bytesRefs;
+    }
+
+    public void testPrefixOrds() throws IOException {
+        IncludeExclude includeExclude = new IncludeExclude("(color|length|size):.*", "color:g.*");
+
+        OrdinalsFilter ordinalsFilter = includeExclude.convertToOrdinalsFilter(DocValueFormat.RAW);
+
+        // Which of the following match the filter or not?
+        BytesRef[] bytesRefs = toBytesRefArray(
+            "color:blue", // true
+            "color:crimson", // true
+            "color:green", // false (excluded)
+            "color:gray", // false (excluded)
+            "depth:10in", // false
+            "depth:12in", // false
+            "depth:17in", // false
+            "length:long", // true
+            "length:medium", // true
+            "length:short", // true
+            "material:cotton", // false
+            "material:linen", // false
+            "material:polyester", // false
+            "size:large", // true
+            "size:medium", // true
+            "size:small", // true
+            "width:13in", // false
+            "width:27in", // false
+            "width:55in" // false
+        );
+        boolean[] expectedFilter = new boolean[] {
+            true,
+            true,
+            false,
+            false,
+            false,
+            false,
+            false,
+            true,
+            true,
+            true,
+            false,
+            false,
+            false,
+            true,
+            true,
+            true,
+            false,
+            false,
+            false };
+
+        LongAdder lookupCount = new LongAdder();
+        SortedSetDocValues sortedSetDocValues = new AbstractSortedSetDocValues() {
+            @Override
+            public boolean advanceExact(int target) {
+                return false;
+            }
+
+            @Override
+            public long nextOrd() {
+                return 0;
+            }
+
+            @Override
+            public int docValueCount() {
+                return 1;
+            }
+
+            @Override
+            public BytesRef lookupOrd(long ord) {
+                lookupCount.increment();
+                int ordIndex = Math.toIntExact(ord);
+                return bytesRefs[ordIndex];
+            }
+
+            @Override
+            public long getValueCount() {
+                return bytesRefs.length;
+            }
+        };
+
+        LongBitSet acceptedOrds = ordinalsFilter.acceptedGlobalOrdinals(sortedSetDocValues);
+        long prefixLookupCount = lookupCount.longValue();
+        assertEquals(expectedFilter.length, acceptedOrds.length());
+        for (int i = 0; i < expectedFilter.length; i++) {
+            assertEquals(expectedFilter[i], acceptedOrds.get(i));
+        }
+
+        // Now repeat, but this time, the prefix optimization won't work (because of the .+ on the exclude filter)
+        includeExclude = new IncludeExclude("(color|length|size):.*", "color:g.+");
+        ordinalsFilter = includeExclude.convertToOrdinalsFilter(DocValueFormat.RAW);
+        acceptedOrds = ordinalsFilter.acceptedGlobalOrdinals(sortedSetDocValues);
+        long regexpLookupCount = lookupCount.longValue() - prefixLookupCount;
+
+        // The filter should be functionally the same
+        assertEquals(expectedFilter.length, acceptedOrds.length());
+        for (int i = 0; i < expectedFilter.length; i++) {
+            assertEquals(expectedFilter[i], acceptedOrds.get(i));
+        }
+
+        // But the full regexp requires more lookups
+        assertTrue(regexpLookupCount > prefixLookupCount);
+    }
+
+    public void testExtractPrefixes() {
+        // Positive tests
+        assertEquals(bytesRefs("a"), IncludeExclude.extractPrefixes("a.*", 1000));
+        assertEquals(bytesRefs("a", "b"), IncludeExclude.extractPrefixes("(a|b).*", 1000));
+        assertEquals(bytesRefs("ab"), IncludeExclude.extractPrefixes("a(b).*", 1000));
+        assertEquals(bytesRefs("ab", "ac"), IncludeExclude.extractPrefixes("a(b|c).*", 1000));
+        assertEquals(bytesRefs("aabb", "aacc"), IncludeExclude.extractPrefixes("aa(bb|cc).*", 1000));
+
+        // No regex means no prefixes
+        assertEquals(bytesRefs(), IncludeExclude.extractPrefixes(null, 1000));
+
+        // Negative tests
+        assertNull(IncludeExclude.extractPrefixes(".*", 1000)); // Literal match-all has no prefixes
+        assertNull(IncludeExclude.extractPrefixes("ab?.*", 1000)); // Don't support optionals ye
+        // The following cover various cases involving infinite possible prefixes
+        assertNull(IncludeExclude.extractPrefixes("ab*.*", 1000));
+        assertNull(IncludeExclude.extractPrefixes("a*(b|c).*", 1000));
+        assertNull(IncludeExclude.extractPrefixes("a(b*|c)d.*", 1000));
+        assertNull(IncludeExclude.extractPrefixes("a(b|c*)d.*", 1000));
+        assertNull(IncludeExclude.extractPrefixes("a(b|c*)d*.*", 1000));
+
+        // Test with too many possible prefixes -- 10 * 10 * 10 + 1 > 1000
+        assertNull(
+            IncludeExclude.extractPrefixes(
+                "((a1|a2|a3|a4|a5|a6|a7|a8|a9|a10)" + "(b1|b2|b3|b4|b5|b6|b7|b8|b9|b10)" + "(c1|c2|c3|c4|c5|c6|c7|c8|c9|c10)|x).*",
+                1000
+            )
+        );
+        // Test with too many possible prefixes -- 10 * 10 * 11 > 1000
+        assertNull(
+            IncludeExclude.extractPrefixes(
+                "((a1|a2|a3|a4|a5|a6|a7|a8|a9|a10)" + "(b1|b2|b3|b4|b5|b6|b7|b8|b9|b10)" + "(c1|c2|c3|c4|c5|c6|c7|c8|c9|c10|c11)).*",
+                1000
+            )
+        );
+    }
+
+    private static SortedSet<BytesRef> bytesRefs(String... strings) {
+        SortedSet<BytesRef> bytesRefs = new TreeSet<>();
+        for (String string : strings) {
+            bytesRefs.add(new BytesRef(string));
+        }
+        return bytesRefs;
+    }
+
+    private static SortedSetDocValues toDocValues(BytesRef[] bytesRefs) {
+        return new AbstractSortedSetDocValues() {
+            @Override
+            public boolean advanceExact(int target) {
+                return false;
+            }
+
+            @Override
+            public long nextOrd() {
+                return 0;
+            }
+
+            @Override
+            public int docValueCount() {
+                return 1;
+            }
+
+            @Override
+            public BytesRef lookupOrd(long ord) {
+                int ordIndex = Math.toIntExact(ord);
+                return bytesRefs[ordIndex];
+            }
+
+            @Override
+            public long getValueCount() {
+                return bytesRefs.length;
+            }
+        };
+    }
+
+    public void testOnlyIncludeExcludePrefix() throws IOException {
+        String incExcString = "(color:g|width:).*";
+        IncludeExclude excludeOnly = new IncludeExclude(null, incExcString);
+
+        OrdinalsFilter ordinalsFilter = excludeOnly.convertToOrdinalsFilter(DocValueFormat.RAW);
+        // Which of the following match the filter or not?
+        BytesRef[] bytesRefs = toBytesRefArray(
+            "color:blue", // true
+            "color:crimson", // true
+            "color:green", // false (excluded)
+            "color:gray", // false (excluded)
+            "depth:10in", // true
+            "depth:12in", // true
+            "depth:17in", // true
+            "length:long", // true
+            "length:medium", // true
+            "length:short", // true
+            "material:cotton", // true
+            "material:linen", // true
+            "material:polyester", // true
+            "size:large", // true
+            "size:medium", // true
+            "size:small", // true
+            "width:13in", // false
+            "width:27in", // false
+            "width:55in" // false
+        );
+        boolean[] expectedFilter = new boolean[] {
+            true,
+            true,
+            false,
+            false,
+            true,
+            true,
+            true,
+            true,
+            true,
+            true,
+            true,
+            true,
+            true,
+            true,
+            true,
+            true,
+            false,
+            false,
+            false };
+        LongBitSet longBitSet = ordinalsFilter.acceptedGlobalOrdinals(toDocValues(bytesRefs));
+
+        for (int i = 0; i < expectedFilter.length; i++) {
+            assertEquals(expectedFilter[i], longBitSet.get(i));
+        }
+
+        // Now repeat, but this time include only.
+        IncludeExclude includeOnly = new IncludeExclude(incExcString, null);
+        ordinalsFilter = includeOnly.convertToOrdinalsFilter(DocValueFormat.RAW);
+        longBitSet = ordinalsFilter.acceptedGlobalOrdinals(toDocValues(bytesRefs));
+
+        // The previously excluded ordinals should be included and vice versa.
+        for (int i = 0; i < expectedFilter.length; i++) {
+            assertEquals(!expectedFilter[i], longBitSet.get(i));
+        }
+    }
 }