From a99e985771f205a1df5dba9cd9f7b856060ff492 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 20 Aug 2024 17:07:51 +0000 Subject: [PATCH] Optimize global ordinal includes/excludes for prefix matching (#14371) * Optimize global ordinal includes/excludes for prefix matching If an aggregration specifies includes or excludes based on a regular expression, and the regular expression has a finite expansion followed by .*, then we can optimize the global ordinal filter. Specifically, in this case, we can expand the matching prefixes, then include/exclude the range of global ordinals that start with each prefix. Signed-off-by: Michael Froh * Add unit test Signed-off-by: Michael Froh * Add changelog entry Signed-off-by: Michael Froh * Improve test coverage Updated the unit test to be functionally equivalent, but it covers more of the regex logic. Signed-off-by: Michael Froh * Improve test coverage Signed-off-by: Michael Froh * Fix bug in exclude-only case with no doc values in segment Signed-off-by: Michael Froh * Address comments from @mch2 Signed-off-by: Michael Froh --------- Signed-off-by: Michael Froh (cherry picked from commit 13163aba31fd3fef078897068040b0fc4c35b251) Signed-off-by: github-actions[bot] --- CHANGELOG.md | 1 + .../bucket/terms/IncludeExclude.java | 189 ++++++++++++- .../terms}/IncludeExcludeTests.java | 252 +++++++++++++++++- 3 files changed, 439 insertions(+), 3 deletions(-) rename server/src/test/java/org/opensearch/search/aggregations/{support => bucket/terms}/IncludeExcludeTests.java (58%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9717d1cbdfe41..4a368eea4bf0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Changed - Add lower limit for primary and replica batch allocators timeout ([#14979](https://github.com/opensearch-project/OpenSearch/pull/14979)) +- Optimize regexp-based include/exclude on aggregations when pattern matches prefixes ([#14371](https://github.com/opensearch-project/OpenSearch/pull/14371)) - Replace and block usages of org.apache.logging.log4j.util.Strings ([#15238](https://github.com/opensearch-project/OpenSearch/pull/15238)) ### Deprecated diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java index f8061bcaca50f..20f294bc7199b 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java @@ -57,7 +57,11 @@ import org.opensearch.search.DocValueFormat; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; +import java.util.List; import java.util.Objects; import java.util.Set; import java.util.SortedSet; @@ -86,6 +90,10 @@ public class IncludeExclude implements Writeable, ToXContentFragment { * https://github.com/opensearch-project/OpenSearch/issues/2858 */ private static final int DEFAULT_MAX_REGEX_LENGTH = 1000; + /** + * The maximum number of prefixes to extract from a regex in tryCreatePrefixOrdinalsFilter + */ + private static final int MAX_PREFIXES = 1000; // for parsing purposes only // TODO: move all aggs to the same package so that this stuff could be pkg-private @@ -393,6 +401,92 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro } + /** + * An ordinals filter that includes/excludes all ordinals corresponding to terms starting with the given prefixes + */ + static class PrefixBackedOrdinalsFilter extends OrdinalsFilter { + + private final SortedSet includePrefixes, excludePrefixes; + + private PrefixBackedOrdinalsFilter(SortedSet includePrefixes, SortedSet excludePrefixes) { + this.includePrefixes = includePrefixes; + this.excludePrefixes = excludePrefixes; + } + + private static BytesRef nextBytesRef(BytesRef bytesRef) { + BytesRef next = BytesRef.deepCopyOf(bytesRef); + int pos = next.offset + next.length - 1; + while (pos >= next.offset && next.bytes[pos] == -1) { + next.bytes[pos] = 0; + pos--; + } + if (pos >= next.offset) { + next.bytes[pos]++; + } else { + // Every byte in our prefix had value 0xFF. We must match all subsequent ordinals. + return null; + } + return next; + } + + private interface LongBiConsumer { + void accept(long a, long b); + } + + private static void process(SortedSetDocValues globalOrdinals, long length, SortedSet prefixes, LongBiConsumer consumer) + throws IOException { + for (BytesRef prefix : prefixes) { + long startOrd = globalOrdinals.lookupTerm(prefix); + if (startOrd < 0) { + // The prefix is not an exact match in the ordinals (can skip equal length below) + startOrd = -1 - startOrd; + // Make sure that the term at startOrd starts with prefix + BytesRef startTerm = globalOrdinals.lookupOrd(startOrd); + if (startTerm.length <= prefix.length + || !Arrays.equals( + startTerm.bytes, + startTerm.offset, + startTerm.offset + prefix.length, + prefix.bytes, + prefix.offset, + prefix.offset + prefix.length + )) { + continue; + } + } + if (startOrd >= length) { + continue; + } + BytesRef next = nextBytesRef(prefix); + if (next == null) { + consumer.accept(startOrd, length); + } else { + long endOrd = globalOrdinals.lookupTerm(next); + if (endOrd < 0) { + endOrd = -1 - endOrd; + } + if (startOrd < endOrd) { + consumer.accept(startOrd, endOrd); + } + } + } + + } + + @Override + public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException { + LongBitSet accept = new LongBitSet(globalOrdinals.getValueCount()); + if (!includePrefixes.isEmpty()) { + process(globalOrdinals, accept.length(), includePrefixes, accept::set); + } else if (accept.length() > 0) { + // Exclude-only + accept.set(0, accept.length()); + } + process(globalOrdinals, accept.length(), excludePrefixes, accept::clear); + return accept; + } + } + private final String include, exclude; private final SortedSet includeValues, excludeValues; private final int incZeroBasedPartition; @@ -709,8 +803,13 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) { } public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRegexLength) { - if (isRegexBased()) { + if ((include == null || include.endsWith(".*")) && (exclude == null || exclude.endsWith(".*"))) { + PrefixBackedOrdinalsFilter prefixBackedOrdinalsFilter = tryCreatePrefixOrdinalsFilter(maxRegexLength); + if (prefixBackedOrdinalsFilter != null) { + return prefixBackedOrdinalsFilter; + } + } return new AutomatonBackedOrdinalsFilter(toAutomaton(maxRegexLength)); } if (isPartitionBased()) { @@ -720,6 +819,94 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRege return new TermListBackedOrdinalsFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format)); } + private static List expandRegexp(RegExp regExp, int maxPrefixes) { + switch (regExp.kind) { + case REGEXP_UNION: + List alternatives = new ArrayList<>(); + while (regExp.exp1.kind == RegExp.Kind.REGEXP_UNION) { + alternatives.add(regExp.exp2); + regExp = regExp.exp1; + } + alternatives.add(regExp.exp2); + alternatives.add(regExp.exp1); + List output = new ArrayList<>(); + for (RegExp leaf : alternatives) { + List leafExpansions = expandRegexp(leaf, maxPrefixes); + if (leafExpansions == null) { + return null; + } else { + if (output.size() + leafExpansions.size() > maxPrefixes) { + return null; + } + output.addAll(leafExpansions); + } + } + return output; + case REGEXP_CONCATENATION: + List prefixes = expandRegexp(regExp.exp1, maxPrefixes); + if (prefixes == null) { + return null; + } + List suffixes = expandRegexp(regExp.exp2, maxPrefixes); + if (suffixes == null) { + return null; + } + if (prefixes.size() * suffixes.size() > maxPrefixes) { + return null; + } + List out = new ArrayList<>(); + StringBuilder stringBuilder = new StringBuilder(); + for (String prefix : prefixes) { + for (String suffix : suffixes) { + stringBuilder.setLength(0); + stringBuilder.append(prefix).append(suffix); + out.add(stringBuilder.toString()); + } + } + return out; + case REGEXP_CHAR: + return List.of(Character.toString(regExp.c)); + case REGEXP_STRING: + return List.of(regExp.s); + default: + return null; + } + } + + static SortedSet extractPrefixes(String pattern, int maxRegexLength) { + if (pattern == null) { + return Collections.emptySortedSet(); + } + SortedSet prefixSet = null; + validateRegExpStringLength(pattern, maxRegexLength); + RegExp regExp = new RegExp(pattern); + if (regExp.kind == RegExp.Kind.REGEXP_CONCATENATION && regExp.exp2.kind == RegExp.Kind.REGEXP_REPEAT) { + RegExp tail = regExp.exp2.exp1; + if (tail.kind == RegExp.Kind.REGEXP_ANYCHAR || tail.kind == RegExp.Kind.REGEXP_ANYSTRING) { + List prefixes = expandRegexp(regExp.exp1, MAX_PREFIXES); + if (prefixes != null) { + prefixSet = new TreeSet<>(); + for (String prefix : prefixes) { + prefixSet.add(new BytesRef(prefix)); + } + } + } + } + return prefixSet; + } + + private PrefixBackedOrdinalsFilter tryCreatePrefixOrdinalsFilter(int maxRegexLength) { + SortedSet includeSet = extractPrefixes(include, maxRegexLength); + if (includeSet == null) { + return null; + } + SortedSet excludeSet = extractPrefixes(exclude, maxRegexLength); + if (excludeSet == null) { + return null; + } + return new PrefixBackedOrdinalsFilter(includeSet, excludeSet); + } + public LongFilter convertToLongFilter(DocValueFormat format) { if (isPartitionBased()) { diff --git a/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java similarity index 58% rename from server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java rename to server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java index f223648de6ef4..2356e2dc6d855 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java @@ -30,7 +30,7 @@ * GitHub history for details. */ -package org.opensearch.search.aggregations.support; +package org.opensearch.search.aggregations.bucket.terms; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.SortedSetDocValues; @@ -44,13 +44,14 @@ import org.opensearch.core.xcontent.XContentParser; import org.opensearch.index.fielddata.AbstractSortedSetDocValues; import org.opensearch.search.DocValueFormat; -import org.opensearch.search.aggregations.bucket.terms.IncludeExclude; import org.opensearch.search.aggregations.bucket.terms.IncludeExclude.OrdinalsFilter; import org.opensearch.test.OpenSearchTestCase; import java.io.IOException; import java.util.Collections; +import java.util.SortedSet; import java.util.TreeSet; +import java.util.concurrent.atomic.LongAdder; public class IncludeExcludeTests extends OpenSearchTestCase { @@ -297,4 +298,251 @@ private IncludeExclude serializeMixedRegex(IncludeExclude incExc) throws IOExcep } } + private static BytesRef[] toBytesRefArray(String... values) { + BytesRef[] bytesRefs = new BytesRef[values.length]; + for (int i = 0; i < values.length; i++) { + bytesRefs[i] = new BytesRef(values[i]); + } + return bytesRefs; + } + + public void testPrefixOrds() throws IOException { + IncludeExclude includeExclude = new IncludeExclude("(color|length|size):.*", "color:g.*"); + + OrdinalsFilter ordinalsFilter = includeExclude.convertToOrdinalsFilter(DocValueFormat.RAW); + + // Which of the following match the filter or not? + BytesRef[] bytesRefs = toBytesRefArray( + "color:blue", // true + "color:crimson", // true + "color:green", // false (excluded) + "color:gray", // false (excluded) + "depth:10in", // false + "depth:12in", // false + "depth:17in", // false + "length:long", // true + "length:medium", // true + "length:short", // true + "material:cotton", // false + "material:linen", // false + "material:polyester", // false + "size:large", // true + "size:medium", // true + "size:small", // true + "width:13in", // false + "width:27in", // false + "width:55in" // false + ); + boolean[] expectedFilter = new boolean[] { + true, + true, + false, + false, + false, + false, + false, + true, + true, + true, + false, + false, + false, + true, + true, + true, + false, + false, + false }; + + LongAdder lookupCount = new LongAdder(); + SortedSetDocValues sortedSetDocValues = new AbstractSortedSetDocValues() { + @Override + public boolean advanceExact(int target) { + return false; + } + + @Override + public long nextOrd() { + return 0; + } + + @Override + public int docValueCount() { + return 1; + } + + @Override + public BytesRef lookupOrd(long ord) { + lookupCount.increment(); + int ordIndex = Math.toIntExact(ord); + return bytesRefs[ordIndex]; + } + + @Override + public long getValueCount() { + return bytesRefs.length; + } + }; + + LongBitSet acceptedOrds = ordinalsFilter.acceptedGlobalOrdinals(sortedSetDocValues); + long prefixLookupCount = lookupCount.longValue(); + assertEquals(expectedFilter.length, acceptedOrds.length()); + for (int i = 0; i < expectedFilter.length; i++) { + assertEquals(expectedFilter[i], acceptedOrds.get(i)); + } + + // Now repeat, but this time, the prefix optimization won't work (because of the .+ on the exclude filter) + includeExclude = new IncludeExclude("(color|length|size):.*", "color:g.+"); + ordinalsFilter = includeExclude.convertToOrdinalsFilter(DocValueFormat.RAW); + acceptedOrds = ordinalsFilter.acceptedGlobalOrdinals(sortedSetDocValues); + long regexpLookupCount = lookupCount.longValue() - prefixLookupCount; + + // The filter should be functionally the same + assertEquals(expectedFilter.length, acceptedOrds.length()); + for (int i = 0; i < expectedFilter.length; i++) { + assertEquals(expectedFilter[i], acceptedOrds.get(i)); + } + + // But the full regexp requires more lookups + assertTrue(regexpLookupCount > prefixLookupCount); + } + + public void testExtractPrefixes() { + // Positive tests + assertEquals(bytesRefs("a"), IncludeExclude.extractPrefixes("a.*", 1000)); + assertEquals(bytesRefs("a", "b"), IncludeExclude.extractPrefixes("(a|b).*", 1000)); + assertEquals(bytesRefs("ab"), IncludeExclude.extractPrefixes("a(b).*", 1000)); + assertEquals(bytesRefs("ab", "ac"), IncludeExclude.extractPrefixes("a(b|c).*", 1000)); + assertEquals(bytesRefs("aabb", "aacc"), IncludeExclude.extractPrefixes("aa(bb|cc).*", 1000)); + + // No regex means no prefixes + assertEquals(bytesRefs(), IncludeExclude.extractPrefixes(null, 1000)); + + // Negative tests + assertNull(IncludeExclude.extractPrefixes(".*", 1000)); // Literal match-all has no prefixes + assertNull(IncludeExclude.extractPrefixes("ab?.*", 1000)); // Don't support optionals ye + // The following cover various cases involving infinite possible prefixes + assertNull(IncludeExclude.extractPrefixes("ab*.*", 1000)); + assertNull(IncludeExclude.extractPrefixes("a*(b|c).*", 1000)); + assertNull(IncludeExclude.extractPrefixes("a(b*|c)d.*", 1000)); + assertNull(IncludeExclude.extractPrefixes("a(b|c*)d.*", 1000)); + assertNull(IncludeExclude.extractPrefixes("a(b|c*)d*.*", 1000)); + + // Test with too many possible prefixes -- 10 * 10 * 10 + 1 > 1000 + assertNull( + IncludeExclude.extractPrefixes( + "((a1|a2|a3|a4|a5|a6|a7|a8|a9|a10)" + "(b1|b2|b3|b4|b5|b6|b7|b8|b9|b10)" + "(c1|c2|c3|c4|c5|c6|c7|c8|c9|c10)|x).*", + 1000 + ) + ); + // Test with too many possible prefixes -- 10 * 10 * 11 > 1000 + assertNull( + IncludeExclude.extractPrefixes( + "((a1|a2|a3|a4|a5|a6|a7|a8|a9|a10)" + "(b1|b2|b3|b4|b5|b6|b7|b8|b9|b10)" + "(c1|c2|c3|c4|c5|c6|c7|c8|c9|c10|c11)).*", + 1000 + ) + ); + } + + private static SortedSet bytesRefs(String... strings) { + SortedSet bytesRefs = new TreeSet<>(); + for (String string : strings) { + bytesRefs.add(new BytesRef(string)); + } + return bytesRefs; + } + + private static SortedSetDocValues toDocValues(BytesRef[] bytesRefs) { + return new AbstractSortedSetDocValues() { + @Override + public boolean advanceExact(int target) { + return false; + } + + @Override + public long nextOrd() { + return 0; + } + + @Override + public int docValueCount() { + return 1; + } + + @Override + public BytesRef lookupOrd(long ord) { + int ordIndex = Math.toIntExact(ord); + return bytesRefs[ordIndex]; + } + + @Override + public long getValueCount() { + return bytesRefs.length; + } + }; + } + + public void testOnlyIncludeExcludePrefix() throws IOException { + String incExcString = "(color:g|width:).*"; + IncludeExclude excludeOnly = new IncludeExclude(null, incExcString); + + OrdinalsFilter ordinalsFilter = excludeOnly.convertToOrdinalsFilter(DocValueFormat.RAW); + // Which of the following match the filter or not? + BytesRef[] bytesRefs = toBytesRefArray( + "color:blue", // true + "color:crimson", // true + "color:green", // false (excluded) + "color:gray", // false (excluded) + "depth:10in", // true + "depth:12in", // true + "depth:17in", // true + "length:long", // true + "length:medium", // true + "length:short", // true + "material:cotton", // true + "material:linen", // true + "material:polyester", // true + "size:large", // true + "size:medium", // true + "size:small", // true + "width:13in", // false + "width:27in", // false + "width:55in" // false + ); + boolean[] expectedFilter = new boolean[] { + true, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + false }; + LongBitSet longBitSet = ordinalsFilter.acceptedGlobalOrdinals(toDocValues(bytesRefs)); + + for (int i = 0; i < expectedFilter.length; i++) { + assertEquals(expectedFilter[i], longBitSet.get(i)); + } + + // Now repeat, but this time include only. + IncludeExclude includeOnly = new IncludeExclude(incExcString, null); + ordinalsFilter = includeOnly.convertToOrdinalsFilter(DocValueFormat.RAW); + longBitSet = ordinalsFilter.acceptedGlobalOrdinals(toDocValues(bytesRefs)); + + // The previously excluded ordinals should be included and vice versa. + for (int i = 0; i < expectedFilter.length; i++) { + assertEquals(!expectedFilter[i], longBitSet.get(i)); + } + } }