From f1af999575bf13742708b17231c8add545246553 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Fri, 14 Jun 2024 18:31:46 -0700 Subject: [PATCH 1/7] Optimize global ordinal includes/excludes for prefix matching If an aggregration specifies includes or excludes based on a regular expression, and the regular expression has a finite expansion followed by .*, then we can optimize the global ordinal filter. Specifically, in this case, we can expand the matching prefixes, then include/exclude the range of global ordinals that start with each prefix. Signed-off-by: Michael Froh --- .../bucket/terms/IncludeExclude.java | 188 +++++++++++++++++- 1 file changed, 187 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java index f8061bcaca50f..857fe89e30526 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java @@ -57,7 +57,11 @@ import org.opensearch.search.DocValueFormat; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; +import java.util.List; import java.util.Objects; import java.util.Set; import java.util.SortedSet; @@ -86,6 +90,10 @@ public class IncludeExclude implements Writeable, ToXContentFragment { * https://github.com/opensearch-project/OpenSearch/issues/2858 */ private static final int DEFAULT_MAX_REGEX_LENGTH = 1000; + /** + * The maximum number of prefixes to extract from a regex in tryCreatePrefixOrdinalsFilter + */ + private static final int MAX_PREFIXES = 1000; // for parsing purposes only // TODO: move all aggs to the same package so that this stuff could be pkg-private @@ -393,6 +401,91 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro } + /** + * An ordinals filter that includes/excludes all ordinals corresponding to terms starting with the given prefixes + */ + static class PrefixBackedOrdinalsFilter extends OrdinalsFilter { + + private final SortedSet includePrefixes, excludePrefixes; + + private PrefixBackedOrdinalsFilter(SortedSet includePrefixes, SortedSet excludePrefixes) { + this.includePrefixes = includePrefixes; + this.excludePrefixes = excludePrefixes; + } + + private static BytesRef nextBytesRef(BytesRef bytesRef) { + BytesRef next = BytesRef.deepCopyOf(bytesRef); + int pos = next.offset + next.length - 1; + while (pos >= next.offset && next.bytes[pos] == -1) { + next.bytes[pos] = 0; + pos--; + } + if (pos >= next.offset) { + next.bytes[pos]++; + } else { + // Every byte in our prefix had value 0xFF. We must match all subsequent ordinals. + return null; + } + return next; + } + + private interface LongLongBiconsumer { + void accept(long a, long b); + } + + private static void process( + SortedSetDocValues globalOrdinals, + long length, + SortedSet prefixes, + LongLongBiconsumer consumer + ) throws IOException { + for (BytesRef prefix : prefixes) { + long startOrd = globalOrdinals.lookupTerm(prefix); + if (startOrd < 0) { + // The prefix is not an exact match in the ordinals (can skip equal length below) + startOrd = -1 - startOrd; + // Make sure that the term at startOrd starts with prefix + BytesRef startTerm = globalOrdinals.lookupOrd(startOrd); + if (startTerm.length <= prefix.length + || !Arrays.equals( + startTerm.bytes, + startTerm.offset, + startTerm.offset + prefix.length, + prefix.bytes, + prefix.offset, + prefix.offset + prefix.length + )) { + continue; + } + } + if (startOrd >= length) { + continue; + } + BytesRef next = nextBytesRef(prefix); + if (next == null) { + consumer.accept(startOrd, length); + } else { + long endOrd = globalOrdinals.lookupTerm(next); + if (endOrd < 0) { + endOrd = -1 - endOrd; + } + if (startOrd < endOrd) { + consumer.accept(startOrd, endOrd); + } + } + } + + } + + @Override + public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException { + LongBitSet accept = new LongBitSet(globalOrdinals.getValueCount()); + process(globalOrdinals, accept.length(), includePrefixes, accept::set); + process(globalOrdinals, accept.length(), excludePrefixes, accept::clear); + return accept; + } + } + private final String include, exclude; private final SortedSet includeValues, excludeValues; private final int incZeroBasedPartition; @@ -709,8 +802,13 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) { } public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRegexLength) { - if (isRegexBased()) { + if ((include == null || include.endsWith(".*")) && (exclude == null || exclude.endsWith(".*"))) { + PrefixBackedOrdinalsFilter prefixBackedOrdinalsFilter = tryCreatePrefixOrdinalsFilter(maxRegexLength); + if (prefixBackedOrdinalsFilter != null) { + return prefixBackedOrdinalsFilter; + } + } return new AutomatonBackedOrdinalsFilter(toAutomaton(maxRegexLength)); } if (isPartitionBased()) { @@ -720,6 +818,94 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRege return new TermListBackedOrdinalsFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format)); } + private static List expandRegexp(RegExp regExp, int maxPrefixes) { + switch (regExp.kind) { + case REGEXP_UNION: + List alternatives = new ArrayList<>(); + while (regExp.exp1.kind == RegExp.Kind.REGEXP_UNION) { + alternatives.add(regExp.exp2); + regExp = regExp.exp1; + } + alternatives.add(regExp.exp2); + alternatives.add(regExp.exp1); + List output = new ArrayList<>(); + for (RegExp leaf : alternatives) { + List leafExpansions = expandRegexp(leaf, maxPrefixes); + if (leafExpansions == null) { + return null; + } else { + if (output.size() + leafExpansions.size() > maxPrefixes) { + return null; + } + output.addAll(leafExpansions); + } + } + return output; + case REGEXP_CONCATENATION: + List prefixes = expandRegexp(regExp.exp1, maxPrefixes); + if (prefixes == null) { + return null; + } + List suffixes = expandRegexp(regExp.exp2, maxPrefixes); + if (suffixes == null) { + return null; + } + if (prefixes.size() * suffixes.size() > maxPrefixes) { + return null; + } + List out = new ArrayList<>(); + StringBuilder stringBuilder = new StringBuilder(); + for (String prefix : prefixes) { + for (String suffix : suffixes) { + stringBuilder.setLength(0); + stringBuilder.append(prefix).append(suffix); + out.add(stringBuilder.toString()); + } + } + return out; + case REGEXP_CHAR: + return List.of(Character.toString(regExp.c)); + case REGEXP_STRING: + return List.of(regExp.s); + default: + return null; + } + } + + private static SortedSet extractPrefixes(String pattern, int maxRegexLength) { + if (pattern == null) { + return Collections.emptySortedSet(); + } + SortedSet prefixSet = null; + validateRegExpStringLength(pattern, maxRegexLength); + RegExp regExp = new RegExp(pattern); + if (regExp.kind == RegExp.Kind.REGEXP_CONCATENATION && regExp.exp2.kind == RegExp.Kind.REGEXP_REPEAT) { + RegExp tail = regExp.exp2.exp1; + if (tail.kind == RegExp.Kind.REGEXP_ANYCHAR || tail.kind == RegExp.Kind.REGEXP_ANYSTRING) { + List prefixes = expandRegexp(regExp.exp1, MAX_PREFIXES); + if (prefixes != null) { + prefixSet = new TreeSet<>(); + for (String prefix : prefixes) { + prefixSet.add(new BytesRef(prefix)); + } + } + } + } + return prefixSet; + } + + private PrefixBackedOrdinalsFilter tryCreatePrefixOrdinalsFilter(int maxRegexLength) { + SortedSet includeSet = extractPrefixes(include, maxRegexLength); + if (includeSet == null) { + return null; + } + SortedSet excludeSet = extractPrefixes(exclude, maxRegexLength); + if (excludeSet == null) { + return null; + } + return new PrefixBackedOrdinalsFilter(includeSet, excludeSet); + } + public LongFilter convertToLongFilter(DocValueFormat format) { if (isPartitionBased()) { From 82748e3f1b787b65ce7b251c31373cb87dddeea4 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Tue, 18 Jun 2024 10:14:47 -0700 Subject: [PATCH 2/7] Add unit test Signed-off-by: Michael Froh --- .../support/IncludeExcludeTests.java | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java b/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java index f223648de6ef4..5d7806c93e3fb 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java @@ -51,6 +51,7 @@ import java.io.IOException; import java.util.Collections; import java.util.TreeSet; +import java.util.concurrent.atomic.LongAdder; public class IncludeExcludeTests extends OpenSearchTestCase { @@ -297,4 +298,112 @@ private IncludeExclude serializeMixedRegex(IncludeExclude incExc) throws IOExcep } } + private static BytesRef[] toBytesRefArray(String... values) { + BytesRef[] bytesRefs = new BytesRef[values.length]; + for (int i = 0; i < values.length; i++) { + bytesRefs[i] = new BytesRef(values[i]); + } + return bytesRefs; + } + + public void testPrefixOrds() throws IOException { + IncludeExclude includeExclude = new IncludeExclude("(a|c|e).*", "abc.*"); + + OrdinalsFilter ordinalsFilter = includeExclude.convertToOrdinalsFilter(DocValueFormat.RAW); + + // Which of the following match the filter or not? + BytesRef[] bytesRefs = toBytesRefArray( + "a", // true + "ab", // true + "abc", // false (excluded) + "abcd", // false (excluded) + "b", // false + "ba", // false + "bac", // false + "c", // true + "cd", // true + "ce", // true + "d", // false + "de", // false + "def", // false + "e", // true + "ef", // true + "efg", // true + "f", // false + "fg", // false + "fgh" // false + ); + boolean[] expectedFilter = new boolean[] { + true, + true, + false, + false, + false, + false, + false, + true, + true, + true, + false, + false, + false, + true, + true, + true, + false, + false, + false }; + + LongAdder lookupCount = new LongAdder(); + SortedSetDocValues sortedSetDocValues = new AbstractSortedSetDocValues() { + @Override + public boolean advanceExact(int target) { + return false; + } + + @Override + public long nextOrd() { + return 0; + } + + @Override + public int docValueCount() { + return 1; + } + + @Override + public BytesRef lookupOrd(long ord) { + lookupCount.increment(); + int ordIndex = Math.toIntExact(ord); + return bytesRefs[ordIndex]; + } + + @Override + public long getValueCount() { + return bytesRefs.length; + } + }; + + LongBitSet acceptedOrds = ordinalsFilter.acceptedGlobalOrdinals(sortedSetDocValues); + long prefixLookupCount = lookupCount.longValue(); + assertEquals(expectedFilter.length, acceptedOrds.length()); + for (int i = 0; i < expectedFilter.length; i++) { + assertEquals(expectedFilter[i], acceptedOrds.get(i)); + } + + // Now repeat, but this time, the prefix optimization won't work (because of the .+ on the exclude filter) + includeExclude = new IncludeExclude("(a|c|e).*", "ab.+"); + ordinalsFilter = includeExclude.convertToOrdinalsFilter(DocValueFormat.RAW); + acceptedOrds = ordinalsFilter.acceptedGlobalOrdinals(sortedSetDocValues); + long regexpLookupCount = lookupCount.longValue() - prefixLookupCount; + + // The filter should be functionally the same + assertEquals(expectedFilter.length, acceptedOrds.length()); + for (int i = 0; i < expectedFilter.length; i++) { + assertEquals(expectedFilter[i], acceptedOrds.get(i)); + } + + // But the full regexp requires more lookups + assertTrue(regexpLookupCount > prefixLookupCount); + } } From c477c59587d91d7638b707d0116f8b89bffae2ee Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Tue, 18 Jun 2024 10:25:22 -0700 Subject: [PATCH 3/7] Add changelog entry Signed-off-by: Michael Froh --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 663ae586a46a9..1c341a2c248d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Changed - Add lower limit for primary and replica batch allocators timeout ([#14979](https://github.com/opensearch-project/OpenSearch/pull/14979)) +- Optimize regexp-based include/exclude on aggregations when pattern matches prefixes ([#14371](https://github.com/opensearch-project/OpenSearch/pull/14371)) - Replace and block usages of org.apache.logging.log4j.util.Strings ([#15238](https://github.com/opensearch-project/OpenSearch/pull/15238)) ### Deprecated From f4db02fc6813c8fdcfb560dca8e32a3fef4d3845 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Tue, 18 Jun 2024 14:53:14 -0700 Subject: [PATCH 4/7] Improve test coverage Updated the unit test to be functionally equivalent, but it covers more of the regex logic. Signed-off-by: Michael Froh --- .../support/IncludeExcludeTests.java | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java b/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java index 5d7806c93e3fb..c747cae5ceb0b 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java @@ -307,31 +307,31 @@ private static BytesRef[] toBytesRefArray(String... values) { } public void testPrefixOrds() throws IOException { - IncludeExclude includeExclude = new IncludeExclude("(a|c|e).*", "abc.*"); + IncludeExclude includeExclude = new IncludeExclude("(color|length|size):.*", "color:g.*"); OrdinalsFilter ordinalsFilter = includeExclude.convertToOrdinalsFilter(DocValueFormat.RAW); // Which of the following match the filter or not? BytesRef[] bytesRefs = toBytesRefArray( - "a", // true - "ab", // true - "abc", // false (excluded) - "abcd", // false (excluded) - "b", // false - "ba", // false - "bac", // false - "c", // true - "cd", // true - "ce", // true - "d", // false - "de", // false - "def", // false - "e", // true - "ef", // true - "efg", // true - "f", // false - "fg", // false - "fgh" // false + "color:blue", // true + "color:crimson", // true + "color:green", // false (excluded) + "color:gray", // false (excluded) + "depth:10in", // false + "depth:12in", // false + "depth:17in", // false + "length:long", // true + "length:medium", // true + "length:short", // true + "material:cotton", // false + "material:linen", // false + "material:polyester", // false + "size:large", // true + "size:medium", // true + "size:small", // true + "width:13in", // false + "width:27in", // false + "width:55in" // false ); boolean[] expectedFilter = new boolean[] { true, @@ -355,6 +355,7 @@ public void testPrefixOrds() throws IOException { false }; LongAdder lookupCount = new LongAdder(); + LongAdder termsEnumAcceptCount = new LongAdder(); SortedSetDocValues sortedSetDocValues = new AbstractSortedSetDocValues() { @Override public boolean advanceExact(int target) { @@ -390,9 +391,10 @@ public long getValueCount() { for (int i = 0; i < expectedFilter.length; i++) { assertEquals(expectedFilter[i], acceptedOrds.get(i)); } + assertEquals(0, termsEnumAcceptCount.longValue()); // Now repeat, but this time, the prefix optimization won't work (because of the .+ on the exclude filter) - includeExclude = new IncludeExclude("(a|c|e).*", "ab.+"); + includeExclude = new IncludeExclude("(color|length|size):.*", "color:g.+"); ordinalsFilter = includeExclude.convertToOrdinalsFilter(DocValueFormat.RAW); acceptedOrds = ordinalsFilter.acceptedGlobalOrdinals(sortedSetDocValues); long regexpLookupCount = lookupCount.longValue() - prefixLookupCount; From 5abb4cfff661127862738c81a3e7662fbf687ea7 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Tue, 30 Jul 2024 17:41:14 -0700 Subject: [PATCH 5/7] Improve test coverage Signed-off-by: Michael Froh --- .../bucket/terms/IncludeExclude.java | 9 +++- .../terms}/IncludeExcludeTests.java | 49 ++++++++++++++++++- 2 files changed, 54 insertions(+), 4 deletions(-) rename server/src/test/java/org/opensearch/search/aggregations/{support => bucket/terms}/IncludeExcludeTests.java (87%) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java index 857fe89e30526..d2768a7fea7ed 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java @@ -480,7 +480,12 @@ private static void process( @Override public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException { LongBitSet accept = new LongBitSet(globalOrdinals.getValueCount()); - process(globalOrdinals, accept.length(), includePrefixes, accept::set); + if (includePrefixes.isEmpty()) { + // Exclude-only + accept.set(0, accept.length()); + } else { + process(globalOrdinals, accept.length(), includePrefixes, accept::set); + } process(globalOrdinals, accept.length(), excludePrefixes, accept::clear); return accept; } @@ -872,7 +877,7 @@ private static List expandRegexp(RegExp regExp, int maxPrefixes) { } } - private static SortedSet extractPrefixes(String pattern, int maxRegexLength) { + static SortedSet extractPrefixes(String pattern, int maxRegexLength) { if (pattern == null) { return Collections.emptySortedSet(); } diff --git a/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java similarity index 87% rename from server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java rename to server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java index c747cae5ceb0b..20d5f84c1a406 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/support/IncludeExcludeTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java @@ -30,7 +30,7 @@ * GitHub history for details. */ -package org.opensearch.search.aggregations.support; +package org.opensearch.search.aggregations.bucket.terms; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.SortedSetDocValues; @@ -44,12 +44,12 @@ import org.opensearch.core.xcontent.XContentParser; import org.opensearch.index.fielddata.AbstractSortedSetDocValues; import org.opensearch.search.DocValueFormat; -import org.opensearch.search.aggregations.bucket.terms.IncludeExclude; import org.opensearch.search.aggregations.bucket.terms.IncludeExclude.OrdinalsFilter; import org.opensearch.test.OpenSearchTestCase; import java.io.IOException; import java.util.Collections; +import java.util.SortedSet; import java.util.TreeSet; import java.util.concurrent.atomic.LongAdder; @@ -408,4 +408,49 @@ public long getValueCount() { // But the full regexp requires more lookups assertTrue(regexpLookupCount > prefixLookupCount); } + + public void testExtractPrefixes() { + // Positive tests + assertEquals(bytesRefs("a"), IncludeExclude.extractPrefixes("a.*", 1000)); + assertEquals(bytesRefs("a", "b"), IncludeExclude.extractPrefixes("(a|b).*", 1000)); + assertEquals(bytesRefs("ab"), IncludeExclude.extractPrefixes("a(b).*", 1000)); + assertEquals(bytesRefs("ab", "ac"), IncludeExclude.extractPrefixes("a(b|c).*", 1000)); + assertEquals(bytesRefs("aabb", "aacc"), IncludeExclude.extractPrefixes("aa(bb|cc).*", 1000)); + + // No regex means no prefixes + assertEquals(bytesRefs(), IncludeExclude.extractPrefixes(null, 1000)); + + // Negative tests + assertNull(IncludeExclude.extractPrefixes(".*", 1000)); // Literal match-all has no prefixes + assertNull(IncludeExclude.extractPrefixes("ab?.*", 1000)); // Don't support optionals ye + // The following cover various cases involving infinite possible prefixes + assertNull(IncludeExclude.extractPrefixes("ab*.*", 1000)); + assertNull(IncludeExclude.extractPrefixes("a*(b|c).*", 1000)); + assertNull(IncludeExclude.extractPrefixes("a(b*|c)d.*", 1000)); + assertNull(IncludeExclude.extractPrefixes("a(b|c*)d.*", 1000)); + assertNull(IncludeExclude.extractPrefixes("a(b|c*)d*.*", 1000)); + + // Test with too many possible prefixes -- 10 * 10 * 10 + 1 > 1000 + assertNull( + IncludeExclude.extractPrefixes( + "((a1|a2|a3|a4|a5|a6|a7|a8|a9|a10)" + "(b1|b2|b3|b4|b5|b6|b7|b8|b9|b10)" + "(c1|c2|c3|c4|c5|c6|c7|c8|c9|c10)|x).*", + 1000 + ) + ); + // Test with too many possible prefixes -- 10 * 10 * 11 > 1000 + assertNull( + IncludeExclude.extractPrefixes( + "((a1|a2|a3|a4|a5|a6|a7|a8|a9|a10)" + "(b1|b2|b3|b4|b5|b6|b7|b8|b9|b10)" + "(c1|c2|c3|c4|c5|c6|c7|c8|c9|c10|c11)).*", + 1000 + ) + ); + } + + private static SortedSet bytesRefs(String... strings) { + SortedSet bytesRefs = new TreeSet<>(); + for (String string : strings) { + bytesRefs.add(new BytesRef(string)); + } + return bytesRefs; + } } From 5a2c2dd67d245468620acd9ddf54e829a6510476 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Thu, 1 Aug 2024 10:21:46 -0700 Subject: [PATCH 6/7] Fix bug in exclude-only case with no doc values in segment Signed-off-by: Michael Froh --- .../search/aggregations/bucket/terms/IncludeExclude.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java index d2768a7fea7ed..de83c7221835f 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java @@ -480,11 +480,11 @@ private static void process( @Override public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException { LongBitSet accept = new LongBitSet(globalOrdinals.getValueCount()); - if (includePrefixes.isEmpty()) { + if (!includePrefixes.isEmpty()) { + process(globalOrdinals, accept.length(), includePrefixes, accept::set); + } else if (accept.length() > 0) { // Exclude-only accept.set(0, accept.length()); - } else { - process(globalOrdinals, accept.length(), includePrefixes, accept::set); } process(globalOrdinals, accept.length(), excludePrefixes, accept::clear); return accept; From 1bc728a462d64bb3b4a7e59adaf62c8bebbcbd08 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Mon, 19 Aug 2024 18:01:35 -0700 Subject: [PATCH 7/7] Address comments from @mch2 Signed-off-by: Michael Froh --- .../bucket/terms/IncludeExclude.java | 10 +- .../bucket/terms/IncludeExcludeTests.java | 96 ++++++++++++++++++- 2 files changed, 97 insertions(+), 9 deletions(-) diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java index de83c7221835f..20f294bc7199b 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/IncludeExclude.java @@ -429,16 +429,12 @@ private static BytesRef nextBytesRef(BytesRef bytesRef) { return next; } - private interface LongLongBiconsumer { + private interface LongBiConsumer { void accept(long a, long b); } - private static void process( - SortedSetDocValues globalOrdinals, - long length, - SortedSet prefixes, - LongLongBiconsumer consumer - ) throws IOException { + private static void process(SortedSetDocValues globalOrdinals, long length, SortedSet prefixes, LongBiConsumer consumer) + throws IOException { for (BytesRef prefix : prefixes) { long startOrd = globalOrdinals.lookupTerm(prefix); if (startOrd < 0) { diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java index 20d5f84c1a406..2356e2dc6d855 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/IncludeExcludeTests.java @@ -355,7 +355,6 @@ public void testPrefixOrds() throws IOException { false }; LongAdder lookupCount = new LongAdder(); - LongAdder termsEnumAcceptCount = new LongAdder(); SortedSetDocValues sortedSetDocValues = new AbstractSortedSetDocValues() { @Override public boolean advanceExact(int target) { @@ -391,7 +390,6 @@ public long getValueCount() { for (int i = 0; i < expectedFilter.length; i++) { assertEquals(expectedFilter[i], acceptedOrds.get(i)); } - assertEquals(0, termsEnumAcceptCount.longValue()); // Now repeat, but this time, the prefix optimization won't work (because of the .+ on the exclude filter) includeExclude = new IncludeExclude("(color|length|size):.*", "color:g.+"); @@ -453,4 +451,98 @@ private static SortedSet bytesRefs(String... strings) { } return bytesRefs; } + + private static SortedSetDocValues toDocValues(BytesRef[] bytesRefs) { + return new AbstractSortedSetDocValues() { + @Override + public boolean advanceExact(int target) { + return false; + } + + @Override + public long nextOrd() { + return 0; + } + + @Override + public int docValueCount() { + return 1; + } + + @Override + public BytesRef lookupOrd(long ord) { + int ordIndex = Math.toIntExact(ord); + return bytesRefs[ordIndex]; + } + + @Override + public long getValueCount() { + return bytesRefs.length; + } + }; + } + + public void testOnlyIncludeExcludePrefix() throws IOException { + String incExcString = "(color:g|width:).*"; + IncludeExclude excludeOnly = new IncludeExclude(null, incExcString); + + OrdinalsFilter ordinalsFilter = excludeOnly.convertToOrdinalsFilter(DocValueFormat.RAW); + // Which of the following match the filter or not? + BytesRef[] bytesRefs = toBytesRefArray( + "color:blue", // true + "color:crimson", // true + "color:green", // false (excluded) + "color:gray", // false (excluded) + "depth:10in", // true + "depth:12in", // true + "depth:17in", // true + "length:long", // true + "length:medium", // true + "length:short", // true + "material:cotton", // true + "material:linen", // true + "material:polyester", // true + "size:large", // true + "size:medium", // true + "size:small", // true + "width:13in", // false + "width:27in", // false + "width:55in" // false + ); + boolean[] expectedFilter = new boolean[] { + true, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + false }; + LongBitSet longBitSet = ordinalsFilter.acceptedGlobalOrdinals(toDocValues(bytesRefs)); + + for (int i = 0; i < expectedFilter.length; i++) { + assertEquals(expectedFilter[i], longBitSet.get(i)); + } + + // Now repeat, but this time include only. + IncludeExclude includeOnly = new IncludeExclude(incExcString, null); + ordinalsFilter = includeOnly.convertToOrdinalsFilter(DocValueFormat.RAW); + longBitSet = ordinalsFilter.acceptedGlobalOrdinals(toDocValues(bytesRefs)); + + // The previously excluded ordinals should be included and vice versa. + for (int i = 0; i < expectedFilter.length; i++) { + assertEquals(!expectedFilter[i], longBitSet.get(i)); + } + } }