From f47573dd6c0bfa5ff27c914a2231b0348d880afa Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Thu, 30 May 2024 00:17:13 +0000 Subject: [PATCH] Add efficient first-phase implementation for regexp queries Following the description in https://swtch.com/~rsc/regexp/regexp4.html, along with the fact that Lucene gives us a regular expression parse tree, we *can* implement an efficient first-phase match on regular expressions without needing to write a lot of code. Signed-off-by: Michael Froh --- .../index/mapper/WildcardFieldMapper.java | 89 ++++++++++++++++--- ...est.java => WildcardFieldMapperTests.java} | 10 +-- ...eTest.java => WildcardFieldTypeTests.java} | 60 +++++++++++-- 3 files changed, 137 insertions(+), 22 deletions(-) rename server/src/test/java/org/opensearch/index/mapper/{WildcardFieldMapperTest.java => WildcardFieldMapperTests.java} (88%) rename server/src/test/java/org/opensearch/index/mapper/{WildcardFieldTypeTest.java => WildcardFieldTypeTests.java} (50%) diff --git a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java index ec5bfb18550cd..14fd3c174edb7 100644 --- a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java +++ b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java @@ -34,6 +34,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RegExp; import org.opensearch.common.lucene.BytesRefs; import org.opensearch.common.lucene.Lucene; import org.opensearch.common.regex.Regex; @@ -52,6 +53,7 @@ import java.io.IOException; import java.io.StringReader; import java.io.UncheckedIOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; @@ -62,7 +64,6 @@ import java.util.Set; import java.util.function.Predicate; import java.util.function.Supplier; -import java.util.regex.Pattern; import static org.opensearch.index.mapper.KeywordFieldMapper.normalizeValue; @@ -500,17 +501,76 @@ public Query regexpQuery( MultiTermQuery.RewriteMethod method, QueryShardContext context ) { - // TODO -- Extracting mandatory characters from a regex is not trivial, since entire blocks may be optional. - // It is functionally correct to approximate with MatchAllDocs, but performance won't be good. - return new WildcardMatchingQuery( - name(), - new MatchAllDocsQuery(), - Pattern.compile(value).asMatchPredicate(), - "/" + value + "/", - context, - this - ); - + RegExp regExp = new RegExp(value, syntaxFlags, matchFlags); + Automaton automaton = regExp.toAutomaton(maxDeterminizedStates); + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton); + + return new WildcardMatchingQuery(name(), regexpToQuery(name(), regExp), s -> { + BytesRef valueBytes = BytesRefs.toBytesRef(s); + return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length); + }, "/" + value + "/", context, this); + } + + /** + * Implement the match rules described in Regular Expression Matching with a Trigram Index. + * + * @param fieldName name of the wildcard field + * @param regExp a parsed node in the {@link RegExp} tree + * @return a query that matches on the known required parts of the given regular expression + */ + private static Query regexpToQuery(String fieldName, RegExp regExp) { + Query query; + if (Objects.requireNonNull(regExp.kind) == RegExp.Kind.REGEXP_UNION) { + List clauses = new ArrayList<>(); + while (regExp.exp1.kind == RegExp.Kind.REGEXP_UNION) { + clauses.add(regexpToQuery(fieldName, regExp.exp2)); + regExp = regExp.exp1; + } + clauses.add(regexpToQuery(fieldName, regExp.exp2)); + clauses.add(regexpToQuery(fieldName, regExp.exp1)); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (int i = clauses.size() - 1; i >= 0; i--) { + Query clause = clauses.get(i); + if (clause instanceof MatchAllDocsQuery) { + return clause; + } + builder.add(clause, BooleanClause.Occur.SHOULD); + } + query = builder.build(); + } else if (regExp.kind == RegExp.Kind.REGEXP_STRING) { + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (String string : getRequiredNGrams("*" + regExp.s + "*")) { + builder.add(new TermQuery(new Term(fieldName, string)), BooleanClause.Occur.FILTER); + } + query = builder.build(); + } else if (regExp.kind == RegExp.Kind.REGEXP_CONCATENATION) { + List clauses = new ArrayList<>(); + while (regExp.exp1.kind == RegExp.Kind.REGEXP_CONCATENATION) { + clauses.add(regexpToQuery(fieldName, regExp.exp2)); + regExp = regExp.exp1; + } + clauses.add(regexpToQuery(fieldName, regExp.exp2)); + clauses.add(regexpToQuery(fieldName, regExp.exp1)); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (int i = clauses.size() - 1; i >= 0; i--) { + Query clause = clauses.get(i); + if (!(clause instanceof MatchAllDocsQuery)) { + builder.add(clause, BooleanClause.Occur.FILTER); + } + } + query = builder.build(); + } else if (regExp.kind == RegExp.Kind.REGEXP_REPEAT_MIN || regExp.kind == RegExp.Kind.REGEXP_REPEAT_MINMAX) { + return regexpToQuery(fieldName, regExp.exp1); + } else { + return new MatchAllDocsQuery(); + } + if (query instanceof BooleanQuery) { + BooleanQuery booleanQuery = (BooleanQuery) query; + if (booleanQuery.clauses().size() == 1) { + return booleanQuery.iterator().next().getQuery(); + } + } + return query; } @Override @@ -711,6 +771,11 @@ public boolean isCacheable(LeafReaderContext leafReaderContext) { } }; } + + // Visible for testing + Predicate getSecondPhaseMatcher() { + return secondPhaseMatcher; + } } @Override diff --git a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTest.java b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java similarity index 88% rename from server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTest.java rename to server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java index b34427f7d4072..2733a5932fd53 100644 --- a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTest.java +++ b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java @@ -17,7 +17,7 @@ import java.util.ArrayList; import java.util.List; -public class WildcardFieldMapperTest extends MapperTestCase { +public class WildcardFieldMapperTests extends MapperTestCase { @Override protected void minimalMapping(XContentBuilder b) throws IOException { @@ -49,8 +49,8 @@ public void testTokenizer() throws IOException { } assertEquals( List.of( - WildcardFieldTypeTest.prefixAnchored("p"), - WildcardFieldTypeTest.prefixAnchored("pi"), + WildcardFieldTypeTests.prefixAnchored("p"), + WildcardFieldTypeTests.prefixAnchored("pi"), "p", "pi", "pic", @@ -65,9 +65,9 @@ public void testTokenizer() throws IOException { "kle", "l", "le", - WildcardFieldTypeTest.suffixAnchored("le"), + WildcardFieldTypeTests.suffixAnchored("le"), "e", - WildcardFieldTypeTest.suffixAnchored("e") + WildcardFieldTypeTests.suffixAnchored("e") ), terms ); diff --git a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTest.java b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java similarity index 50% rename from server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTest.java rename to server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java index 33db9d5ca2c84..f03364054c166 100644 --- a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTest.java +++ b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java @@ -11,12 +11,13 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import java.util.HashSet; import java.util.Set; -public class WildcardFieldTypeTest extends FieldTypeTestCase { +public class WildcardFieldTypeTests extends FieldTypeTestCase { static String prefixAnchored(String val) { return (char) 0 + val; @@ -38,7 +39,13 @@ public void testTermQuery() { for (String term : expectedTerms) { builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); } - assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "apple"), ft.termQuery("apple", null)); + Query actual = ft.termQuery("apple", null); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "apple"), actual); + WildcardFieldMapper.WildcardMatchingQuery actualTermQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual; + assertTrue(actualTermQuery.getSecondPhaseMatcher().test("apple")); + assertFalse(actualTermQuery.getSecondPhaseMatcher().test("Apple")); + assertFalse(actualTermQuery.getSecondPhaseMatcher().test("flapple")); + assertFalse(actualTermQuery.getSecondPhaseMatcher().test("apples")); } public void testWildcardQuery() { @@ -94,9 +101,52 @@ public void testMultipleWildcardsInQuery() { builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); } - assertEquals( - new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), pattern), - ft.wildcardQuery(pattern, null, null) + Query actual = ft.wildcardQuery(pattern, null, null); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), pattern), actual); + WildcardFieldMapper.WildcardMatchingQuery actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual; + assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdzzzefgqh")); + assertFalse(actualMatchingQuery.getSecondPhaseMatcher().test("abcdzzzefgqqh")); + } + + public void testRegexpQuery() { + String pattern = ".*apple.*"; + MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field"); + + Set expectedTerms = new HashSet<>(); + expectedTerms.add("app"); + expectedTerms.add("ppl"); + expectedTerms.add("ple"); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (String term : expectedTerms) { + builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); + } + + Query actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "/" + pattern + "/"), actual); + WildcardFieldMapper.WildcardMatchingQuery actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual; + assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apple_foo")); + assertFalse(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apply_foo")); + + pattern = "ab(zz|cd|ef.*)(hi|jk)"; + builder = new BooleanQuery.Builder(); + builder.add(new TermQuery(new Term("field", "ab")), BooleanClause.Occur.FILTER); + builder.add( + new BooleanQuery.Builder().add(new TermQuery(new Term("field", "zz")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("field", "cd")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("field", "ef")), BooleanClause.Occur.SHOULD) + .build(), + BooleanClause.Occur.FILTER + ); + builder.add( + new BooleanQuery.Builder().add(new TermQuery(new Term("field", "hi")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("field", "jk")), BooleanClause.Occur.SHOULD) + .build(), + BooleanClause.Occur.FILTER ); + actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "/" + pattern + "/"), actual); + actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual; + assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdjk")); + assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abefqwertyhi")); } }