Skip to content

Commit

Permalink
Add efficient first-phase implementation for regexp queries
Browse files Browse the repository at this point in the history
Following the description in https://swtch.com/~rsc/regexp/regexp4.html,
along with the fact that Lucene gives us a regular expression parse
tree, we *can* implement an efficient first-phase match on regular
expressions without needing to write a lot of code.

Signed-off-by: Michael Froh <froh@amazon.com>
  • Loading branch information
msfroh committed May 30, 2024
1 parent a39ffaf commit f47573d
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.opensearch.common.lucene.BytesRefs;
import org.opensearch.common.lucene.Lucene;
import org.opensearch.common.regex.Regex;
Expand All @@ -52,6 +53,7 @@
import java.io.IOException;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
Expand All @@ -62,7 +64,6 @@
import java.util.Set;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.regex.Pattern;

import static org.opensearch.index.mapper.KeywordFieldMapper.normalizeValue;

Expand Down Expand Up @@ -500,17 +501,76 @@ public Query regexpQuery(
MultiTermQuery.RewriteMethod method,
QueryShardContext context
) {
// TODO -- Extracting mandatory characters from a regex is not trivial, since entire blocks may be optional.
// It is functionally correct to approximate with MatchAllDocs, but performance won't be good.
return new WildcardMatchingQuery(
name(),
new MatchAllDocsQuery(),
Pattern.compile(value).asMatchPredicate(),
"/" + value + "/",
context,
this
);

RegExp regExp = new RegExp(value, syntaxFlags, matchFlags);
Automaton automaton = regExp.toAutomaton(maxDeterminizedStates);
CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton);

return new WildcardMatchingQuery(name(), regexpToQuery(name(), regExp), s -> {
BytesRef valueBytes = BytesRefs.toBytesRef(s);
return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length);
}, "/" + value + "/", context, this);
}

/**
* Implement the match rules described in <a href="https://swtch.com/~rsc/regexp/regexp4.html">Regular Expression Matching with a Trigram Index</a>.
*
* @param fieldName name of the wildcard field
* @param regExp a parsed node in the {@link RegExp} tree
* @return a query that matches on the known required parts of the given regular expression
*/
private static Query regexpToQuery(String fieldName, RegExp regExp) {
Query query;
if (Objects.requireNonNull(regExp.kind) == RegExp.Kind.REGEXP_UNION) {
List<Query> clauses = new ArrayList<>();
while (regExp.exp1.kind == RegExp.Kind.REGEXP_UNION) {
clauses.add(regexpToQuery(fieldName, regExp.exp2));
regExp = regExp.exp1;
}
clauses.add(regexpToQuery(fieldName, regExp.exp2));
clauses.add(regexpToQuery(fieldName, regExp.exp1));
BooleanQuery.Builder builder = new BooleanQuery.Builder();
for (int i = clauses.size() - 1; i >= 0; i--) {
Query clause = clauses.get(i);
if (clause instanceof MatchAllDocsQuery) {
return clause;
}
builder.add(clause, BooleanClause.Occur.SHOULD);
}
query = builder.build();
} else if (regExp.kind == RegExp.Kind.REGEXP_STRING) {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
for (String string : getRequiredNGrams("*" + regExp.s + "*")) {
builder.add(new TermQuery(new Term(fieldName, string)), BooleanClause.Occur.FILTER);
}
query = builder.build();
} else if (regExp.kind == RegExp.Kind.REGEXP_CONCATENATION) {
List<Query> clauses = new ArrayList<>();
while (regExp.exp1.kind == RegExp.Kind.REGEXP_CONCATENATION) {
clauses.add(regexpToQuery(fieldName, regExp.exp2));
regExp = regExp.exp1;
}
clauses.add(regexpToQuery(fieldName, regExp.exp2));
clauses.add(regexpToQuery(fieldName, regExp.exp1));
BooleanQuery.Builder builder = new BooleanQuery.Builder();
for (int i = clauses.size() - 1; i >= 0; i--) {
Query clause = clauses.get(i);
if (!(clause instanceof MatchAllDocsQuery)) {
builder.add(clause, BooleanClause.Occur.FILTER);
}
}
query = builder.build();
} else if (regExp.kind == RegExp.Kind.REGEXP_REPEAT_MIN || regExp.kind == RegExp.Kind.REGEXP_REPEAT_MINMAX) {
return regexpToQuery(fieldName, regExp.exp1);
} else {
return new MatchAllDocsQuery();
}
if (query instanceof BooleanQuery) {
BooleanQuery booleanQuery = (BooleanQuery) query;
if (booleanQuery.clauses().size() == 1) {
return booleanQuery.iterator().next().getQuery();
}
}
return query;
}

@Override
Expand Down Expand Up @@ -711,6 +771,11 @@ public boolean isCacheable(LeafReaderContext leafReaderContext) {
}
};
}

// Visible for testing
Predicate<String> getSecondPhaseMatcher() {
return secondPhaseMatcher;
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import java.util.ArrayList;
import java.util.List;

public class WildcardFieldMapperTest extends MapperTestCase {
public class WildcardFieldMapperTests extends MapperTestCase {

@Override
protected void minimalMapping(XContentBuilder b) throws IOException {
Expand Down Expand Up @@ -49,8 +49,8 @@ public void testTokenizer() throws IOException {
}
assertEquals(
List.of(
WildcardFieldTypeTest.prefixAnchored("p"),
WildcardFieldTypeTest.prefixAnchored("pi"),
WildcardFieldTypeTests.prefixAnchored("p"),
WildcardFieldTypeTests.prefixAnchored("pi"),
"p",
"pi",
"pic",
Expand All @@ -65,9 +65,9 @@ public void testTokenizer() throws IOException {
"kle",
"l",
"le",
WildcardFieldTypeTest.suffixAnchored("le"),
WildcardFieldTypeTests.suffixAnchored("le"),
"e",
WildcardFieldTypeTest.suffixAnchored("e")
WildcardFieldTypeTests.suffixAnchored("e")
),
terms
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

import java.util.HashSet;
import java.util.Set;

public class WildcardFieldTypeTest extends FieldTypeTestCase {
public class WildcardFieldTypeTests extends FieldTypeTestCase {

static String prefixAnchored(String val) {
return (char) 0 + val;
Expand All @@ -38,7 +39,13 @@ public void testTermQuery() {
for (String term : expectedTerms) {
builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER);
}
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "apple"), ft.termQuery("apple", null));
Query actual = ft.termQuery("apple", null);
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "apple"), actual);
WildcardFieldMapper.WildcardMatchingQuery actualTermQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual;
assertTrue(actualTermQuery.getSecondPhaseMatcher().test("apple"));
assertFalse(actualTermQuery.getSecondPhaseMatcher().test("Apple"));
assertFalse(actualTermQuery.getSecondPhaseMatcher().test("flapple"));
assertFalse(actualTermQuery.getSecondPhaseMatcher().test("apples"));
}

public void testWildcardQuery() {
Expand Down Expand Up @@ -94,9 +101,52 @@ public void testMultipleWildcardsInQuery() {
builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER);
}

assertEquals(
new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), pattern),
ft.wildcardQuery(pattern, null, null)
Query actual = ft.wildcardQuery(pattern, null, null);
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), pattern), actual);
WildcardFieldMapper.WildcardMatchingQuery actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual;
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdzzzefgqh"));
assertFalse(actualMatchingQuery.getSecondPhaseMatcher().test("abcdzzzefgqqh"));
}

public void testRegexpQuery() {
String pattern = ".*apple.*";
MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field");

Set<String> expectedTerms = new HashSet<>();
expectedTerms.add("app");
expectedTerms.add("ppl");
expectedTerms.add("ple");
BooleanQuery.Builder builder = new BooleanQuery.Builder();
for (String term : expectedTerms) {
builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER);
}

Query actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null);
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "/" + pattern + "/"), actual);
WildcardFieldMapper.WildcardMatchingQuery actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual;
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apple_foo"));
assertFalse(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apply_foo"));

pattern = "ab(zz|cd|ef.*)(hi|jk)";
builder = new BooleanQuery.Builder();
builder.add(new TermQuery(new Term("field", "ab")), BooleanClause.Occur.FILTER);
builder.add(
new BooleanQuery.Builder().add(new TermQuery(new Term("field", "zz")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "cd")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "ef")), BooleanClause.Occur.SHOULD)
.build(),
BooleanClause.Occur.FILTER
);
builder.add(
new BooleanQuery.Builder().add(new TermQuery(new Term("field", "hi")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "jk")), BooleanClause.Occur.SHOULD)
.build(),
BooleanClause.Occur.FILTER
);
actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null);
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "/" + pattern + "/"), actual);
actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual;
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdjk"));
assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abefqwertyhi"));
}
}

0 comments on commit f47573d

Please sign in to comment.