From 8067a31d05caa1af79929aec7269552fe391aeb4 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Tue, 11 Jun 2024 11:08:29 +0200 Subject: [PATCH] Add support for wildcard field type (#13461) This adds support for the "wildcard" field type that supports efficient execution of wildcard, prefix, and regexp queries by matching first against trigrams (or bigrams or individual characters), then post-filtering by evaluating the original field value against the pattern. --------- Signed-off-by: Michael Froh (cherry picked from commit b71e547e938876c2cec9fcc0328ffbd6b4a50db6) --- CHANGELOG.md | 1 + .../search/270_wildcard_fieldtype_queries.yml | 229 +++++ .../index/mapper/KeywordFieldMapper.java | 2 +- .../index/mapper/WildcardFieldMapper.java | 859 ++++++++++++++++++ .../org/opensearch/indices/IndicesModule.java | 2 + .../mapper/WildcardFieldMapperTests.java | 333 +++++++ .../index/mapper/WildcardFieldTypeTests.java | 176 ++++ 7 files changed, 1601 insertions(+), 1 deletion(-) create mode 100644 rest-api-spec/src/main/resources/rest-api-spec/test/search/270_wildcard_fieldtype_queries.yml create mode 100644 server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java create mode 100644 server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java create mode 100644 server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java diff --git a/CHANGELOG.md b/CHANGELOG.md index fb591d0a7fdd5..1f6552ddc20c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [Remote Store] Upload translog checkpoint as object metadata to translog.tlog([#13637](https://github.com/opensearch-project/OpenSearch/pull/13637)) - [Remote Store] Add dynamic cluster settings to set timeout for segments upload to Remote Store ([#13679](https://github.com/opensearch-project/OpenSearch/pull/13679)) - Add getMetadataFields to MapperService 
([#13819](https://github.com/opensearch-project/OpenSearch/pull/13819)) +- Add "wildcard" field type that supports efficient wildcard, prefix, and regexp queries ([#13461](https://github.com/opensearch-project/OpenSearch/pull/13461)) - Allow setting query parameters on requests ([#13776](https://github.com/opensearch-project/OpenSearch/issues/13776)) - [Remote Store] Add support to disable flush based on translog reader count ([#14027](https://github.com/opensearch-project/OpenSearch/pull/14027)) - [Query Insights] Add exporter support for top n queries ([#12982](https://github.com/opensearch-project/OpenSearch/pull/12982)) diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search/270_wildcard_fieldtype_queries.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search/270_wildcard_fieldtype_queries.yml new file mode 100644 index 0000000000000..05b6b2e5ed712 --- /dev/null +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search/270_wildcard_fieldtype_queries.yml @@ -0,0 +1,229 @@ +setup: + - skip: + version: " - 2.99.99" + reason: "Added in 2.15, but need to skip pre-3.0 before backport" + + - do: + indices.create: + index: test + body: + mappings: + properties: + my_field: + type: wildcard + fields: + lower: + type: wildcard + normalizer: lowercase + doc_values: + type: wildcard + doc_values: true + + - do: + index: + index: test + id: 1 + body: + my_field: "org.opensearch.transport.NodeDisconnectedException: [node_s0][127.0.0.1:39953][disconnected] disconnected" + - do: + index: + index: test + id: 2 + body: + my_field: "[2024-06-08T06:31:37,443][INFO ][o.o.c.c.Coordinator ] [node_s2] cluster-manager node [{node_s0}{Nj7FjR7hRP2lh_zur8KN_g}{OTGOoWmmSsWP_RQ3tIKJ9g}{127.0.0.1}{127.0.0.1:39953}{imr}{shard_indexing_pressure_enabled=true}] failed, restarting discovery" + + - do: + index: + index: test + id: 3 + body: + my_field: "[2024-06-08T06:31:37,451][INFO ][o.o.c.s.ClusterApplierService] [node_s2] cluster-manager node changed {previous 
[{node_s0}{Nj7FjR7hRP2lh_zur8KN_g}{OTGOoWmmSsWP_RQ3tIKJ9g}{127.0.0.1}{127.0.0.1:39953}{imr}{shard_indexing_pressure_enabled=true}], current []}, term: 1, version: 24, reason: becoming candidate: onLeaderFailure" + - do: + index: + index: test + id: 4 + body: + my_field: "[2024-06-08T06:31:37,452][WARN ][o.o.c.NodeConnectionsService] [node_s1] failed to connect to {node_s0}{Nj7FjR7hRP2lh_zur8KN_g}{OTGOoWmmSsWP_RQ3tIKJ9g}{127.0.0.1}{127.0.0.1:39953}{imr}{shard_indexing_pressure_enabled=true} (tried [1] times)" + - do: + index: + index: test + id: 5 + body: + my_field: "AbCd" + - do: + index: + index: test + id: 6 + body: + other_field: "test" + - do: + indices.refresh: {} + +--- +"term query matches exact value": + - do: + search: + index: test + body: + query: + term: + my_field: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + term: + my_field.doc_values: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + +--- +"term query matches lowercase-normalized value": + - do: + search: + index: test + body: + query: + term: + my_field.lower: "abcd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + term: + my_field.lower: "ABCD" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + term: + my_field: "abcd" + - match: { hits.total.value: 0 } + +--- +"wildcard query matches": + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*Node*Exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + +--- +"wildcard query matches lowercase-normalized field": + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*node*exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + 
query: + wildcard: + my_field.lower: + value: "*NODE*EXCEPTION*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*node*exception*" + - match: { hits.total.value: 0 } + +--- +"prefix query matches": + - do: + search: + index: test + body: + query: + prefix: + my_field: + value: "[2024-06-08T" + - match: { hits.total.value: 3 } + +--- +"regexp query matches": + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*cluster-manager node.*" + - match: { hits.total.value: 2 } + +--- +"regexp query matches lowercase-normalized field": + - do: + search: + index: test + body: + query: + regexp: + my_field.lower: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 2 } + + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 0 } + +--- +"wildcard match-all works": + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*" + - match: { hits.total.value: 5 } +--- +"regexp match-all works": + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*" + - match: { hits.total.value: 5 } diff --git a/server/src/main/java/org/opensearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/KeywordFieldMapper.java index 42b974734e5e7..7f6d9231a37fc 100644 --- a/server/src/main/java/org/opensearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/opensearch/index/mapper/KeywordFieldMapper.java @@ -703,7 +703,7 @@ protected void parseCreateField(ParseContext context) throws IOException { } } - private static String normalizeValue(NamedAnalyzer normalizer, String field, String value) throws IOException { + static String normalizeValue(NamedAnalyzer normalizer, String field, String value) throws IOException { try (TokenStream ts = 
normalizer.tokenStream(field, value)) { final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); diff --git a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java new file mode 100644 index 0000000000000..4998a822917b4 --- /dev/null +++ b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java @@ -0,0 +1,859 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.mapper; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreScorer; +import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MatchNoDocsQuery; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryVisitor; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.WildcardQuery; +import 
org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RegExp; +import org.opensearch.common.lucene.BytesRefs; +import org.opensearch.common.lucene.Lucene; +import org.opensearch.common.regex.Regex; +import org.opensearch.common.unit.Fuzziness; +import org.opensearch.core.xcontent.XContentParser; +import org.opensearch.index.analysis.IndexAnalyzers; +import org.opensearch.index.analysis.NamedAnalyzer; +import org.opensearch.index.fielddata.IndexFieldData; +import org.opensearch.index.fielddata.plain.SortedSetOrdinalsIndexFieldData; +import org.opensearch.index.query.QueryShardContext; +import org.opensearch.search.DocValueFormat; +import org.opensearch.search.aggregations.support.CoreValuesSourceType; +import org.opensearch.search.lookup.LeafSearchLookup; +import org.opensearch.search.lookup.SearchLookup; + +import java.io.IOException; +import java.io.StringReader; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.Predicate; +import java.util.function.Supplier; + +import static org.opensearch.index.mapper.KeywordFieldMapper.normalizeValue; + +/** + * Mapper for the "wildcard" field type, which supports (relatively) efficient matching by wildcard, prefix, and regexp + * queries. It's not really a "full-text" field type, but rather an "unstructured string" field type. 
+ * + * @opensearch.internal + */ +public class WildcardFieldMapper extends ParametrizedFieldMapper { + private final String nullValue; + private final int ignoreAbove; + private final String normalizerName; + private final boolean hasDocValues; + private final IndexAnalyzers indexAnalyzers; + + /** + * The builder for the field mapper. + * + * @opensearch.internal + */ + public static final class Builder extends ParametrizedFieldMapper.Builder { + + // Copy relevant parameters from KeywordFieldMapper + private final Parameter nullValue = Parameter.stringParam("null_value", false, m -> toType(m).nullValue, null) + .acceptsNull(); + private final Parameter ignoreAbove = Parameter.intParam( + "ignore_above", + true, + m -> toType(m).ignoreAbove, + Integer.MAX_VALUE + ); + private final Parameter normalizer = Parameter.stringParam("normalizer", false, m -> toType(m).normalizerName, "default"); + private final Parameter> meta = Parameter.metaParam(); + private final Parameter hasDocValues = Parameter.docValuesParam(m -> toType(m).hasDocValues, false); + private final IndexAnalyzers indexAnalyzers; + + public Builder(String name, IndexAnalyzers indexAnalyzers) { + super(name); + this.indexAnalyzers = indexAnalyzers; + } + + public Builder(String name) { + this(name, null); + } + + public WildcardFieldMapper.Builder ignoreAbove(int ignoreAbove) { + this.ignoreAbove.setValue(ignoreAbove); + return this; + } + + WildcardFieldMapper.Builder normalizer(String normalizerName) { + this.normalizer.setValue(normalizerName); + return this; + } + + WildcardFieldMapper.Builder nullValue(String nullValue) { + this.nullValue.setValue(nullValue); + return this; + } + + public WildcardFieldMapper.Builder docValues(boolean hasDocValues) { + this.hasDocValues.setValue(hasDocValues); + return this; + } + + @Override + protected List> getParameters() { + return Arrays.asList(nullValue, ignoreAbove, normalizer, hasDocValues, meta); + } + + @Override + public WildcardFieldMapper 
build(BuilderContext context) { + String normalizerName = normalizer.getValue(); + NamedAnalyzer normalizer = Lucene.KEYWORD_ANALYZER; + if ("default".equals(normalizerName) == false) { + assert indexAnalyzers != null; + normalizer = indexAnalyzers.getNormalizer(normalizerName); + } + + return new WildcardFieldMapper( + name, + new WildcardFieldType(context.path().pathAsText(name), normalizer, this), + multiFieldsBuilder.build(this, context), + copyTo.build(), + this + ); + } + + } + + public static final String CONTENT_TYPE = "wildcard"; + public static final TypeParser PARSER = new TypeParser((n, c) -> new WildcardFieldMapper.Builder(n, c.getIndexAnalyzers())); + + protected WildcardFieldMapper( + String simpleName, + MappedFieldType mappedFieldType, + MultiFields multiFields, + CopyTo copyTo, + Builder builder + ) { + super(simpleName, mappedFieldType, multiFields, copyTo); + this.nullValue = builder.nullValue.getValue(); + this.ignoreAbove = builder.ignoreAbove.getValue(); + this.normalizerName = builder.normalizer.getValue(); + this.hasDocValues = builder.hasDocValues.getValue(); + this.indexAnalyzers = builder.indexAnalyzers; + } + + public int ignoreAbove() { + return ignoreAbove; + } + + private static final FieldType FIELD_TYPE = new FieldType(); + static { + FIELD_TYPE.setIndexOptions(IndexOptions.DOCS); + FIELD_TYPE.setTokenized(true); + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.setStored(false); + FIELD_TYPE.freeze(); + } + + @Override + protected void parseCreateField(ParseContext context) throws IOException { + String value; + if (context.externalValueSet()) { + value = context.externalValue().toString(); + } else { + XContentParser parser = context.parser(); + if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { + value = nullValue; + } else { + value = parser.textOrNull(); + } + } + + if (value == null || value.length() > ignoreAbove) { + return; + } + + NamedAnalyzer normalizer = fieldType().normalizer(); + if (normalizer != null) { + 
value = normalizeValue(normalizer, name(), value); + } + + // convert to utf8 only once before feeding postings/dv/stored fields + final BytesRef binaryValue = new BytesRef(value); + Tokenizer tokenizer = new WildcardFieldTokenizer(); + tokenizer.setReader(new StringReader(value)); + context.doc().add(new Field(fieldType().name(), tokenizer, FIELD_TYPE)); + if (fieldType().hasDocValues()) { + context.doc().add(new SortedSetDocValuesField(fieldType().name(), binaryValue)); + } else { + if (fieldType().hasDocValues() == false) { + createFieldNamesField(context); + } + } + } + + /** + * Tokenizer to emit tokens to support wildcard first-phase matching. + *

+ * Will emit all substrings of length 1, 2, and 3, with 0-valued anchors for the prefix/suffix. + *

+ * For example, given the string "lucene", output the following terms: + *

+ * [0, 'l'] + * [0, 'l', 'u'] + * ['l'] + * ['l', 'u'] + * ['l', 'u', 'c'] + * ['u'] + * ['u','c'] + * ['u','c','e'] + * ['c'] + * ['c', 'e'] + * ['c', 'e', 'n'] + * ['e'] + * ['e', 'n'] + * ['e', 'n', 'e'] + * ['n'] + * ['n', 'e'] + * ['n', 'e', 0] + * ['e'] + * ['e', 0] + *

+ * Visible for testing. + */ + static final class WildcardFieldTokenizer extends Tokenizer { + private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class); + private final char[] buffer = new char[3]; // Ring buffer for up to 3 chars + private int offset = 0; // Position in the buffer + private int length = 2; // First token is anchor + first char + + @Override + public void reset() throws IOException { + super.reset(); + buffer[0] = 0; + int firstChar = input.read(); + if (firstChar != -1) { + buffer[1] = (char) firstChar; + int secondChar = input.read(); + if (secondChar != -1) { + buffer[2] = (char) secondChar; + } else { + buffer[2] = 0; + } + } else { + buffer[1] = 0; + } + + } + + @Override + public boolean incrementToken() throws IOException { + charTermAttribute.setLength(length); + int numZeroes = 0; + for (int i = 0; i < length; i++) { + char curChar = buffer[(i + offset) % 3]; + if (curChar == 0) { + numZeroes++; + } + charTermAttribute.buffer()[i] = buffer[(i + offset) % 3]; + } + if (numZeroes == 2) { + // Two zeroes usually means we're done. + if (length == 3 && charTermAttribute.buffer()[1] != 0) { + // The only case where we're not done is if the input has exactly 1 character, so the buffer + // contains 0, char, 0. In that case, we return char now, then return char, 0 on the next iteration + charTermAttribute.buffer()[0] = charTermAttribute.buffer()[1]; + charTermAttribute.buffer()[1] = 0; + charTermAttribute.setLength(1); + length = 2; + offset = 1; + return true; + } + return false; + } + if (length == 3) { + // Read the next character, overwriting the current offset + int nextChar = input.read(); + if (nextChar != -1) { + buffer[offset] = (char) nextChar; + } else { + // End of input. Pad with extra 0 to trigger the logic above. 
+ buffer[offset] = 0; + } + offset = (offset + 1) % 3; + length = 1; + } else { + length = length + 1; + } + return true; + } + } + + /** + * Implements the various query types over wildcard fields. + */ + public static final class WildcardFieldType extends StringFieldType { + private final int ignoreAbove; + private final String nullValue; + + public WildcardFieldType(String name) { + this(name, Collections.emptyMap()); + } + + public WildcardFieldType(String name, Map meta) { + super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta); + setIndexAnalyzer(Lucene.KEYWORD_ANALYZER); + this.ignoreAbove = Integer.MAX_VALUE; + this.nullValue = null; + } + + public WildcardFieldType(String name, NamedAnalyzer normalizer, Builder builder) { + super(name, true, true, builder.hasDocValues.getValue(), TextSearchInfo.SIMPLE_MATCH_ONLY, builder.meta.getValue()); + setIndexAnalyzer(normalizer); + this.ignoreAbove = builder.ignoreAbove.getValue(); + this.nullValue = builder.nullValue.getValue(); + } + + @Override + public ValueFetcher valueFetcher(QueryShardContext context, SearchLookup searchLookup, String format) { + // Copied from KeywordFieldMapper.KeywordFieldType + if (format != null) { + throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't " + "support formats."); + } + + if (hasDocValues()) { + return new DocValueFetcher(DocValueFormat.RAW, searchLookup.doc().getForField(this)); + } + + return new SourceValueFetcher(name(), context, nullValue) { + @Override + protected String parseSourceValue(Object value) { + String keywordValue = value.toString(); + if (keywordValue.length() > ignoreAbove) { + return null; + } + + NamedAnalyzer normalizer = normalizer(); + if (normalizer == null) { + return keywordValue; + } + + try { + return normalizeValue(normalizer, name(), keywordValue); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + }; + } + + @Override + public String typeName() { + return 
CONTENT_TYPE; + } + + NamedAnalyzer normalizer() { + return indexAnalyzer(); + } + + @Override + public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName, Supplier searchLookup) { + failIfNoDocValues(); + return new SortedSetOrdinalsIndexFieldData.Builder(name(), CoreValuesSourceType.BYTES); + } + + @Override + public Query fuzzyQuery( + Object value, + Fuzziness fuzziness, + int prefixLength, + int maxExpansions, + boolean transpositions, + QueryShardContext context + ) { + // TODO: Not sure if we can reasonably describe a fuzzy query in terms of n-grams without exploding the cardinality + throw new IllegalArgumentException( + "Can only use fuzzy queries on keyword and text fields - not on [" + name() + "] which is of type [" + typeName() + "]" + ); + } + + @Override + public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) { + return wildcardQuery(value + "*", method, caseInsensitive, context); + } + + @Override + public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) { + NamedAnalyzer normalizer = normalizer(); + if (normalizer != null) { + value = normalizeWildcardPattern(name(), value, normalizer); + } + final String finalValue; + if (caseInsensitive) { + // Use ROOT locale, as it seems to be consistent with AutomatonQueries.toCaseInsensitiveChar. 
+ finalValue = value.toLowerCase(Locale.ROOT); + } else { + finalValue = value; + } + Predicate matchPredicate; + if (value.contains("?")) { + Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), finalValue)); + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton); + matchPredicate = s -> { + if (caseInsensitive) { + s = s.toLowerCase(Locale.ROOT); + } + BytesRef valueBytes = BytesRefs.toBytesRef(s); + return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length); + }; + } else { + matchPredicate = s -> { + if (caseInsensitive) { + s = s.toLowerCase(Locale.ROOT); + } + return Regex.simpleMatch(finalValue, s); + }; + } + + Set requiredNGrams = getRequiredNGrams(finalValue); + Query approximation; + if (requiredNGrams.isEmpty()) { + // This only happens when all characters are wildcard characters (* or ?), + // or it's the empty string. + if (value.length() == 0 || value.contains("?")) { + approximation = this.existsQuery(context); + } else { + return existsQuery(context); + } + } else { + approximation = matchAllTermsQuery(name(), requiredNGrams); + } + return new WildcardMatchingQuery(name(), approximation, matchPredicate, value, context, this); + }
+
+        /**
+         * Computes the indexed terms that any value matching the given wildcard pattern must contain.
+         * The pattern is split into maximal runs of non-wildcard characters (anything other than
+         * {@code *} and {@code ?}). A run at the very start of the pattern contributes a term with a
+         * leading 0 anchor, a run at the very end contributes a term with a trailing 0 anchor, runs of
+         * three or more characters contribute all of their trigrams, and shorter runs strictly inside the
+         * pattern are required as-is.
+         * <p>
+         * Package-private for testing.
+         *
+         * @param value the wildcard pattern
+         * @return the set of required terms; empty if the pattern is empty or consists only of wildcards
+         */
+        static Set<String> getRequiredNGrams(String value) {
+            Set<String> terms = new HashSet<>();
+            if (value.isEmpty()) {
+                // An empty pattern requires no terms. Without this guard the prefix-anchor branch below
+                // would call charAt(0) on an empty sequence and throw StringIndexOutOfBoundsException.
+                return terms;
+            }
+            int pos = 0;
+            String currentSequence;
+            if (!value.startsWith("?") && !value.startsWith("*")) {
+                // Can add prefix term
+                currentSequence = getNonWildcardSequence(value, 0);
+                if (currentSequence.length() == 1) {
+                    terms.add(new String(new char[] { 0, currentSequence.charAt(0) }));
+                } else {
+                    terms.add(new String(new char[] { 0, currentSequence.charAt(0), currentSequence.charAt(1) }));
+                }
+            } else {
+                pos = findNonWildcardSequence(value, pos);
+                currentSequence = getNonWildcardSequence(value, pos);
+            }
+            while (pos < value.length()) {
+                boolean isEndOfValue = pos + currentSequence.length() == value.length();
+                if (!currentSequence.isEmpty() && currentSequence.length() < 3 && !isEndOfValue && pos > 0) {
+                    // A short sequence strictly inside the pattern has no anchor to attach to, so require it as-is.
+                    terms.add(currentSequence);
+                } else {
+                    // Require every trigram. If this is a prefix or suffix of length < 3, the loop adds nothing,
+                    // because we already have a longer token including the anchor.
+                    for (int i = 0; i < currentSequence.length() - 2; i++) {
+                        terms.add(currentSequence.substring(i, i + 3));
+                    }
+                }
+                if (isEndOfValue) {
+                    // This is the end of the input. We can attach a suffix anchor.
+                    if (currentSequence.length() == 1) {
+                        terms.add(new String(new char[] { currentSequence.charAt(0), 0 }));
+                    } else {
+                        char a = currentSequence.charAt(currentSequence.length() - 2);
+                        char b = currentSequence.charAt(currentSequence.length() - 1);
+                        terms.add(new String(new char[] { a, b, 0 }));
+                    }
+                }
+                pos = findNonWildcardSequence(value, pos + currentSequence.length());
+                currentSequence = getNonWildcardSequence(value, pos);
+            }
+            return terms;
+        }
+
+ private static String getNonWildcardSequence(String value, int startFrom) { + for (int i = startFrom; i < value.length(); i++) { + char c = value.charAt(i); + if (c == '?' || c == '*') { + return value.substring(startFrom, i); + } + } + // Made it to the end. No more wildcards. + return value.substring(startFrom); + } + + private static int findNonWildcardSequence(String value, int startFrom) { + for (int i = startFrom; i < value.length(); i++) { + char c = value.charAt(i); + if (c != '?' 
&& c != '*') { + return i; + } + } + return value.length(); + } + + @Override + public Query regexpQuery( + String value, + int syntaxFlags, + int matchFlags, + int maxDeterminizedStates, + MultiTermQuery.RewriteMethod method, + QueryShardContext context + ) { + NamedAnalyzer normalizer = normalizer(); + if (normalizer != null) { + value = normalizer.normalize(name(), value).utf8ToString(); + } + + RegExp regExp = new RegExp(value, syntaxFlags, matchFlags); + Automaton automaton = regExp.toAutomaton(maxDeterminizedStates); + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton); + + Predicate regexpPredicate; + if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.ALL) { + return existsQuery(context); + } else if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.NONE) { + return new MatchNoDocsQuery("Regular expression matches nothing"); + } else { + regexpPredicate = s -> { + BytesRef valueBytes = BytesRefs.toBytesRef(s); + return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length); + }; + } + + Query approximation = regexpToQuery(name(), regExp); + if (approximation instanceof MatchAllDocsQuery) { + approximation = existsQuery(context); + } + return new WildcardMatchingQuery(name(), approximation, regexpPredicate, "/" + value + "/", context, this); + } + + /** + * Implement the match rules described in Regular Expression Matching with a Trigram Index. 
+ * + * @param fieldName name of the wildcard field + * @param regExp a parsed node in the {@link RegExp} tree + * @return a query that matches on the known required parts of the given regular expression + */ + private static Query regexpToQuery(String fieldName, RegExp regExp) { + BooleanQuery query; + if (Objects.requireNonNull(regExp.kind) == RegExp.Kind.REGEXP_UNION) { + List clauses = new ArrayList<>(); + while (regExp.exp1.kind == RegExp.Kind.REGEXP_UNION) { + clauses.add(regexpToQuery(fieldName, regExp.exp2)); + regExp = regExp.exp1; + } + clauses.add(regexpToQuery(fieldName, regExp.exp2)); + clauses.add(regexpToQuery(fieldName, regExp.exp1)); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (int i = clauses.size() - 1; i >= 0; i--) { + Query clause = clauses.get(i); + if (clause instanceof MatchAllDocsQuery) { + return clause; + } + builder.add(clause, BooleanClause.Occur.SHOULD); + } + query = builder.build(); + } else if (regExp.kind == RegExp.Kind.REGEXP_STRING) { + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (String string : getRequiredNGrams("*" + regExp.s + "*")) { + builder.add(new TermQuery(new Term(fieldName, string)), BooleanClause.Occur.FILTER); + } + query = builder.build(); + } else if (regExp.kind == RegExp.Kind.REGEXP_CONCATENATION) { + List clauses = new ArrayList<>(); + while (regExp.exp1.kind == RegExp.Kind.REGEXP_CONCATENATION) { + clauses.add(regexpToQuery(fieldName, regExp.exp2)); + regExp = regExp.exp1; + } + clauses.add(regexpToQuery(fieldName, regExp.exp2)); + clauses.add(regexpToQuery(fieldName, regExp.exp1)); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (int i = clauses.size() - 1; i >= 0; i--) { + Query clause = clauses.get(i); + if (!(clause instanceof MatchAllDocsQuery)) { + builder.add(clause, BooleanClause.Occur.FILTER); + } + } + query = builder.build(); + } else if ((regExp.kind == RegExp.Kind.REGEXP_REPEAT_MIN || regExp.kind == RegExp.Kind.REGEXP_REPEAT_MINMAX) + 
&& regExp.min > 0) { + return regexpToQuery(fieldName, regExp.exp1); + } else { + return new MatchAllDocsQuery(); + } + if (query.clauses().size() == 1) { + return query.iterator().next().getQuery(); + } else if (query.clauses().size() == 0) { + return new MatchAllDocsQuery(); + } + return query; + } + + @Override + public Query rangeQuery(Object lowerTerm, Object upperTerm, boolean includeLower, boolean includeUpper, QueryShardContext context) { + throw new UnsupportedOperationException("TODO"); + } + + @Override + public Query termQueryCaseInsensitive(Object value, QueryShardContext context) { + return wildcardQuery(value.toString(), MultiTermQuery.CONSTANT_SCORE_REWRITE, true, context); + } + + @Override + public Query termQuery(Object value, QueryShardContext context) { + return wildcardQuery(BytesRefs.toString(value), MultiTermQuery.CONSTANT_SCORE_REWRITE, false, context); + } + + @Override + public Query termsQuery(List values, QueryShardContext context) { + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + Set expectedValues = new HashSet<>(); + StringBuilder pattern = new StringBuilder(); + for (Object value : values) { + String stringVal = value.toString(); + builder.add(matchAllTermsQuery(name(), getRequiredNGrams(stringVal)), BooleanClause.Occur.SHOULD); + expectedValues.add(stringVal); + if (pattern.length() > 0) { + pattern.append('|'); + } + pattern.append(stringVal); + } + return new WildcardMatchingQuery(name(), builder.build(), expectedValues::contains, pattern.toString(), context, this); + } + + private static BooleanQuery matchAllTermsQuery(String fieldName, Set terms) { + BooleanQuery.Builder matchAllTermsBuilder = new BooleanQuery.Builder(); + for (String term : terms) { + matchAllTermsBuilder.add(new TermQuery(new Term(fieldName, term)), BooleanClause.Occur.FILTER); + } + return matchAllTermsBuilder.build(); + } + } + + /** + * Custom two-phase query type for queries over the wildcard field. 
The expected behavior is that a first-phase + * query provides the best possible filter over the indexed trigrams, while the second phase matcher eliminates + * false positives by evaluating the true field value. + */ + static class WildcardMatchingQuery extends Query { + private static final long MATCH_COST_ESTIMATE = 1000L; + private final String fieldName; + private final Query firstPhaseQuery; + private final Predicate secondPhaseMatcher; + private final String patternString; // For toString + private final ValueFetcher valueFetcher; + private final SearchLookup searchLookup; + + WildcardMatchingQuery(String fieldName, Query firstPhaseQuery, String patternString) { + this(fieldName, firstPhaseQuery, s -> true, patternString, (QueryShardContext) null, null); + } + + public WildcardMatchingQuery( + String fieldName, + Query firstPhaseQuery, + Predicate secondPhaseMatcher, + String patternString, + QueryShardContext context, + WildcardFieldType fieldType + ) { + this.fieldName = Objects.requireNonNull(fieldName); + this.firstPhaseQuery = Objects.requireNonNull(firstPhaseQuery); + this.secondPhaseMatcher = Objects.requireNonNull(secondPhaseMatcher); + this.patternString = Objects.requireNonNull(patternString); + if (context != null) { + this.searchLookup = context.lookup(); + this.valueFetcher = fieldType.valueFetcher(context, context.lookup(), null); + } else { + this.searchLookup = null; + this.valueFetcher = null; + } + } + + private WildcardMatchingQuery( + String fieldName, + Query firstPhaseQuery, + Predicate secondPhaseMatcher, + String patternString, + ValueFetcher valueFetcher, + SearchLookup searchLookup + ) { + this.fieldName = fieldName; + this.firstPhaseQuery = firstPhaseQuery; + this.secondPhaseMatcher = secondPhaseMatcher; + this.patternString = patternString; + this.valueFetcher = valueFetcher; + this.searchLookup = searchLookup; + } + + @Override + public String toString(String s) { + return "WildcardMatchingQuery(" + fieldName + ":\"" + 
patternString + "\")"; + } + + @Override + public void visit(QueryVisitor queryVisitor) { + firstPhaseQuery.visit(queryVisitor); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + WildcardMatchingQuery that = (WildcardMatchingQuery) o; + return Objects.equals(fieldName, that.fieldName) + && Objects.equals(firstPhaseQuery, that.firstPhaseQuery) + && Objects.equals(patternString, that.patternString); + } + + @Override + public int hashCode() { + return Objects.hash(fieldName, firstPhaseQuery, patternString); + } + + @Override + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + Query rewriteFirstPhase = firstPhaseQuery.rewrite(indexSearcher); + if (rewriteFirstPhase != firstPhaseQuery) { + return new WildcardMatchingQuery( + fieldName, + rewriteFirstPhase, + secondPhaseMatcher, + patternString, + valueFetcher, + searchLookup + ); + } + return this; + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + Weight firstPhaseWeight = firstPhaseQuery.createWeight(searcher, scoreMode, boost); + return new ConstantScoreWeight(this, boost) { + @Override + public Scorer scorer(LeafReaderContext leafReaderContext) throws IOException { + ScorerSupplier supplier = scorerSupplier(leafReaderContext); + if (supplier == null) { + return null; + } + return supplier.get(Long.MAX_VALUE); + } + + @Override + public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { + Weight weight = this; + ScorerSupplier firstPhaseSupplier = firstPhaseWeight.scorerSupplier(context); + if (firstPhaseSupplier == null) { + return null; + } + return new ScorerSupplier() { + @Override + public Scorer get(long leadCost) throws IOException { + Scorer approximateScorer = firstPhaseSupplier.get(leadCost); + DocIdSetIterator approximation = approximateScorer.iterator(); + LeafSearchLookup 
leafSearchLookup = searchLookup.getLeafSearchLookup(context); + valueFetcher.setNextReader(context); + + TwoPhaseIterator twoPhaseIterator = new TwoPhaseIterator(approximation) { + @Override + public boolean matches() throws IOException { + leafSearchLookup.setDocument(approximation.docID()); + List values = valueFetcher.fetchValues(leafSearchLookup.source()); + for (Object value : values) { + if (secondPhaseMatcher.test(value.toString())) { + return true; + } + } + return false; + } + + @Override + public float matchCost() { + return MATCH_COST_ESTIMATE; + } + }; + return new ConstantScoreScorer(weight, score(), scoreMode, twoPhaseIterator); + } + + @Override + public long cost() { + long firstPhaseCost = firstPhaseSupplier.cost(); + if (firstPhaseCost >= Long.MAX_VALUE / MATCH_COST_ESTIMATE) { + return Long.MAX_VALUE; + } + return firstPhaseCost * MATCH_COST_ESTIMATE; + } + }; + } + + @Override + public boolean isCacheable(LeafReaderContext leafReaderContext) { + return true; + } + }; + } + + // Visible for testing + Predicate getSecondPhaseMatcher() { + return secondPhaseMatcher; + } + } + + @Override + public WildcardFieldType fieldType() { + return (WildcardFieldType) super.fieldType(); + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + @Override + public ParametrizedFieldMapper.Builder getMergeBuilder() { + return new Builder(simpleName(), indexAnalyzers).init(this); + } + + private static WildcardFieldMapper toType(FieldMapper in) { + return (WildcardFieldMapper) in; + } +} diff --git a/server/src/main/java/org/opensearch/indices/IndicesModule.java b/server/src/main/java/org/opensearch/indices/IndicesModule.java index d2c26157b1963..033b163bb0d67 100644 --- a/server/src/main/java/org/opensearch/indices/IndicesModule.java +++ b/server/src/main/java/org/opensearch/indices/IndicesModule.java @@ -72,6 +72,7 @@ import org.opensearch.index.mapper.SourceFieldMapper; import org.opensearch.index.mapper.TextFieldMapper; import 
org.opensearch.index.mapper.VersionFieldMapper; +import org.opensearch.index.mapper.WildcardFieldMapper; import org.opensearch.index.remote.RemoteStorePressureService; import org.opensearch.index.seqno.GlobalCheckpointSyncAction; import org.opensearch.index.seqno.RetentionLeaseBackgroundSyncAction; @@ -172,6 +173,7 @@ public static Map getMappers(List mappe mappers.put(FlatObjectFieldMapper.CONTENT_TYPE, FlatObjectFieldMapper.PARSER); mappers.put(ConstantKeywordFieldMapper.CONTENT_TYPE, new ConstantKeywordFieldMapper.TypeParser()); mappers.put(DerivedFieldMapper.CONTENT_TYPE, DerivedFieldMapper.PARSER); + mappers.put(WildcardFieldMapper.CONTENT_TYPE, WildcardFieldMapper.PARSER); for (MapperPlugin mapperPlugin : mapperPlugins) { for (Map.Entry entry : mapperPlugin.getMappers().entrySet()) { diff --git a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java new file mode 100644 index 0000000000000..a93f6b2d47e4f --- /dev/null +++ b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java @@ -0,0 +1,333 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.index.mapper; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.IndexableFieldType; +import org.apache.lucene.util.BytesRef; +import org.opensearch.Version; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.common.settings.Settings; +import org.opensearch.core.xcontent.XContentBuilder; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AnalyzerScope; +import org.opensearch.index.analysis.CharFilterFactory; +import org.opensearch.index.analysis.CustomAnalyzer; +import org.opensearch.index.analysis.IndexAnalyzers; +import org.opensearch.index.analysis.LowercaseNormalizer; +import org.opensearch.index.analysis.NamedAnalyzer; +import org.opensearch.index.analysis.TokenFilterFactory; +import org.opensearch.index.analysis.TokenizerFactory; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.singletonMap; +import static org.opensearch.index.mapper.FieldTypeTestCase.fetchSourceValue; + +public class WildcardFieldMapperTests extends MapperTestCase { + + @Override + protected void minimalMapping(XContentBuilder b) throws IOException { + b.field("type", "wildcard"); + } + + @Override + protected void writeFieldValue(XContentBuilder builder) throws IOException { + builder.value("value"); + } + + @Override + protected void 
registerParameters(ParameterChecker checker) throws IOException { + checker.registerConflictCheck("normalizer", b -> b.field("normalizer", "lowercase")); + checker.registerConflictCheck("doc_values", b -> b.field("doc_values", true)); + checker.registerConflictCheck("null_value", b -> b.field("null_value", "foo")); + checker.registerUpdateCheck(b -> b.field("ignore_above", 256), m -> assertEquals(256, ((WildcardFieldMapper) m).ignoreAbove())); + } + + public void testTokenizer() throws IOException { + List terms = new ArrayList<>(); + try (Tokenizer tokenizer = new WildcardFieldMapper.WildcardFieldTokenizer()) { + tokenizer.setReader(new StringReader("pickle")); + tokenizer.reset(); + CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class); + while (tokenizer.incrementToken()) { + terms.add(charTermAttribute.toString()); + } + } + assertEquals( + List.of( + WildcardFieldTypeTests.prefixAnchored("p"), + WildcardFieldTypeTests.prefixAnchored("pi"), + "p", + "pi", + "pic", + "i", + "ic", + "ick", + "c", + "ck", + "ckl", + "k", + "kl", + "kle", + "l", + "le", + WildcardFieldTypeTests.suffixAnchored("le"), + "e", + WildcardFieldTypeTests.suffixAnchored("e") + ), + terms + ); + terms.clear(); + try (Tokenizer tokenizer = new WildcardFieldMapper.WildcardFieldTokenizer()) { + tokenizer.setReader(new StringReader("a")); + tokenizer.reset(); + CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class); + while (tokenizer.incrementToken()) { + terms.add(charTermAttribute.toString()); + } + } + assertEquals(List.of(WildcardFieldTypeTests.prefixAnchored("a"), "a", WildcardFieldTypeTests.suffixAnchored("a")), terms); + } + + public void testEnableDocValues() throws IOException { + DocumentMapper mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard").field("doc_values", true))); + ParsedDocument doc = mapper.parse(source(b -> b.field("field", "1234"))); + IndexableField[] fields = 
doc.rootDoc().getFields("field"); + assertEquals(2, fields.length); + assertEquals(DocValuesType.NONE, fields[0].fieldType().docValuesType()); + assertEquals(DocValuesType.SORTED_SET, fields[1].fieldType().docValuesType()); + + mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard"))); + doc = mapper.parse(source(b -> b.field("field", "1234"))); + fields = doc.rootDoc().getFields("field"); + assertEquals(1, fields.length); + assertEquals(DocValuesType.NONE, fields[0].fieldType().docValuesType()); + } + + @Override + protected IndexAnalyzers createIndexAnalyzers(IndexSettings indexSettings) { + return new IndexAnalyzers( + singletonMap("default", new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer())), + Map.of( + "lowercase", + new NamedAnalyzer("lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer()), + "other_lowercase", + new NamedAnalyzer("other_lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer()) + ), + singletonMap( + "lowercase", + new NamedAnalyzer( + "lowercase", + AnalyzerScope.INDEX, + new CustomAnalyzer( + TokenizerFactory.newFactory("lowercase", WhitespaceTokenizer::new), + new CharFilterFactory[0], + new TokenFilterFactory[] { new TokenFilterFactory() { + + @Override + public String name() { + return "lowercase"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new LowerCaseFilter(tokenStream); + } + } } + ) + ) + ) + ); + } + + public void testNormalizer() throws IOException { + DocumentMapper mapper = createDocumentMapper( + fieldMapping(b -> b.field("type", "wildcard").field("normalizer", "lowercase").field("doc_values", true)) + ); + ParsedDocument doc = mapper.parse(source(b -> b.field("field", "AbC"))); + + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(2, fields.length); + + assertTrue(fields[0] instanceof Field); + Field textField = (Field) fields[0]; + List terms = new ArrayList<>(); + try (TokenStream tokenStream = 
textField.tokenStreamValue()) { + tokenStream.reset(); + CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); + while (tokenStream.incrementToken()) { + terms.add(charTermAttribute.toString()); + } + } + assertEquals( + List.of( + WildcardFieldTypeTests.prefixAnchored("a"), + WildcardFieldTypeTests.prefixAnchored("ab"), + "a", + "ab", + "abc", + "b", + "bc", + WildcardFieldTypeTests.suffixAnchored("bc"), + "c", + WildcardFieldTypeTests.suffixAnchored("c") + ), + terms + ); + IndexableFieldType fieldType = fields[0].fieldType(); + assertTrue(fieldType.omitNorms()); + assertTrue(fieldType.tokenized()); + assertFalse(fieldType.stored()); + assertEquals(IndexOptions.DOCS, fieldType.indexOptions()); + assertFalse(fieldType.storeTermVectors()); + assertFalse(fieldType.storeTermVectorOffsets()); + assertFalse(fieldType.storeTermVectorPositions()); + assertFalse(fieldType.storeTermVectorPayloads()); + assertEquals(DocValuesType.NONE, fieldType.docValuesType()); + + assertEquals(new BytesRef("abc"), fields[1].binaryValue()); + fieldType = fields[1].fieldType(); + assertEquals(IndexOptions.NONE, fieldType.indexOptions()); + assertEquals(DocValuesType.SORTED_SET, fieldType.docValuesType()); + } + + public void testNullValue() throws IOException { + DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); + ParsedDocument doc = mapper.parse(source(b -> b.nullField("field"))); + assertArrayEquals(new IndexableField[0], doc.rootDoc().getFields("field")); + + mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard").field("null_value", "uri").field("doc_values", true))); + doc = mapper.parse(source(b -> {})); + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(0, fields.length); + doc = mapper.parse(source(b -> b.nullField("field"))); + fields = doc.rootDoc().getFields("field"); + assertEquals(2, fields.length); + assertTrue(fields[0] instanceof Field); + Field textField = 
(Field) fields[0]; + List terms = new ArrayList<>(); + try (TokenStream tokenStream = textField.tokenStreamValue()) { + tokenStream.reset(); + CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); + while (tokenStream.incrementToken()) { + terms.add(charTermAttribute.toString()); + } + } + assertEquals( + List.of( + WildcardFieldTypeTests.prefixAnchored("u"), + WildcardFieldTypeTests.prefixAnchored("ur"), + "u", + "ur", + "uri", + "r", + "ri", + WildcardFieldTypeTests.suffixAnchored("ri"), + "i", + WildcardFieldTypeTests.suffixAnchored("i") + ), + terms + ); + assertEquals(new BytesRef("uri"), fields[1].binaryValue()); + assertEquals(IndexOptions.NONE, fields[1].fieldType().indexOptions()); + assertEquals(DocValuesType.SORTED_SET, fields[1].fieldType().docValuesType()); + } + + public void testDefaults() throws Exception { + XContentBuilder mapping = fieldMapping(this::minimalMapping); + DocumentMapper mapper = createDocumentMapper(mapping); + assertEquals(mapping.toString(), mapper.mappingSource().toString()); + + ParsedDocument doc = mapper.parse(source(b -> b.field("field", "1234"))); + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(1, fields.length); + + assertTrue(fields[0] instanceof Field); + Field textField = (Field) fields[0]; + List terms = new ArrayList<>(); + try (TokenStream tokenStream = textField.tokenStreamValue()) { + tokenStream.reset(); + CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); + while (tokenStream.incrementToken()) { + terms.add(charTermAttribute.toString()); + } + } + assertEquals( + List.of( + WildcardFieldTypeTests.prefixAnchored("1"), + WildcardFieldTypeTests.prefixAnchored("12"), + "1", + "12", + "123", + "2", + "23", + "234", + "3", + "34", + WildcardFieldTypeTests.suffixAnchored("34"), + "4", + WildcardFieldTypeTests.suffixAnchored("4") + ), + terms + ); + IndexableFieldType fieldType = fields[0].fieldType(); + 
assertTrue(fieldType.omitNorms()); + assertTrue(fieldType.tokenized()); + assertFalse(fieldType.stored()); + assertEquals(IndexOptions.DOCS, fieldType.indexOptions()); + assertFalse(fieldType.storeTermVectors()); + assertFalse(fieldType.storeTermVectorOffsets()); + assertFalse(fieldType.storeTermVectorPositions()); + assertFalse(fieldType.storeTermVectorPayloads()); + assertEquals(DocValuesType.NONE, fieldType.docValuesType()); + } + + public void testFetchSourceValue() throws IOException { + Settings settings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT.id).build(); + Mapper.BuilderContext context = new Mapper.BuilderContext(settings, new ContentPath()); + + MappedFieldType mapper = new WildcardFieldMapper.Builder("field").build(context).fieldType(); + assertEquals(Collections.singletonList("value"), fetchSourceValue(mapper, "value")); + assertEquals(Collections.singletonList("42"), fetchSourceValue(mapper, 42L)); + assertEquals(Collections.singletonList("true"), fetchSourceValue(mapper, true)); + + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> fetchSourceValue(mapper, "value", "format")); + assertEquals("Field [field] of type [wildcard] doesn't support formats.", e.getMessage()); + + MappedFieldType ignoreAboveMapper = new WildcardFieldMapper.Builder("field").ignoreAbove(4).build(context).fieldType(); + assertEquals(Collections.emptyList(), fetchSourceValue(ignoreAboveMapper, "value")); + assertEquals(Collections.singletonList("42"), fetchSourceValue(ignoreAboveMapper, 42L)); + assertEquals(Collections.singletonList("true"), fetchSourceValue(ignoreAboveMapper, true)); + + MappedFieldType normalizerMapper = new WildcardFieldMapper.Builder("field", createIndexAnalyzers(null)).normalizer("lowercase") + .build(context) + .fieldType(); + assertEquals(Collections.singletonList("value"), fetchSourceValue(normalizerMapper, "VALUE")); + assertEquals(Collections.singletonList("42"), 
fetchSourceValue(normalizerMapper, 42L)); + assertEquals(Collections.singletonList("value"), fetchSourceValue(normalizerMapper, "value")); + + MappedFieldType nullValueMapper = new WildcardFieldMapper.Builder("field").nullValue("NULL").build(context).fieldType(); + assertEquals(Collections.singletonList("NULL"), fetchSourceValue(nullValueMapper, null)); + } +} diff --git a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java new file mode 100644 index 0000000000000..cd2a23cf94c37 --- /dev/null +++ b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java @@ -0,0 +1,176 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.mapper; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; + +import java.util.HashSet; +import java.util.Set; + +public class WildcardFieldTypeTests extends FieldTypeTestCase { + + static String prefixAnchored(String val) { + return (char) 0 + val; + } + + static String suffixAnchored(String val) { + return val + (char) 0; + } + + public void testTermQuery() { + MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field"); + Set expectedTerms = new HashSet<>(); + expectedTerms.add(prefixAnchored("ap")); + expectedTerms.add("app"); + expectedTerms.add("ppl"); + expectedTerms.add("ple"); + expectedTerms.add(suffixAnchored("le")); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (String term : expectedTerms) { + builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); + } + Query actual = ft.termQuery("apple", null); + 
assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "apple"), actual); + WildcardFieldMapper.WildcardMatchingQuery actualTermQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual; + assertTrue(actualTermQuery.getSecondPhaseMatcher().test("apple")); + assertFalse(actualTermQuery.getSecondPhaseMatcher().test("Apple")); + assertFalse(actualTermQuery.getSecondPhaseMatcher().test("flapple")); + assertFalse(actualTermQuery.getSecondPhaseMatcher().test("apples")); + } + + public void testWildcardQuery() { + MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field"); + Set expectedTerms = new HashSet<>(); + expectedTerms.add(prefixAnchored("ap")); + expectedTerms.add("app"); + expectedTerms.add("ppl"); + expectedTerms.add("ple"); + expectedTerms.add(suffixAnchored("le")); + + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (String term : expectedTerms) { + builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); + } + + assertEquals( + new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "apple"), + ft.wildcardQuery("apple", null, null) + ); + + expectedTerms.remove(prefixAnchored("ap")); + builder = new BooleanQuery.Builder(); + for (String term : expectedTerms) { + builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); + } + assertEquals( + new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "*apple"), + ft.wildcardQuery("*apple", null, null) + ); + + expectedTerms.remove(suffixAnchored("le")); + builder = new BooleanQuery.Builder(); + for (String term : expectedTerms) { + builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); + } + assertEquals( + new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "*apple*"), + ft.wildcardQuery("*apple*", null, null) + ); + } + + public void testMultipleWildcardsInQuery() { + final String pattern = "a?cd*efg?h"; + MappedFieldType ft = 
new WildcardFieldMapper.WildcardFieldType("field"); + Set expectedTerms = new HashSet<>(); + expectedTerms.add(prefixAnchored("a")); + expectedTerms.add("cd"); + expectedTerms.add("efg"); + expectedTerms.add(suffixAnchored("h")); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (String term : expectedTerms) { + builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); + } + + Query actual = ft.wildcardQuery(pattern, null, null); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), pattern), actual); + WildcardFieldMapper.WildcardMatchingQuery actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual; + assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdzzzefgqh")); + assertFalse(actualMatchingQuery.getSecondPhaseMatcher().test("abcdzzzefgqqh")); + } + + public void testRegexpQuery() { + String pattern = ".*apple.*"; + MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field"); + + Set expectedTerms = new HashSet<>(); + expectedTerms.add("app"); + expectedTerms.add("ppl"); + expectedTerms.add("ple"); + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (String term : expectedTerms) { + builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); + } + + Query actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "/" + pattern + "/"), actual); + WildcardFieldMapper.WildcardMatchingQuery actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual; + assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apple_foo")); + assertFalse(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apply_foo")); + + pattern = "ab(zz|cd|ef.*)(hi|jk)"; + builder = new BooleanQuery.Builder(); + builder.add(new TermQuery(new Term("field", "ab")), BooleanClause.Occur.FILTER); + builder.add( + new BooleanQuery.Builder().add(new 
TermQuery(new Term("field", "zz")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("field", "cd")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("field", "ef")), BooleanClause.Occur.SHOULD) + .build(), + BooleanClause.Occur.FILTER + ); + builder.add( + new BooleanQuery.Builder().add(new TermQuery(new Term("field", "hi")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("field", "jk")), BooleanClause.Occur.SHOULD) + .build(), + BooleanClause.Occur.FILTER + ); + actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "/" + pattern + "/"), actual); + actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual; + assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdjk")); + assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abefqwertyhi")); + } + + public void testWildcardMatchAll() { + String pattern = "???"; + MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field"); + Query actual = ft.wildcardQuery(pattern, null, null); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", ft.existsQuery(null), "???"), actual); + + pattern = "*"; + actual = ft.wildcardQuery(pattern, null, null); + assertEquals(ft.existsQuery(null), actual); + } + + public void testRegexpMatchAll() { + // The following matches any string of length exactly 3. We do need to evaluate the predicate. + String pattern = "..."; + MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field"); + Query actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null); + assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", ft.existsQuery(null), "/.../"), actual); + + // The following pattern has a predicate that matches everything. We can just return the field exists query. + pattern = ".*"; + actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null); + assertEquals(ft.existsQuery(null), actual); + } +}