Skip to content

Commit 0791f93

Browse files
authored
Add an option to split keyword field on whitespace at query time (#30691)
This change adds an option named `split_queries_on_whitespace` to the `keyword` field type. When set to `true`, full text queries (`match`, `multi_match`, `query_string`, ...) that target the field will split the input on whitespace to build the query terms. Defaults to `false`. Closes #30393
1 parent cea3c28 commit 0791f93

File tree

11 files changed

+227
-27
lines changed

11 files changed

+227
-27
lines changed

docs/reference/mapping/types/keyword.asciidoc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,12 @@ The following parameters are accepted by `keyword` fields:
103103
How to pre-process the keyword prior to indexing. Defaults to `null`,
104104
meaning the keyword is kept as-is.
105105

106+
`split_queries_on_whitespace`::
107+
108+
Whether <<full-text-queries,full text queries>> should split the input on whitespace
109+
when building a query for this field.
110+
Accepts `true` or `false` (default).
111+
106112
NOTE: Indexes imported from 2.x do not support `keyword`. Instead they will
107113
attempt to downgrade `keyword` into `string`. This allows you to merge modern
108114
mappings with legacy mappings. Long lived indexes will have to be recreated

server/src/main/java/org/elasticsearch/cluster/metadata/MetaDataIndexUpgradeService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ public Set<Entry<String, NamedAnalyzer>> entrySet() {
186186
return Collections.emptySet();
187187
}
188188
};
189-
try (IndexAnalyzers fakeIndexAnalzyers = new IndexAnalyzers(indexSettings, fakeDefault, fakeDefault, fakeDefault, analyzerMap, analyzerMap)) {
189+
try (IndexAnalyzers fakeIndexAnalzyers = new IndexAnalyzers(indexSettings, fakeDefault, fakeDefault, fakeDefault, analyzerMap, analyzerMap, analyzerMap)) {
190190
MapperService mapperService = new MapperService(indexSettings, fakeIndexAnalzyers, xContentRegistry, similarityService,
191191
mapperRegistry, () -> null);
192192
mapperService.merge(indexMetaData, MapperService.MergeReason.MAPPING_RECOVERY);

server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.elasticsearch.index.analysis;
2020

2121
import org.apache.lucene.analysis.Analyzer;
22+
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
2223
import org.elasticsearch.core.internal.io.IOUtils;
2324
import org.elasticsearch.ElasticsearchException;
2425
import org.elasticsearch.Version;
@@ -453,13 +454,16 @@ public IndexAnalyzers build(IndexSettings indexSettings,
453454
analyzerProviders = new HashMap<>(analyzerProviders);
454455
Map<String, NamedAnalyzer> analyzers = new HashMap<>();
455456
Map<String, NamedAnalyzer> normalizers = new HashMap<>();
457+
Map<String, NamedAnalyzer> whitespaceNormalizers = new HashMap<>();
456458
for (Map.Entry<String, AnalyzerProvider<?>> entry : analyzerProviders.entrySet()) {
457459
processAnalyzerFactory(indexSettings, entry.getKey(), entry.getValue(), analyzers,
458460
tokenFilterFactoryFactories, charFilterFactoryFactories, tokenizerFactoryFactories);
459461
}
460462
for (Map.Entry<String, AnalyzerProvider<?>> entry : normalizerProviders.entrySet()) {
461463
processNormalizerFactory(entry.getKey(), entry.getValue(), normalizers,
462-
tokenizerFactoryFactories.get("keyword"), tokenFilterFactoryFactories, charFilterFactoryFactories);
464+
"keyword", tokenizerFactoryFactories.get("keyword"), tokenFilterFactoryFactories, charFilterFactoryFactories);
465+
processNormalizerFactory(entry.getKey(), entry.getValue(), whitespaceNormalizers,
466+
"whitespace", () -> new WhitespaceTokenizer(), tokenFilterFactoryFactories, charFilterFactoryFactories);
463467
}
464468

465469
if (!analyzers.containsKey("default")) {
@@ -489,7 +493,7 @@ public IndexAnalyzers build(IndexSettings indexSettings,
489493
}
490494
}
491495
return new IndexAnalyzers(indexSettings, defaultAnalyzer, defaultSearchAnalyzer, defaultSearchQuoteAnalyzer,
492-
unmodifiableMap(analyzers), unmodifiableMap(normalizers));
496+
unmodifiableMap(analyzers), unmodifiableMap(normalizers), unmodifiableMap(whitespaceNormalizers));
493497
}
494498

495499
private void processAnalyzerFactory(IndexSettings indexSettings,
@@ -545,15 +549,16 @@ private void processNormalizerFactory(
545549
String name,
546550
AnalyzerProvider<?> normalizerFactory,
547551
Map<String, NamedAnalyzer> normalizers,
548-
TokenizerFactory keywordTokenizerFactory,
552+
String tokenizerName,
553+
TokenizerFactory tokenizerFactory,
549554
Map<String, TokenFilterFactory> tokenFilters,
550555
Map<String, CharFilterFactory> charFilters) {
551-
if (keywordTokenizerFactory == null) {
556+
if (tokenizerFactory == null) {
552557
throw new IllegalStateException("keyword tokenizer factory is null, normalizers require analysis-common module");
553558
}
554559

555560
if (normalizerFactory instanceof CustomNormalizerProvider) {
556-
((CustomNormalizerProvider) normalizerFactory).build(keywordTokenizerFactory, charFilters, tokenFilters);
561+
((CustomNormalizerProvider) normalizerFactory).build(tokenizerName, tokenizerFactory, charFilters, tokenFilters);
557562
}
558563
Analyzer normalizerF = normalizerFactory.get();
559564
if (normalizerF == null) {

server/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,14 @@ public final class CustomNormalizerProvider extends AbstractIndexAnalyzerProvide
3838
private CustomAnalyzer customAnalyzer;
3939

4040
public CustomNormalizerProvider(IndexSettings indexSettings,
41-
String name, Settings settings) {
41+
String name, Settings settings) {
4242
super(indexSettings, name, settings);
4343
this.analyzerSettings = settings;
4444
}
4545

46-
public void build(final TokenizerFactory keywordTokenizerFactory, final Map<String, CharFilterFactory> charFilters,
46+
public void build(final String tokenizerName, final TokenizerFactory tokenizerFactory, final Map<String, CharFilterFactory> charFilters,
4747
final Map<String, TokenFilterFactory> tokenFilters) {
48-
String tokenizerName = analyzerSettings.get("tokenizer");
49-
if (tokenizerName != null) {
48+
if (analyzerSettings.get("tokenizer") != null) {
5049
throw new IllegalArgumentException("Custom normalizer [" + name() + "] cannot configure a tokenizer");
5150
}
5251

@@ -82,8 +81,8 @@ public void build(final TokenizerFactory keywordTokenizerFactory, final Map<Stri
8281
}
8382

8483
this.customAnalyzer = new CustomAnalyzer(
85-
"keyword",
86-
keywordTokenizerFactory,
84+
tokenizerName,
85+
tokenizerFactory,
8786
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
8887
tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()])
8988
);

server/src/main/java/org/elasticsearch/index/analysis/IndexAnalyzers.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,12 @@ public final class IndexAnalyzers extends AbstractIndexComponent implements Clos
4040
private final NamedAnalyzer defaultSearchQuoteAnalyzer;
4141
private final Map<String, NamedAnalyzer> analyzers;
4242
private final Map<String, NamedAnalyzer> normalizers;
43+
private final Map<String, NamedAnalyzer> whitespaceNormalizers;
4344
private final IndexSettings indexSettings;
4445

4546
public IndexAnalyzers(IndexSettings indexSettings, NamedAnalyzer defaultIndexAnalyzer, NamedAnalyzer defaultSearchAnalyzer,
4647
NamedAnalyzer defaultSearchQuoteAnalyzer, Map<String, NamedAnalyzer> analyzers,
47-
Map<String, NamedAnalyzer> normalizers) {
48+
Map<String, NamedAnalyzer> normalizers, Map<String, NamedAnalyzer> whitespaceNormalizers) {
4849
super(indexSettings);
4950
if (defaultIndexAnalyzer.name().equals("default") == false) {
5051
throw new IllegalStateException("default analyzer must have the name [default] but was: [" + defaultIndexAnalyzer.name() + "]");
@@ -54,6 +55,7 @@ public IndexAnalyzers(IndexSettings indexSettings, NamedAnalyzer defaultIndexAna
5455
this.defaultSearchQuoteAnalyzer = defaultSearchQuoteAnalyzer;
5556
this.analyzers = analyzers;
5657
this.normalizers = normalizers;
58+
this.whitespaceNormalizers = whitespaceNormalizers;
5759
this.indexSettings = indexSettings;
5860
}
5961

@@ -71,6 +73,13 @@ public NamedAnalyzer getNormalizer(String name) {
7173
return normalizers.get(name);
7274
}
7375

76+
/**
77+
* Returns a normalizer that splits on whitespace mapped to the given name or <code>null</code> if not present
78+
*/
79+
public NamedAnalyzer getWhitespaceNormalizer(String name) {
80+
return whitespaceNormalizers.get(name);
81+
}
82+
7483
/**
7584
* Returns the default index analyzer for this index
7685
*/

server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.elasticsearch.index.mapper;
2121

2222
import org.apache.lucene.analysis.TokenStream;
23+
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
2324
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2425
import org.apache.lucene.document.Field;
2526
import org.apache.lucene.document.SortedSetDocValuesField;
@@ -35,6 +36,8 @@
3536
import org.elasticsearch.common.xcontent.XContentBuilder;
3637
import org.elasticsearch.common.xcontent.XContentParser;
3738
import org.elasticsearch.common.xcontent.support.XContentMapValues;
39+
import org.elasticsearch.index.analysis.AnalyzerScope;
40+
import org.elasticsearch.index.analysis.IndexAnalyzers;
3841
import org.elasticsearch.index.analysis.NamedAnalyzer;
3942
import org.elasticsearch.index.fielddata.IndexFieldData;
4043
import org.elasticsearch.index.fielddata.plain.DocValuesIndexFieldData;
@@ -73,6 +76,8 @@ public static class Builder extends FieldMapper.Builder<Builder, KeywordFieldMap
7376

7477
protected String nullValue = Defaults.NULL_VALUE;
7578
protected int ignoreAbove = Defaults.IGNORE_ABOVE;
79+
private IndexAnalyzers indexAnalyzers;
80+
private String normalizerName;
7681

7782
public Builder(String name) {
7883
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
@@ -106,15 +111,36 @@ public Builder eagerGlobalOrdinals(boolean eagerGlobalOrdinals) {
106111
return builder;
107112
}
108113

109-
public Builder normalizer(NamedAnalyzer normalizer) {
110-
fieldType().setNormalizer(normalizer);
111-
fieldType().setSearchAnalyzer(normalizer);
114+
public Builder splitQueriesOnWhitespace(boolean splitQueriesOnWhitespace) {
115+
fieldType().setSplitQueriesOnWhitespace(splitQueriesOnWhitespace);
116+
return builder;
117+
}
118+
119+
public Builder normalizer(IndexAnalyzers indexAnalyzers, String name) {
120+
this.indexAnalyzers = indexAnalyzers;
121+
this.normalizerName = name;
112122
return builder;
113123
}
114124

115125
@Override
116126
public KeywordFieldMapper build(BuilderContext context) {
117127
setupFieldType(context);
128+
if (normalizerName != null) {
129+
NamedAnalyzer normalizer = indexAnalyzers.getNormalizer(normalizerName);
130+
if (normalizer == null) {
131+
throw new MapperParsingException("normalizer [" + normalizerName + "] not found for field [" + name + "]");
132+
}
133+
fieldType().setNormalizer(normalizer);
134+
final NamedAnalyzer searchAnalyzer;
135+
if (fieldType().splitQueriesOnWhitespace) {
136+
searchAnalyzer = indexAnalyzers.getWhitespaceNormalizer(normalizerName);
137+
} else {
138+
searchAnalyzer = normalizer;
139+
}
140+
fieldType().setSearchAnalyzer(searchAnalyzer);
141+
} else if (fieldType().splitQueriesOnWhitespace) {
142+
fieldType().setSearchAnalyzer(new NamedAnalyzer("whitespace", AnalyzerScope.INDEX, new WhitespaceAnalyzer()));
143+
}
118144
return new KeywordFieldMapper(
119145
name, fieldType, defaultFieldType, ignoreAbove,
120146
context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
@@ -147,13 +173,12 @@ public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserCo
147173
iterator.remove();
148174
} else if (propName.equals("normalizer")) {
149175
if (propNode != null) {
150-
NamedAnalyzer normalizer = parserContext.getIndexAnalyzers().getNormalizer(propNode.toString());
151-
if (normalizer == null) {
152-
throw new MapperParsingException("normalizer [" + propNode.toString() + "] not found for field [" + name + "]");
153-
}
154-
builder.normalizer(normalizer);
176+
builder.normalizer(parserContext.getIndexAnalyzers(), propNode.toString());
155177
}
156178
iterator.remove();
179+
} else if (propName.equals("split_queries_on_whitespace")) {
180+
builder.splitQueriesOnWhitespace(XContentMapValues.nodeBooleanValue(propNode, "split_queries_on_whitespace"));
181+
iterator.remove();
157182
}
158183
}
159184
return builder;
@@ -163,6 +188,7 @@ public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserCo
163188
public static final class KeywordFieldType extends StringFieldType {
164189

165190
private NamedAnalyzer normalizer = null;
191+
private boolean splitQueriesOnWhitespace;
166192

167193
public KeywordFieldType() {
168194
setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
@@ -172,6 +198,7 @@ public KeywordFieldType() {
172198
protected KeywordFieldType(KeywordFieldType ref) {
173199
super(ref);
174200
this.normalizer = ref.normalizer;
201+
this.splitQueriesOnWhitespace = splitQueriesOnWhitespace;
175202
}
176203

177204
public KeywordFieldType clone() {
@@ -183,7 +210,9 @@ public boolean equals(Object o) {
183210
if (super.equals(o) == false) {
184211
return false;
185212
}
186-
return Objects.equals(normalizer, ((KeywordFieldType) o).normalizer);
213+
KeywordFieldType other = (KeywordFieldType) o;
214+
return Objects.equals(normalizer, other.normalizer) &&
215+
splitQueriesOnWhitespace == other.splitQueriesOnWhitespace;
187216
}
188217

189218
@Override
@@ -197,7 +226,7 @@ public void checkCompatibility(MappedFieldType otherFT, List<String> conflicts)
197226

198227
@Override
199228
public int hashCode() {
200-
return 31 * super.hashCode() + Objects.hashCode(normalizer);
229+
return 31 * super.hashCode() + Objects.hash(normalizer, splitQueriesOnWhitespace);
201230
}
202231

203232
@Override
@@ -214,6 +243,15 @@ public void setNormalizer(NamedAnalyzer normalizer) {
214243
this.normalizer = normalizer;
215244
}
216245

246+
public boolean splitQueriesOnWhitespace() {
247+
return splitQueriesOnWhitespace;
248+
}
249+
250+
public void setSplitQueriesOnWhitespace(boolean splitQueriesOnWhitespace) {
251+
checkIfFrozen();
252+
this.splitQueriesOnWhitespace = splitQueriesOnWhitespace;
253+
}
254+
217255
@Override
218256
public Query existsQuery(QueryShardContext context) {
219257
if (hasDocValues()) {
@@ -263,7 +301,8 @@ protected BytesRef indexedValueForSearch(Object value) {
263301
private int ignoreAbove;
264302

265303
protected KeywordFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
266-
int ignoreAbove, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
304+
int ignoreAbove, Settings indexSettings,
305+
MultiFields multiFields, CopyTo copyTo) {
267306
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
268307
assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
269308
this.ignoreAbove = ignoreAbove;
@@ -366,5 +405,9 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults,
366405
} else if (includeDefaults) {
367406
builder.nullField("normalizer");
368407
}
408+
409+
if (includeDefaults || fieldType().splitQueriesOnWhitespace) {
410+
builder.field("split_queries_on_whitespace", fieldType().splitQueriesOnWhitespace);
411+
}
369412
}
370413
}

server/src/main/java/org/elasticsearch/index/search/MatchQuery.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
import org.elasticsearch.common.lucene.search.Queries;
5353
import org.elasticsearch.common.unit.Fuzziness;
5454
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
55+
import org.elasticsearch.index.mapper.KeywordFieldMapper;
5556
import org.elasticsearch.index.mapper.MappedFieldType;
5657
import org.elasticsearch.index.query.QueryShardContext;
5758
import org.elasticsearch.index.query.support.QueryParsers;
@@ -262,7 +263,8 @@ public Query parse(Type type, String fieldName, Object value) throws IOException
262263
* passing through QueryBuilder.
263264
*/
264265
boolean noForcedAnalyzer = this.analyzer == null;
265-
if (fieldType.tokenized() == false && noForcedAnalyzer) {
266+
if (fieldType.tokenized() == false && noForcedAnalyzer &&
267+
fieldType instanceof KeywordFieldMapper.KeywordFieldType == false) {
266268
return blendTermQuery(new Term(fieldName, value.toString()), fieldType);
267269
}
268270

server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@ public void testBasics() throws IOException {
5454
assertEquals("my_normalizer", normalizer.name());
5555
assertTokenStreamContents(normalizer.tokenStream("foo", "Cet été-là"), new String[] {"cet été-là"});
5656
assertEquals(new BytesRef("cet été-là"), normalizer.normalize("foo", "Cet été-là"));
57+
58+
normalizer = analysis.indexAnalyzers.getWhitespaceNormalizer("my_normalizer");
59+
assertNotNull(normalizer);
60+
assertEquals("my_normalizer", normalizer.name());
61+
assertTokenStreamContents(normalizer.tokenStream("foo", "Cet été-là"), new String[] {"cet", "été-là"});
62+
assertEquals(new BytesRef("cet été-là"), normalizer.normalize("foo", "Cet été-là"));
5763
}
5864

5965
public void testUnknownType() {
@@ -88,7 +94,13 @@ public void testCharFilters() throws IOException {
8894
NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer");
8995
assertNotNull(normalizer);
9096
assertEquals("my_normalizer", normalizer.name());
91-
assertTokenStreamContents(normalizer.tokenStream("foo", "abc"), new String[] {"zbc"});
97+
assertTokenStreamContents(normalizer.tokenStream("foo", "abc acd"), new String[] {"zbc zcd"});
98+
assertEquals(new BytesRef("zbc"), normalizer.normalize("foo", "abc"));
99+
100+
normalizer = analysis.indexAnalyzers.getWhitespaceNormalizer("my_normalizer");
101+
assertNotNull(normalizer);
102+
assertEquals("my_normalizer", normalizer.name());
103+
assertTokenStreamContents(normalizer.tokenStream("foo", "abc acd"), new String[] {"zbc", "zcd"});
92104
assertEquals(new BytesRef("zbc"), normalizer.normalize("foo", "abc"));
93105
}
94106

0 commit comments

Comments
 (0)