
Commit 5c8fbe6

Author: Lamine Idjeraoui (committed)

Fix "synonym_graph filter fails with word_delimiter_graph when using whitespace or classic tokenizer in synonym_analyzer" bug #18037

Add 'analyzersBuiltSoFar' to getChainAwareTokenFilterFactory to build custom analyzers that depend on other, already built analyzers. The analyzers are built following the order of precedence specified in the settings.

Signed-off-by: Lamine Idjeraoui <lidjeraoui@apple.com>

1 parent 4aff917 commit 5c8fbe6
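
For context, a minimal sketch of the configuration shape this change enables (names are illustrative, borrowed from the test added in this commit; the new 'order' setting controls build precedence, lower values building first):

    import org.opensearch.common.settings.Settings;

    // Sketch only: the synonym filter below can now reference an analyzer
    // defined in the same index settings, as long as that analyzer sorts
    // earlier by 'order'.
    Settings settings = Settings.builder()
        // built first (order=1), so it is available to the synonym filter
        .put("index.analysis.analyzer.simple_synonym_analyzer.type", "custom")
        .put("index.analysis.analyzer.simple_synonym_analyzer.order", "1")
        .put("index.analysis.analyzer.simple_synonym_analyzer.tokenizer", "standard")
        // synonym_graph parses its rules with the already-built analyzer above
        .put("index.analysis.filter.test_synonyms.type", "synonym_graph")
        .putList("index.analysis.filter.test_synonyms.synonyms", "laptop,notebook")
        .put("index.analysis.filter.test_synonyms.synonym_analyzer", "simple_synonym_analyzer")
        .put("index.analysis.filter.test_word_delimiter.type", "word_delimiter_graph")
        // built second (order=2); may now combine word_delimiter_graph with
        // the synonym filter, the combination that previously failed (#18037)
        .put("index.analysis.analyzer.main_analyzer.type", "custom")
        .put("index.analysis.analyzer.main_analyzer.order", "2")
        .put("index.analysis.analyzer.main_analyzer.tokenizer", "standard")
        .putList("index.analysis.analyzer.main_analyzer.filter", "test_word_delimiter", "test_synonyms")
        .build();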

File tree

11 files changed: +243 -38 lines changed

modules/analysis-common/src/main/java/org/opensearch/analysis/common/MultiplexerTokenFilterFactory.java

Lines changed: 17 additions & 3 deletions

@@ -32,6 +32,7 @@
 
 package org.opensearch.analysis.common;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
@@ -77,7 +78,8 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
         TokenizerFactory tokenizer,
         List<CharFilterFactory> charFilters,
         List<TokenFilterFactory> previousTokenFilters,
-        Function<String, TokenFilterFactory> allFilters
+        Function<String, TokenFilterFactory> allFilters,
+        Function<String, Analyzer> analyzersBuiltSoFar
     ) {
         List<TokenFilterFactory> filters = new ArrayList<>();
         if (preserveOriginal) {
@@ -89,15 +91,27 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
             String[] parts = Strings.tokenizeToStringArray(filter, ",");
             if (parts.length == 1) {
                 TokenFilterFactory factory = resolveFilterFactory(allFilters, parts[0]);
-                factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters);
+                factory = factory.getChainAwareTokenFilterFactory(
+                    tokenizer,
+                    charFilters,
+                    previousTokenFilters,
+                    allFilters,
+                    analyzersBuiltSoFar
+                );
                 filters.add(factory);
                 mode = mode.merge(factory.getAnalysisMode());
             } else {
                 List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
                 List<TokenFilterFactory> chain = new ArrayList<>();
                 for (String subfilter : parts) {
                     TokenFilterFactory factory = resolveFilterFactory(allFilters, subfilter);
-                    factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
+                    factory = factory.getChainAwareTokenFilterFactory(
+                        tokenizer,
+                        charFilters,
+                        existingChain,
+                        allFilters,
+                        analyzersBuiltSoFar
+                    );
                     chain.add(factory);
                     existingChain.add(factory);
                     mode = mode.merge(factory.getAnalysisMode());
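
The extra parameter is only threaded through here; per the registry changes below, it is a plain lookup that returns null for analyzers not yet built. A minimal sketch of that contract, assuming the map-backed wiring used in AnalysisRegistry (the adapter itself lives in code not shown in this diff):

    import java.util.HashMap;
    import java.util.Map;
    import java.util.function.Function;

    import org.apache.lucene.analysis.Analyzer;

    // AnalysisRegistry.build fills this map as each analyzer is produced.
    Map<String, Analyzer> built = new HashMap<>();
    // Map::get returns null for names not yet built, which is exactly the
    // signal SynonymTokenFilterFactory checks for.
    Function<String, Analyzer> analyzersBuiltSoFar = built::get;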

modules/analysis-common/src/main/java/org/opensearch/analysis/common/ScriptedConditionTokenFilterFactory.java

Lines changed: 4 additions & 2 deletions

@@ -32,6 +32,7 @@
 
 package org.opensearch.analysis.common;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
 import org.opensearch.common.settings.Settings;
@@ -84,7 +85,8 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
         TokenizerFactory tokenizer,
         List<CharFilterFactory> charFilters,
         List<TokenFilterFactory> previousTokenFilters,
-        Function<String, TokenFilterFactory> allFilters
+        Function<String, TokenFilterFactory> allFilters,
+        Function<String, Analyzer> analyzersBuiltSoFar
     ) {
         List<TokenFilterFactory> filters = new ArrayList<>();
         List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
@@ -95,7 +97,7 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
                     "ScriptedConditionTokenFilter [" + name() + "] refers to undefined token filter [" + filter + "]"
                 );
             }
-            tff = tff.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
+            tff = tff.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters, analyzersBuiltSoFar);
             filters.add(tff);
             existingChain.add(tff);
         }

modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java

Lines changed: 3 additions & 2 deletions

@@ -70,9 +70,10 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
         TokenizerFactory tokenizer,
         List<CharFilterFactory> charFilters,
         List<TokenFilterFactory> previousTokenFilters,
-        Function<String, TokenFilterFactory> allFilters
+        Function<String, TokenFilterFactory> allFilters,
+        Function<String, Analyzer> analyzersBuiltSoFar
     ) {
-        final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
+        final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters, analyzersBuiltSoFar);
         final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
         final String name = name();
         return new TokenFilterFactory() {

modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java

Lines changed: 10 additions & 4 deletions

@@ -112,9 +112,10 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
         TokenizerFactory tokenizer,
         List<CharFilterFactory> charFilters,
         List<TokenFilterFactory> previousTokenFilters,
-        Function<String, TokenFilterFactory> allFilters
+        Function<String, TokenFilterFactory> allFilters,
+        Function<String, Analyzer> analyzersBuiltSoFar
     ) {
-        final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
+        final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters, analyzersBuiltSoFar);
         final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
         final String name = name();
         return new TokenFilterFactory() {
@@ -147,10 +148,15 @@ Analyzer buildSynonymAnalyzer(
         TokenizerFactory tokenizer,
         List<CharFilterFactory> charFilters,
         List<TokenFilterFactory> tokenFilters,
-        Function<String, TokenFilterFactory> allFilters
+        Function<String, TokenFilterFactory> allFilters,
+        Function<String, Analyzer> analyzersBuiltSoFar
     ) {
         if (synonymAnalyzerName != null) {
-            Analyzer customSynonymAnalyzer;
+            // first, check analyzers defined in the index settings
+            Analyzer customSynonymAnalyzer = analyzersBuiltSoFar.apply(synonymAnalyzerName);
+            if (customSynonymAnalyzer != null) {
+                return customSynonymAnalyzer;
+            }
             try {
                 customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzerName);
             } catch (IOException e) {
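
The resolution order above, condensed into a standalone sketch (simplified; the real method also validates disallowed filters and wraps IO errors):

    import java.io.IOException;
    import java.util.function.Function;

    import org.apache.lucene.analysis.Analyzer;
    import org.opensearch.index.analysis.AnalysisRegistry;

    // Sketch: settings-defined analyzers built earlier in this pass win;
    // the registry (pre-built and global analyzers) is the fallback.
    static Analyzer resolveSynonymAnalyzer(
        String synonymAnalyzerName,
        Function<String, Analyzer> analyzersBuiltSoFar,
        AnalysisRegistry analysisRegistry
    ) throws IOException {
        Analyzer custom = analyzersBuiltSoFar.apply(synonymAnalyzerName);
        if (custom != null) {
            return custom; // built before us thanks to its lower 'order'
        }
        return analysisRegistry.getAnalyzer(synonymAnalyzerName);
    }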

modules/analysis-common/src/test/java/org/opensearch/analysis/common/EdgeNGramTokenizerTests.java

Lines changed: 9 additions & 4 deletions

@@ -49,11 +49,9 @@
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.Arrays;
 import java.util.Collections;
 
-import static org.hamcrest.Matchers.containsString;
-import static org.hamcrest.Matchers.hasToString;
-
 public class EdgeNGramTokenizerTests extends OpenSearchTokenStreamTestCase {
 
     private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
@@ -99,7 +97,14 @@ public void testPreConfiguredTokenizer() throws IOException {
             IllegalArgumentException.class,
             () -> buildAnalyzers(VersionUtils.randomVersionBetween(random(), Version.V_3_0_0, Version.CURRENT), "edgeNGram")
         );
-        assertThat(e, hasToString(containsString("The [edgeNGram] tokenizer name was deprecated pre 1.0.")));
+
+        boolean found = Arrays.stream(e.getSuppressed())
+            .map(org.opensearch.ExceptionsHelper::unwrapCause)
+            .map(Throwable::getMessage)
+            .findFirst()
+            .get()
+            .contains("The [edgeNGram] tokenizer name was deprecated pre 1.0.");
+        assertTrue("expected deprecation message in suppressed causes", found);
     }
 }
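
The chain above assumes at least one suppressed cause exists (findFirst().get() would otherwise throw NoSuchElementException). A slightly more defensive variant of the same check, as a sketch:

    import java.util.Arrays;

    import org.opensearch.ExceptionsHelper;

    // Variant of the assertion above: scan every suppressed cause rather
    // than assuming the first one carries the message.
    static boolean hasSuppressedMessage(Exception e, String fragment) {
        return Arrays.stream(e.getSuppressed())
            .map(ExceptionsHelper::unwrapCause)
            .map(Throwable::getMessage)
            .anyMatch(m -> m != null && m.contains(fragment));
    }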

modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java

Lines changed: 48 additions & 4 deletions

@@ -120,7 +120,7 @@ public void testSynonymWordDeleteByAnalyzer() throws IOException {
             fail("fail! due to synonym word deleted by analyzer");
         } catch (Exception e) {
             assertThat(e, instanceOf(IllegalArgumentException.class));
-            assertThat(e.getMessage(), startsWith("Failed to build synonyms"));
+            assertThat(e.getMessage(), startsWith("Failed to build analyzers: [synonymAnalyzerWithStopSynonymBeforeSynonym]"));
         }
     }
 
@@ -141,7 +141,7 @@ public void testExpandSynonymWordDeleteByAnalyzer() throws IOException {
             fail("fail! due to synonym word deleted by analyzer");
         } catch (Exception e) {
             assertThat(e, instanceOf(IllegalArgumentException.class));
-            assertThat(e.getMessage(), startsWith("Failed to build synonyms"));
+            assertThat(e.getMessage(), startsWith("Failed to build analyzers: [synonymAnalyzerExpandWithStopBeforeSynonym]"));
         }
     }
 
@@ -269,7 +269,7 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
             TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
             TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
             SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);
-            Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
+            Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null, null);
 
             try (TokenStream ts = analyzer.tokenStream("field", "text")) {
                 assertThat(ts, instanceOf(KeywordTokenizer.class));
@@ -350,7 +350,7 @@ public void testDisallowedTokenFilters() throws IOException {
             IllegalArgumentException e = expectThrows(
                 IllegalArgumentException.class,
                 "Expected IllegalArgumentException for factory " + factory,
-                () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)
+                () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null, null)
             );
 
             assertEquals(factory, "Token filter [" + factory + "] cannot be used to parse synonyms", e.getMessage());
@@ -443,4 +443,48 @@ public void testSynonymAnalyzerWithWordDelimiter() throws IOException {
             assertTokenStreamContents(ts, new String[] { "notebook" }, new int[] { 0 }, new int[] { 6 });
         }
     }
+
+    /**
+     * Test the core dependency resolution issue from GitHub #18037:
+     * synonym_graph with a custom synonym_analyzer should work even when
+     * the main analyzer contains word_delimiter_graph, which would normally
+     * cause a "cannot be used to parse synonyms" error.
+     */
+    public void testSynonymAnalyzerDependencyResolution() throws IOException {
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
+
+            // Main analyzer with word_delimiter. order=2
+            .put("index.analysis.analyzer.main_analyzer.type", "custom")
+            .put("index.analysis.analyzer.main_analyzer.order", "2")
+            .put("index.analysis.analyzer.main_analyzer.tokenizer", "standard")
+            .putList("index.analysis.analyzer.main_analyzer.filter", "lowercase", "test_word_delimiter", "test_synonyms")
+
+            // Problematic filter for synonym parsing
+            .put("index.analysis.filter.test_word_delimiter.type", "word_delimiter_graph")
+            .put("index.analysis.filter.test_word_delimiter.generate_word_parts", true)
+
+            // Custom analyzer dependency. order=1 (built before main_analyzer, whose order=2)
+            .put("index.analysis.analyzer.simple_synonym_analyzer.type", "custom")
+            .put("index.analysis.analyzer.simple_synonym_analyzer.order", "1")
+            .put("index.analysis.analyzer.simple_synonym_analyzer.tokenizer", "standard")
+
+            // Synonym filter that depends on the custom analyzer
+            .put("index.analysis.filter.test_synonyms.type", "synonym_graph")
+            .putList("index.analysis.filter.test_synonyms.synonyms", "laptop,notebook")
+            .put("index.analysis.filter.test_synonyms.synonym_analyzer", "simple_synonym_analyzer")
+            .build();
+
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test_index", settings);
+
+        // Should succeed with the fix (would fail before due to registration order)
+        IndexAnalyzers analyzers = new AnalysisModule(
+            TestEnvironment.newEnvironment(settings),
+            Collections.singletonList(new CommonAnalysisModulePlugin())
+        ).getAnalysisRegistry().build(idxSettings);
+
+        assertNotNull("main_analyzer should be created", analyzers.get("main_analyzer"));
+        assertNotNull("simple_synonym_analyzer should be created", analyzers.get("simple_synonym_analyzer"));
+    }
 }

server/src/main/java/org/opensearch/index/analysis/AnalysisRegistry.java

Lines changed: 61 additions & 12 deletions

@@ -54,6 +54,7 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -305,7 +306,7 @@ public NamedAnalyzer buildCustomAnalyzer(
                 } catch (IOException e) {
                     throw new UncheckedIOException(e);
                 }
-            });
+            }, null);
             tokenFilterFactories.add(tff);
         }
 
@@ -362,7 +363,30 @@ public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings ind
 
     private Map<String, AnalyzerProvider<?>> buildAnalyzerFactories(IndexSettings indexSettings) throws IOException {
         final Map<String, Settings> analyzersSettings = indexSettings.getSettings().getGroups("index.analysis.analyzer");
-        return buildMapping(Component.ANALYZER, indexSettings, analyzersSettings, analyzers, prebuiltAnalysis.analyzerProviderFactories);
+
+        // Some analyzers depend on others that need to be built first.
+        // Sort by 'order', defaulting to 100.
+        List<Map.Entry<String, Settings>> sortedEntries = analyzersSettings.entrySet().stream().sorted((a, b) -> {
+            int orderA = a.getValue().getAsInt("order", 100);
+            int orderB = b.getValue().getAsInt("order", 100);
+            if (orderA != orderB) {
+                return Integer.compare(orderA, orderB);
+            }
+            return a.getKey().compareTo(b.getKey());
+        }).collect(Collectors.toList());
+
+        Map<String, Settings> sortedAnalyzersSettings = new LinkedHashMap<>();
+        for (Map.Entry<String, Settings> entry : sortedEntries) {
+            sortedAnalyzersSettings.put(entry.getKey(), entry.getValue());
+        }
+
+        return buildMapping(
+            Component.ANALYZER,
+            indexSettings,
+            sortedAnalyzersSettings,
+            analyzers,
+            prebuiltAnalysis.analyzerProviderFactories
+        );
     }
 
     private Map<String, AnalyzerProvider<?>> buildNormalizerFactories(IndexSettings indexSettings) throws IOException {
@@ -486,7 +510,7 @@ private <T> Map<String, T> buildMapping(
         Map<String, ? extends AnalysisModule.AnalysisProvider<T>> defaultInstance
     ) throws IOException {
         Settings defaultSettings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, settings.getIndexVersionCreated()).build();
-        Map<String, T> factories = new HashMap<>();
+        Map<String, T> factories = new LinkedHashMap<>(); // keep insertion order
        for (Map.Entry<String, Settings> entry : settingsMap.entrySet()) {
             String name = entry.getKey();
             Settings currentSettings = entry.getValue();
@@ -637,21 +661,27 @@ public IndexAnalyzers build(
         Map<String, NamedAnalyzer> analyzers = new HashMap<>();
         Map<String, NamedAnalyzer> normalizers = new HashMap<>();
         Map<String, NamedAnalyzer> whitespaceNormalizers = new HashMap<>();
+        Map<String, Exception> buildErrors = new LinkedHashMap<>();
+        Map<String, Analyzer> analyzersBuiltSoFar = new HashMap<>();
         for (Map.Entry<String, AnalyzerProvider<?>> entry : analyzerProviders.entrySet()) {
-            analyzers.merge(
-                entry.getKey(),
-                produceAnalyzer(
+            try {
+                NamedAnalyzer namedAnalyzer = produceAnalyzer(
                     entry.getKey(),
                     entry.getValue(),
                     tokenFilterFactoryFactories,
                     charFilterFactoryFactories,
-                    tokenizerFactoryFactories
-                ),
-                (k, v) -> {
+                    tokenizerFactoryFactories,
+                    analyzersBuiltSoFar
+                );
+                analyzers.merge(entry.getKey(), namedAnalyzer, (k, v) -> {
                     throw new IllegalStateException("already registered analyzer with name: " + entry.getKey());
-                }
-            );
+                });
+                analyzersBuiltSoFar.put(entry.getKey(), namedAnalyzer);
+            } catch (Exception e) {
+                buildErrors.put(entry.getKey(), e);
+            }
         }
+
         for (Map.Entry<String, AnalyzerProvider<?>> entry : normalizerProviders.entrySet()) {
             processNormalizerFactory(
                 entry.getKey(),
@@ -707,6 +737,14 @@ public IndexAnalyzers build(
                 throw new IllegalArgumentException("analyzer name must not start with '_'. got \"" + analyzer.getKey() + "\"");
             }
         }
+
+        if (!buildErrors.isEmpty()) {
+            IllegalArgumentException aggregated = new IllegalArgumentException("Failed to build analyzers: " + buildErrors.keySet());
+            buildErrors.forEach(
+                (name, ex) -> aggregated.addSuppressed(new IllegalArgumentException("[" + name + "] " + ex.getMessage(), ex))
+            );
+            throw aggregated;
+        }
         return new IndexAnalyzers(analyzers, normalizers, whitespaceNormalizers);
     }
 
@@ -716,6 +754,17 @@ private static NamedAnalyzer produceAnalyzer(
         Map<String, TokenFilterFactory> tokenFilters,
         Map<String, CharFilterFactory> charFilters,
         Map<String, TokenizerFactory> tokenizers
+    ) {
+        return produceAnalyzer(name, analyzerFactory, tokenFilters, charFilters, tokenizers, Collections.emptyMap());
+    }
+
+    private static NamedAnalyzer produceAnalyzer(
+        String name,
+        AnalyzerProvider<?> analyzerFactory,
+        Map<String, TokenFilterFactory> tokenFilters,
+        Map<String, CharFilterFactory> charFilters,
+        Map<String, TokenizerFactory> tokenizers,
+        Map<String, Analyzer> analyzersBuiltSoFar
     ) {
         /*
          * Lucene defaults positionIncrementGap to 0 in all analyzers but
@@ -725,7 +774,7 @@ private static NamedAnalyzer produceAnalyzer(
          */
         int overridePositionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
         if (analyzerFactory instanceof CustomAnalyzerProvider) {
-            ((CustomAnalyzerProvider) analyzerFactory).build(tokenizers, charFilters, tokenFilters);
+            ((CustomAnalyzerProvider) analyzerFactory).build(tokenizers, charFilters, tokenFilters, analyzersBuiltSoFar);
             /*
              * Custom analyzers already default to the correct, version
              * dependent positionIncrementGap and the user is able to
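
The comparator semantics introduced in buildAnalyzerFactories, demonstrated in a standalone sketch (values here are illustrative; 100 is the default applied when 'order' is unset, and names break ties):

    import java.util.Comparator;
    import java.util.LinkedHashMap;
    import java.util.Map;

    // Lower 'order' builds first; equal orders fall back to name order.
    Map<String, Integer> orders = new LinkedHashMap<>();
    orders.put("main_analyzer", 2);
    orders.put("simple_synonym_analyzer", 1);
    orders.put("unordered_analyzer", 100); // default applied when unset

    orders.entrySet().stream()
        .sorted(
            Comparator.comparingInt((Map.Entry<String, Integer> e) -> e.getValue())
                .thenComparing(Map.Entry::getKey)
        )
        .forEach(e -> System.out.println(e.getKey()));
    // prints: simple_synonym_analyzer, main_analyzer, unordered_analyzer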
