Check for deprecations when analyzers are built (#50908)
Generally speaking, deprecated analysis components in Elasticsearch only issue deprecation
warnings when they are first used. This means that no warnings are emitted when an index is
created with deprecated components; users have to actually index a document to see them,
which makes the warnings much harder to spot and act on at the appropriate time.

The problem is worse for components that throw exceptions on upgrade: users will not become
aware of the problem until a document is indexed, rather than at index creation time.

This commit adds a new check that pushes an empty string through all user-defined analyzers
and normalizers when the IndexAnalyzers object for an index is built; deprecation warnings
and exceptions are now emitted when indexes are created or opened.

Fixes #42349
romseygeek authored Jan 14, 2020
1 parent b146740 commit 736ed47
Showing 6 changed files with 208 additions and 41 deletions.
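The mechanism is easy to demonstrate in isolation. Below is a minimal, self-contained sketch of the eager check described above, not the commit's own code: it uses Lucene's stock StandardAnalyzer as a stand-in for a user-defined analyzer, and the class name EagerAnalyzerCheck and the main-method harness are illustrative only. The real check is the checkVersions method added to AnalysisRegistry in the diff below.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import java.io.IOException;
import java.io.UncheckedIOException;

public class EagerAnalyzerCheck {

    // Pull an empty string through the analyzer. Any component that logs a
    // deprecation warning or throws when a token stream is built will do so
    // here, at analyzer construction time, rather than when the first
    // document is indexed.
    static void checkVersions(Analyzer analyzer) {
        try (TokenStream ts = analyzer.tokenStream("", "")) {
            ts.reset();
            while (ts.incrementToken()) {
                // no tokens are expected for the empty string; we only want
                // the side effects of building and consuming the stream
            }
            ts.end();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    public static void main(String[] args) {
        // StandardAnalyzer stands in for a user-defined analyzer here; a
        // deprecated or broken component would warn or throw inside checkVersions.
        try (Analyzer analyzer = new StandardAnalyzer()) {
            checkVersions(analyzer);
        }
    }
}

Running this check for every analyzer and normalizer as the IndexAnalyzers object is assembled is what moves the warnings and exceptions from first-document-indexing time to index creation or open time.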
@@ -19,20 +19,15 @@

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;
import java.io.StringReader;
import java.util.Map;

public class CommonAnalysisPluginTests extends ESTestCase {

@@ -51,13 +46,8 @@ public void testNGramFilterInCustomAnalyzerDeprecationError() throws IOException
.build();

try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings),
settings, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));

IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> tokenFilterFactory.create(tokenizer));
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
assertEquals("The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
+ "Please change the filter name to [ngram] instead.", ex.getMessage());
}
@@ -69,12 +59,7 @@ public void testNGramFilterInCustomAnalyzerDeprecationError() throws IOException
.putList("index.analysis.analyzer.custom_analyzer.filter", "my_ngram").put("index.analysis.filter.my_ngram.type", "nGram")
.build();
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settingsPre7),
settingsPre7, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
assertNotNull(tokenFilterFactory.create(tokenizer));
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settingsPre7), settingsPre7, commonAnalysisPlugin);
assertWarnings("The [nGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [ngram] instead.");
}
@@ -95,13 +80,8 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOException
.build();

try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings),
settings, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));

IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> tokenFilterFactory.create(tokenizer));
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
assertEquals("The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
+ "Please change the filter name to [edge_ngram] instead.", ex.getMessage());
}
@@ -116,12 +96,8 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOException
.build();

try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settingsPre7),
settingsPre7, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
assertNotNull(tokenFilterFactory.create(tokenizer));
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settingsPre7),
settingsPre7, commonAnalysisPlugin);
assertWarnings("The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [edge_ngram] instead.");
}
@@ -19,6 +19,7 @@
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.ElasticsearchException;
@@ -536,6 +537,10 @@ public IndexAnalyzers build(IndexSettings indexSettings,
tokenFilterFactoryFactories, charFilterFactoryFactories);
}

for (Analyzer analyzer : normalizers.values()) {
analyzer.normalize("", ""); // check for deprecations
}

if (!analyzers.containsKey(DEFAULT_ANALYZER_NAME)) {
analyzers.put(DEFAULT_ANALYZER_NAME,
produceAnalyzer(DEFAULT_ANALYZER_NAME,
@@ -599,6 +604,7 @@ private static NamedAnalyzer produceAnalyzer(String name,
} else {
analyzer = new NamedAnalyzer(name, analyzerFactory.scope(), analyzerF, overridePositionIncrementGap);
}
checkVersions(analyzer);
return analyzer;
}

@@ -626,4 +632,20 @@ private void processNormalizerFactory(
NamedAnalyzer normalizer = new NamedAnalyzer(name, normalizerFactory.scope(), normalizerF);
normalizers.put(name, normalizer);
}

// Some analysis components emit deprecation warnings or throw exceptions when used
// with the wrong version of elasticsearch. These exceptions and warnings are
// normally thrown when tokenstreams are constructed, which unless we build a
// tokenstream up-front does not happen until a document is indexed. In order to
// surface these warnings or exceptions as early as possible, we build an empty
// tokenstream and pull it through an Analyzer at construction time.
private static void checkVersions(Analyzer analyzer) {
try (TokenStream ts = analyzer.tokenStream("", "")) {
ts.reset();
while (ts.incrementToken()) {}
ts.end();
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
}
@@ -38,6 +38,7 @@
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
@@ -108,6 +109,25 @@ public TokenStream create(TokenStream tokenStream) {
}
}

class DeprecatedTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {

DeprecatedTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
}

@Override
public TokenStream create(TokenStream tokenStream) {
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
return tokenStream;
}

@Override
public TokenStream normalize(TokenStream tokenStream) {
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
return tokenStream;
}
}

class AppendCharFilterFactory extends AbstractCharFilterFactory {

final String suffix;
@@ -136,7 +156,7 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {

@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return singletonMap("mock", MockFactory::new);
return Map.of("mock", MockFactory::new, "deprecated", DeprecatedTokenFilterFactory::new);
}

@Override
@@ -492,4 +512,28 @@ public void testExceedSetMaxTokenLimit() {
assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
+ idxMaxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
}

public void testDeprecationWarnings() throws IOException {
AnalyzeAction.Request req = new AnalyzeAction.Request();
req.tokenizer("standard");
req.addTokenFilter("lowercase");
req.addTokenFilter("deprecated");
req.text("test text");

AnalyzeAction.Response analyze =
TransportAnalyzeAction.analyze(req, registry, mockIndexService(), maxTokenCount);
assertEquals(2, analyze.getTokens().size());
assertWarnings("Using deprecated token filter [deprecated]");

// normalizer
req = new AnalyzeAction.Request();
req.addTokenFilter("lowercase");
req.addTokenFilter("deprecated");
req.text("text");

analyze =
TransportAnalyzeAction.analyze(req, registry, mockIndexService(), maxTokenCount);
assertEquals(1, analyze.getTokens().size());
assertWarnings("Using deprecated token filter [deprecated]");
}
}
@@ -19,6 +19,7 @@
package org.elasticsearch.index;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.index.AssertingDirectoryReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
@@ -442,7 +443,7 @@ public Analyzer get() {
final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
throw new AssertionError("should not be here");
return new TokenStreamComponents(new StandardTokenizer());
}

@Override
@@ -20,12 +20,13 @@
package org.elasticsearch.index.analysis;

import com.carrotsearch.randomizedtesting.generators.RandomPicks;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -108,19 +109,25 @@ public void testOverrideDefaultAnalyzer() throws IOException {
public void testOverrideDefaultAnalyzerWithoutAnalysisModeAll() throws IOException {
Version version = VersionUtils.randomVersion(random());
Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
TokenFilterFactory tokenFilter = new AbstractTokenFilterFactory(IndexSettingsModule.newIndexSettings("index", settings),
"my_filter", Settings.EMPTY) {
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settings);
TokenFilterFactory tokenFilter = new AbstractTokenFilterFactory(indexSettings, "my_filter", Settings.EMPTY) {
@Override
public AnalysisMode getAnalysisMode() {
return randomFrom(AnalysisMode.SEARCH_TIME, AnalysisMode.INDEX_TIME);
}

@Override
public TokenStream create(TokenStream tokenStream) {
return null;
return tokenStream;
}
};
Analyzer analyzer = new CustomAnalyzer(null, new CharFilterFactory[0], new TokenFilterFactory[] { tokenFilter });
TokenizerFactory tokenizer = new AbstractTokenizerFactory(indexSettings, Settings.EMPTY, "my_tokenizer") {
@Override
public Tokenizer create() {
return new StandardTokenizer();
}
};
Analyzer analyzer = new CustomAnalyzer(tokenizer, new CharFilterFactory[0], new TokenFilterFactory[] { tokenFilter });
MapperException ex = expectThrows(MapperException.class,
() -> emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings),
singletonMap("default", new PreBuiltAnalyzerProvider("default", AnalyzerScope.INDEX, analyzer)), emptyMap(),
@@ -264,4 +271,122 @@ public void testEnsureCloseInvocationProperlyDelegated() throws IOException {
registry.close();
verify(mock).close();
}

public void testDeprecationsAndExceptions() throws IOException {

AnalysisPlugin plugin = new AnalysisPlugin() {

class MockFactory extends AbstractTokenFilterFactory {
MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
}

@Override
public TokenStream create(TokenStream tokenStream) {
if (indexSettings.getIndexVersionCreated().equals(Version.CURRENT)) {
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
}
return tokenStream;
}
}

class ExceptionFactory extends AbstractTokenFilterFactory {

ExceptionFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
}

@Override
public TokenStream create(TokenStream tokenStream) {
if (indexSettings.getIndexVersionCreated().equals(Version.CURRENT)) {
throw new IllegalArgumentException("Cannot use token filter [exception]");
}
return tokenStream;
}
}

class UnusedMockFactory extends AbstractTokenFilterFactory {
UnusedMockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
}

@Override
public TokenStream create(TokenStream tokenStream) {
deprecationLogger.deprecated("Using deprecated token filter [unused]");
return tokenStream;
}
}

class NormalizerFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {

NormalizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
}

@Override
public TokenStream create(TokenStream tokenStream) {
deprecationLogger.deprecated("Using deprecated token filter [deprecated_normalizer]");
return tokenStream;
}

}

@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return Map.of("deprecated", MockFactory::new, "unused", UnusedMockFactory::new,
"deprecated_normalizer", NormalizerFactory::new, "exception", ExceptionFactory::new);
}
};

Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
Settings indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("index.analysis.filter.deprecated.type", "deprecated")
.put("index.analysis.analyzer.custom.tokenizer", "standard")
.putList("index.analysis.analyzer.custom.filter", "lowercase", "deprecated")
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);

new AnalysisModule(TestEnvironment.newEnvironment(settings),
singletonList(plugin)).getAnalysisRegistry().build(idxSettings);

// We should only get a warning from the token filter that is referenced in settings
assertWarnings("Using deprecated token filter [deprecated]");

indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.getPreviousVersion())
.put("index.analysis.filter.deprecated.type", "deprecated_normalizer")
.putList("index.analysis.normalizer.custom.filter", "lowercase", "deprecated_normalizer")
.put("index.analysis.filter.deprecated.type", "deprecated")
.put("index.analysis.filter.exception.type", "exception")
.put("index.analysis.analyzer.custom.tokenizer", "standard")
// exception will not throw because we're not on Version.CURRENT
.putList("index.analysis.analyzer.custom.filter", "lowercase", "deprecated", "exception")
.build();
idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);

new AnalysisModule(TestEnvironment.newEnvironment(settings),
singletonList(plugin)).getAnalysisRegistry().build(idxSettings);

// We should only get a warning from the normalizer, because we're on a version where 'deprecated'
// works fine
assertWarnings("Using deprecated token filter [deprecated_normalizer]");

indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("index.analysis.filter.exception.type", "exception")
.put("index.analysis.analyzer.custom.tokenizer", "standard")
// exception will throw because we're on Version.CURRENT
.putList("index.analysis.analyzer.custom.filter", "lowercase", "exception")
.build();
IndexSettings exceptionSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);

IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
new AnalysisModule(TestEnvironment.newEnvironment(settings),
singletonList(plugin)).getAnalysisRegistry().build(exceptionSettings);
});
assertEquals("Cannot use token filter [exception]", e.getMessage());

}
}
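What this changes for users can be sketched end to end, under stated assumptions: a 7.x node on localhost:9200 and the 7.x high-level REST client; the demo class, index name, and analyzer names are illustrative. An index whose custom analyzer references the removed [nGram] filter name now fails at creation time, mirroring the CommonAnalysisPluginTests changes above, instead of when the first document is indexed.

import org.apache.http.HttpHost;
import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.common.settings.Settings;

public class CreateIndexDeprecationDemo {
    public static void main(String[] args) throws Exception {
        try (RestHighLevelClient client = new RestHighLevelClient(
                RestClient.builder(new HttpHost("localhost", 9200, "http")))) {
            Settings settings = Settings.builder()
                // [nGram] was deprecated in 6.4 and cannot be used in new indices
                .put("index.analysis.filter.my_ngram.type", "nGram")
                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
                .putList("index.analysis.analyzer.my_analyzer.filter", "my_ngram")
                .build();
            try {
                client.indices().create(
                    new CreateIndexRequest("test-index").settings(settings), RequestOptions.DEFAULT);
            } catch (ElasticsearchStatusException e) {
                // With this commit the failure happens here, at creation time,
                // rather than when the first document is indexed.
                System.out.println(e.getMessage());
            }
        }
    }
}

For components that only warn rather than throw (for example, on an index created with an older version), the same mechanism surfaces the deprecation warning when the index is created or opened, as a Warning response header and in the deprecation log, rather than at first use.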