Skip to content

Commit

Permalink
Check for deprecations when analyzers are built (#50908)
Browse files Browse the repository at this point in the history
Generally speaking, deprecated analysis components in elasticsearch will issue deprecation
warnings when they are first used. However, this means that no warnings are emitted when
indexes are created with deprecated components, and users have to actually index a document
to see warnings. This makes it much harder to see these warnings and act on them at
appropriate times.

This is worse in the case where components throw exceptions on upgrade. In this case, users
will not be aware of a problem until a document is indexed, instead of at index creation time.

This commit adds a new check that pushes an empty string through all user-defined analyzers
and normalizers when an IndexAnalyzers object is built for each index; deprecation warnings
and exceptions are now emitted when indexes are created or opened.

Fixes #42349
  • Loading branch information
romseygeek committed Jan 14, 2020
1 parent 9c6ffdc commit 8c16725
Show file tree
Hide file tree
Showing 6 changed files with 245 additions and 64 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,18 @@
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;
import java.io.StringReader;
import java.util.Map;

public class CommonAnalysisPluginTests extends ESTestCase {

Expand All @@ -45,42 +40,37 @@ public class CommonAnalysisPluginTests extends ESTestCase {
*/
public void testNGramDeprecationWarning() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED,
.put(IndexMetaData.SETTING_VERSION_CREATED,
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
.build();
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.custom_analyzer.filter", "nGram")
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
assertNotNull(tokenFilterFactory.create(tokenizer));
assertWarnings(
"The [nGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [ngram] instead.");
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
}

assertWarnings("The [nGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [ngram] instead.");
}

/**
* Check that the deprecated name "nGram" throws an error since 7.0.0
*/
public void testNGramDeprecationError() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
.build();
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.custom_analyzer.filter", "nGram")
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> tokenFilterFactory.create(tokenizer));
assertEquals(
"The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. Please change the filter"
+ " name to [ngram] instead.",
ex.getMessage());
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
assertEquals("The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
+ "Please change the filter name to [ngram] instead.", e.getMessage());
}
}

Expand All @@ -89,42 +79,36 @@ public void testNGramDeprecationError() throws IOException {
*/
public void testEdgeNGramDeprecationWarning() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED,
VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
.build();
.put(IndexMetaData.SETTING_VERSION_CREATED,
VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.custom_analyzer.filter", "edgeNGram")
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
assertNotNull(tokenFilterFactory.create(tokenizer));
assertWarnings(
"The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [edge_ngram] instead.");
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
}
assertWarnings("The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [edge_ngram] instead.");
}

/**
* Check that the deprecated name "edgeNGram" throws an error for indices created since 7.0.0
*/
public void testEdgeNGramDeprecationError() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
.build();
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, null))
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.custom_analyzer.filter", "edgeNGram")
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> tokenFilterFactory.create(tokenizer));
assertEquals(
"The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. Please change the filter"
+ " name to [edge_ngram] instead.",
ex.getMessage());
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
() -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin));
assertEquals("The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. "
+ "Please change the filter name to [edge_ngram] instead.", ex.getMessage());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.ElasticsearchException;
Expand Down Expand Up @@ -542,6 +543,10 @@ public IndexAnalyzers build(IndexSettings indexSettings,
tokenFilterFactoryFactories, charFilterFactoryFactories);
}

for (Analyzer analyzer : normalizers.values()) {
analyzer.normalize("", ""); // check for deprecations
}

if (!analyzers.containsKey(DEFAULT_ANALYZER_NAME)) {
analyzers.put(DEFAULT_ANALYZER_NAME,
produceAnalyzer(DEFAULT_ANALYZER_NAME,
Expand Down Expand Up @@ -605,6 +610,7 @@ private static NamedAnalyzer produceAnalyzer(String name,
} else {
analyzer = new NamedAnalyzer(name, analyzerFactory.scope(), analyzerF, overridePositionIncrementGap);
}
checkVersions(analyzer);
return analyzer;
}

Expand Down Expand Up @@ -632,4 +638,20 @@ private void processNormalizerFactory(
NamedAnalyzer normalizer = new NamedAnalyzer(name, normalizerFactory.scope(), normalizerF);
normalizers.put(name, normalizer);
}

// Some analysis components emit deprecation warnings or throw exceptions when used
// with the wrong version of elasticsearch. These exceptions and warnings are
// normally thrown when tokenstreams are constructed, which unless we build a
// tokenstream up-front does not happen until a document is indexed. In order to
// surface these warnings or exceptions as early as possible, we build an empty
// tokenstream and pull it through an Analyzer at construction time.
private static void checkVersions(Analyzer analyzer) {
try (TokenStream ts = analyzer.tokenStream("", "")) {
ts.reset();
while (ts.incrementToken()) {}
ts.end();
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
Expand Down Expand Up @@ -109,6 +110,25 @@ public TokenStream create(TokenStream tokenStream) {
}
}

class DeprecatedTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {

DeprecatedTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
}

@Override
public TokenStream create(TokenStream tokenStream) {
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
return tokenStream;
}

@Override
public TokenStream normalize(TokenStream tokenStream) {
deprecationLogger.deprecated("Using deprecated token filter [deprecated]");
return tokenStream;
}
}

class AppendCharFilterFactory extends AbstractCharFilterFactory {

final String suffix;
Expand Down Expand Up @@ -137,7 +157,10 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {

@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return singletonMap("mock", MockFactory::new);
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
filters.put("mock", MockFactory::new);
filters.put("deprecated", DeprecatedTokenFilterFactory::new);
return filters;
}

@Override
Expand Down Expand Up @@ -507,4 +530,28 @@ public void testExceedSetMaxTokenLimit() {
assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
+ idxMaxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
}

public void testDeprecationWarnings() throws IOException {
AnalyzeAction.Request req = new AnalyzeAction.Request();
req.tokenizer("standard");
req.addTokenFilter("lowercase");
req.addTokenFilter("deprecated");
req.text("test text");

AnalyzeAction.Response analyze =
TransportAnalyzeAction.analyze(req, registry, mockIndexService(), maxTokenCount);
assertEquals(2, analyze.getTokens().size());
assertWarnings("Using deprecated token filter [deprecated]");

// normalizer
req = new AnalyzeAction.Request();
req.addTokenFilter("lowercase");
req.addTokenFilter("deprecated");
req.text("text");

analyze =
TransportAnalyzeAction.analyze(req, registry, mockIndexService(), maxTokenCount);
assertEquals(1, analyze.getTokens().size());
assertWarnings("Using deprecated token filter [deprecated]");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package org.elasticsearch.index;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.index.AssertingDirectoryReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
Expand Down Expand Up @@ -442,7 +443,7 @@ public Analyzer get() {
final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
throw new AssertionError("should not be here");
return new TokenStreamComponents(new StandardTokenizer());
}

@Override
Expand Down
Loading

0 comments on commit 8c16725

Please sign in to comment.