Make PreConfiguredTokenFilter harder to misuse (#24572)
There are now three public static methods to build instances of
PreConfiguredTokenFilter and the ctor is private. I chose static
methods instead of constructors because those allow us to change
out the implementation returned if we so desire.

Relates to #23658
nik9000 authored May 11, 2017
1 parent d447b79 commit 65f2717
Showing 10 changed files with 142 additions and 71 deletions.
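
To orient the diff below: the public constructors are gone, and callers now choose one of three static factory methods, each of which pins the caching strategy that matches its create-function signature. A minimal sketch of how call sites change, based on the registrations in this diff (the wrapper class name and the example_* filter names are invented for illustration; imports assume Elasticsearch and Lucene of this vintage on the classpath):

```java
import org.apache.lucene.analysis.LowerCaseFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;

class PreConfiguredTokenFilterExamples {
    // Before: callers picked the CachingStrategy by hand, so nothing stopped a
    // version-insensitive filter from being paired with a per-version cache:
    //   new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new)

    // After: each factory method bakes in the matching strategy.
    static final PreConfiguredTokenFilter SAME_FOR_ALL_VERSIONS =
            PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new);
    static final PreConfiguredTokenFilter VARIES_BY_LUCENE_VERSION =
            PreConfiguredTokenFilter.luceneVersion("example_lucene", false,
                    (tokenStream, luceneVersion) -> new LowerCaseFilter(tokenStream));
    static final PreConfiguredTokenFilter VARIES_BY_ES_VERSION =
            PreConfiguredTokenFilter.elasticsearchVersion("example_es", false,
                    (tokenStream, esVersion) -> new LowerCaseFilter(tokenStream));
}
```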
@@ -27,6 +27,7 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

import java.io.IOException;
import java.util.function.BiFunction;
@@ -36,31 +37,46 @@
* Provides pre-configured, shared {@link TokenFilter}s.
*/
public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
+/**
+* Create a pre-configured token filter that may not vary at all.
+*/
+public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
+Function<TokenStream, TokenStream> create) {
+return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
+(tokenStream, version) -> create.apply(tokenStream));
+}
+
+/**
+* Create a pre-configured token filter that may vary based on the Lucene version.
+*/
+public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
+BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
+return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.LUCENE,
+(tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
+}
+
+/**
+* Create a pre-configured token filter that may vary based on the Elasticsearch version.
+*/
+public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
+BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
+return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH,
+(tokenStream, version) -> create.apply(tokenStream, version));
+}

private final String name;
private final boolean useFilterForMultitermQueries;
private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
private final BiFunction<TokenStream, Version, TokenStream> create;

-/**
-* Standard ctor with all the power.
-*/
-public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
-PreBuiltCacheFactory.CachingStrategy cachingStrategy, BiFunction<TokenStream, Version, TokenStream> create) {
+private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
+PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
this.name = name;
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
-cache = PreBuiltCacheFactory.getCache(cachingStrategy);
+this.cache = PreBuiltCacheFactory.getCache(cache);
this.create = create;
}

-/**
-* Convenience ctor for token streams that don't vary based on version.
-*/
-public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
-PreBuiltCacheFactory.CachingStrategy cachingStrategy, Function<TokenStream, TokenStream> create) {
-this(name, useFilterForMultitermQueries, cachingStrategy, (input, version) -> create.apply(input));
-// TODO why oh why aren't these all CachingStrategy.ONE? They *can't* vary based on version because they don't get it, right?!
-}

@Override
public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
return getTokenFilterFactory(Version.indexCreated(settings));
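
Plugins consume the same factory methods: AnalysisPlugin.getPreConfiguredTokenFilters() returns these instances, as the mock plugins later in this diff show. A minimal sketch of a plugin registering one pre-configured filter, modeled on those fixtures (MyAnalysisPlugin and my_lowercase are hypothetical names):

```java
import static java.util.Collections.singletonList;

import java.util.List;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        // true: also apply this filter when normalizing multi-term queries.
        return singletonList(
                PreConfiguredTokenFilter.singleton("my_lowercase", true, LowerCaseFilter::new));
    }
}
```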
@@ -272,10 +272,8 @@ static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List
NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");

// Add filters available in lucene-core
preConfiguredTokenFilters.register("lowercase",
new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new));
preConfiguredTokenFilters.register("standard",
new PreConfiguredTokenFilter("standard", false, CachingStrategy.LUCENE, StandardFilter::new));
preConfiguredTokenFilters.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));
preConfiguredTokenFilters.register("standard", PreConfiguredTokenFilter.singleton("standard", false, StandardFilter::new));
/* Note that "stop" is available in lucene-core but it's pre-built
* version uses a set of English stop words that are in
* lucene-analyzers-common so "stop" is defined in the analysis-common
@@ -288,9 +286,12 @@ static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List
// This has been migrated but has to stick around until PreBuiltTokenizers is removed.
continue;
default:
+if (CachingStrategy.ONE != preBuilt.getCachingStrategy()) {
+throw new UnsupportedOperationException("shim not available for " + preBuilt.getCachingStrategy());
+}
String name = preBuilt.name().toLowerCase(Locale.ROOT);
-preConfiguredTokenFilters.register(name,
-new PreConfiguredTokenFilter(name, preBuilt.isMultiTermAware(), preBuilt.getCachingStrategy(), preBuilt::create));
+preConfiguredTokenFilters.register(name, PreConfiguredTokenFilter.singleton(name, preBuilt.isMultiTermAware(),
+tokenStream -> preBuilt.create(tokenStream, Version.CURRENT)));
}
}

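
Why the new guard above is needed: the shim re-exposes each remaining PreBuiltTokenFilters entry through singleton(), which builds the filter once with Version.CURRENT baked in. That is only correct when the filter's output cannot differ across versions, which is exactly what CachingStrategy.ONE declared, so anything else must be migrated to luceneVersion()/elasticsearchVersion() rather than passed through the shim. One trip through the default branch, spelled out as a sketch (not code from the commit; ELISION is chosen because the diff shows it is a CachingStrategy.ONE entry, and this logic runs inside AnalysisModule, whose package makes the protected isMultiTermAware() accessible):

```java
PreBuiltTokenFilters preBuilt = PreBuiltTokenFilters.ELISION;
String name = preBuilt.name().toLowerCase(Locale.ROOT); // "elision"
PreConfiguredTokenFilter shim = PreConfiguredTokenFilter.singleton(
        name, preBuilt.isMultiTermAware(),
        // Binding Version.CURRENT is safe only because CachingStrategy.ONE
        // promises the output never varies by version; the guard enforces it.
        tokenStream -> preBuilt.create(tokenStream, Version.CURRENT));
```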
@@ -30,7 +30,6 @@
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
-import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
@@ -70,20 +69,6 @@ protected boolean isMultiTermAware() {
},

// Extended Token Filters
-SNOWBALL(CachingStrategy.ONE) {
-@Override
-public TokenStream create(TokenStream tokenStream, Version version) {
-return new SnowballFilter(tokenStream, "English");
-}
-},
-
-STEMMER(CachingStrategy.ONE) {
-@Override
-public TokenStream create(TokenStream tokenStream, Version version) {
-return new PorterStemFilter(tokenStream);
-}
-},
-
ELISION(CachingStrategy.ONE) {
@Override
public TokenStream create(TokenStream tokenStream, Version version) {
@@ -34,7 +34,6 @@
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
@@ -207,12 +206,11 @@ public void testBuiltInAnalyzersAreCached() throws IOException {

public void testPreConfiguredTokenFiltersAreCached() throws IOException {
AtomicBoolean built = new AtomicBoolean(false);
-PreConfiguredTokenFilter assertsBuiltOnce = new PreConfiguredTokenFilter("asserts_built_once", false,
-PreBuiltCacheFactory.CachingStrategy.ONE, (tokens, version) -> {
+PreConfiguredTokenFilter assertsBuiltOnce = PreConfiguredTokenFilter.singleton("asserts_built_once", false, tokenStream -> {
if (false == built.compareAndSet(false, true)) {
fail("Attempted to build the token filter twice when it should have been cached");
}
-return new MockTokenFilter(tokens, MockTokenFilter.EMPTY_STOPSET);
+return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
});
try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(),
emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) {
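
The test above pins down what singleton() promises: one build per node, shared by every index regardless of the version that created it. As a plain-Java analogy for that one-entry cache (explicitly not the PreBuiltCacheFactory implementation, just the contract the test asserts):

```java
import java.util.function.Supplier;

// Analogy for CachingStrategy.ONE: a single cache slot, so the build function
// runs once and every later lookup, for any version, sees the same instance.
final class OneEntryCache<T> {
    private T cached;

    T getOrBuild(Supplier<T> build) {
        if (cached == null) {
            cached = build.get();
        }
        return cached;
    }
}
```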
@@ -24,7 +24,6 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
@@ -113,7 +112,7 @@ public void testIllegalCharFilters() throws IOException {
private static class MockAnalysisPlugin implements AnalysisPlugin {
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
return singletonList(new PreConfiguredTokenFilter("mock_forbidden", false, CachingStrategy.ONE, MockLowerCaseFilter::new));
return singletonList(PreConfiguredTokenFilter.singleton("mock_forbidden", false, MockLowerCaseFilter::new));
}

@Override
@@ -32,7 +32,6 @@
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.mapper.MapperService.MergeReason;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
@@ -55,7 +54,7 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase {
public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin {
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
return singletonList(new PreConfiguredTokenFilter("mock_other_lowercase", true, CachingStrategy.ONE, MockLowerCaseFilter::new));
return singletonList(PreConfiguredTokenFilter.singleton("mock_other_lowercase", true, MockLowerCaseFilter::new));
}
};

@@ -20,6 +20,7 @@
package org.elasticsearch.indices.analysis;

import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
@@ -28,6 +29,7 @@
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.elasticsearch.Version;
@@ -43,6 +45,7 @@
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -61,17 +64,23 @@
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
+import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.either;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.is;

public class AnalysisModuleTests extends ESTestCase {
+private final Settings emptyNodeSettings = Settings.builder()
+.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+.build();

public IndexAnalyzers getIndexAnalyzers(Settings settings) throws IOException {
return getIndexAnalyzers(getNewRegistry(settings), settings);
@@ -264,6 +273,71 @@ public void testUnderscoreInAnalyzerName() throws IOException {
}
}

+/**
+* Tests that plugins can register pre-configured token filters that vary in behavior based on Elasticsearch version, Lucene version,
+* and that do not vary based on version at all.
+*/
+public void testPluginPreConfiguredTokenFilters() throws IOException {
+// Simple token filter that appends text to the term
+final class AppendTokenFilter extends TokenFilter {
+private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+private final char[] appendMe;
+
+protected AppendTokenFilter(TokenStream input, String appendMe) {
+super(input);
+this.appendMe = appendMe.toCharArray();
+}
+
+@Override
+public boolean incrementToken() throws IOException {
+if (false == input.incrementToken()) {
+return false;
+}
+term.resizeBuffer(term.length() + appendMe.length);
+System.arraycopy(appendMe, 0, term.buffer(), term.length(), appendMe.length);
+term.setLength(term.length() + appendMe.length);
+return true;
+}
+}
+boolean noVersionSupportsMultiTerm = randomBoolean();
+boolean luceneVersionSupportsMultiTerm = randomBoolean();
+boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
+AnalysisRegistry registry = new AnalysisModule(new Environment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
+@Override
+public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
+return Arrays.asList(
+PreConfiguredTokenFilter.singleton("no_version", noVersionSupportsMultiTerm,
+tokenStream -> new AppendTokenFilter(tokenStream, "no_version")),
+PreConfiguredTokenFilter.luceneVersion("lucene_version", luceneVersionSupportsMultiTerm,
+(tokenStream, luceneVersion) -> new AppendTokenFilter(tokenStream, luceneVersion.toString())),
+PreConfiguredTokenFilter.elasticsearchVersion("elasticsearch_version", elasticsearchVersionSupportsMultiTerm,
+(tokenStream, esVersion) -> new AppendTokenFilter(tokenStream, esVersion.toString()))
+);
+}
+})).getAnalysisRegistry();
+
+Version version = VersionUtils.randomVersion(random());
+IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
+.put("index.analysis.analyzer.no_version.tokenizer", "keyword")
+.put("index.analysis.analyzer.no_version.filter", "no_version")
+.put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
+.put("index.analysis.analyzer.lucene_version.filter", "lucene_version")
+.put("index.analysis.analyzer.elasticsearch_version.tokenizer", "keyword")
+.put("index.analysis.analyzer.elasticsearch_version.filter", "elasticsearch_version")
+.put(IndexMetaData.SETTING_VERSION_CREATED, version)
+.build());
+assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] {"testno_version"});
+assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] {"test" + version.luceneVersion});
+assertTokenStreamContents(analyzers.get("elasticsearch_version").tokenStream("", "test"), new String[] {"test" + version});
+
+assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
+analyzers.get("no_version").normalize("", "test").utf8ToString());
+assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
+analyzers.get("lucene_version").normalize("", "test").utf8ToString());
+assertEquals("test" + (elasticsearchVersionSupportsMultiTerm ? version.toString() : ""),
+analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
+}

public void testRegisterHunspellDictionary() throws Exception {
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
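
One detail worth pulling out of the new test: the boolean each factory method takes (useFilterForMultitermQueries) decides whether the filter also runs in the analyzer's normalize() path, which is why the expected normalized strings above are conditional on the randomized flags. For the no_version filter, the assertion reduces to this (same names as in the test):

```java
// normalize() applies the filter only when it was registered as multi-term
// aware, so the appended suffix is conditional on the flag.
String expected = noVersionSupportsMultiTerm ? "testno_version" : "test";
assertEquals(expected, analyzers.get("no_version").normalize("", "test").utf8ToString());
```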