Move more token filters to analysis-common module
The following token filters were moved: stemmer, stemmer_override, kstem, dictionary_decompounder, hyphenation_decompounder, reverse, elision and truncate.

Relates to elastic#23658
martijnvg committed Jun 26, 2017
1 parent 1583f81 commit a34f5fa
Showing 26 changed files with 475 additions and 155 deletions.
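To make the shape of the change concrete before the diff: a minimal, hypothetical sketch of an AnalysisPlugin exposing some of the moved filters. The real registrations are the CommonAnalysisPlugin additions shown further down; the factory classes are assumed to be importable from the module's package, and the plugin class name here is invented.

```java
import java.util.Map;
import java.util.TreeMap;

import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

// Hypothetical plugin class: the real one is CommonAnalysisPlugin (diff below).
public class MovedTokenFiltersPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
        // Registrations copied from this commit; these four take no custom settings.
        filters.put("kstem", KStemTokenFilterFactory::new);
        filters.put("reverse", ReverseTokenFilterFactory::new);
        filters.put("elision", ElisionTokenFilterFactory::new);
        filters.put("stemmer", StemmerTokenFilterFactory::new);
        return filters;
    }
}
```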
buildSrc/src/main/resources/checkstyle_suppressions.xml (1 change: 0 additions & 1 deletion)

@@ -267,7 +267,6 @@
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CustomAnalyzerProvider.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ShingleTokenFilterFactory.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StemmerOverrideTokenFilterFactory.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]HyphenationCompoundWordTokenFilterFactory.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]cache[/\\]bitset[/\\]BitsetFilterCache.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]codec[/\\]PerFieldMappingPostingFormatCodec.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]engine[/\\]ElasticsearchConcurrentMergeScheduler.java" checks="LineLength" />

@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis.compound;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
@@ -38,7 +38,7 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
     protected final boolean onlyLongestMatch;
     protected final CharArraySet wordList;

-    public AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);

         minWordSize = settings.getAsInt("min_word_size", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
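Narrowing the base-class constructor from public to protected (and, further down, the concrete factory constructors to package-private) is consistent with the move: presumably only code inside the org.elasticsearch.analysis.common package needs to instantiate these factories now.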

@@ -55,7 +55,6 @@
 import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
 import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
-import org.elasticsearch.index.analysis.ElisionTokenFilterFactory;
 import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
 import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
 import org.elasticsearch.index.analysis.FingerprintTokenFilterFactory;
@@ -75,7 +74,6 @@
 import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider;
 import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
 import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
-import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
 import org.elasticsearch.index.analysis.KeepWordFilterFactory;
 import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
@@ -99,7 +97,6 @@
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
-import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
 import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
 import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
 import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory;
@@ -116,22 +113,17 @@
 import org.elasticsearch.index.analysis.StandardHtmlStripAnalyzerProvider;
 import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
-import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory;
-import org.elasticsearch.index.analysis.StemmerTokenFilterFactory;
 import org.elasticsearch.index.analysis.StopAnalyzerProvider;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.SwedishAnalyzerProvider;
 import org.elasticsearch.index.analysis.ThaiAnalyzerProvider;
 import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
-import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
 import org.elasticsearch.index.analysis.TurkishAnalyzerProvider;
 import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
 import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
-import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
-import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;

 import java.io.IOException;
@@ -201,31 +193,23 @@ private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(Li
                 hunspellService) {
         NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
         tokenFilters.register("stop", StopTokenFilterFactory::new);
-        tokenFilters.register("reverse", ReverseTokenFilterFactory::new);
-        tokenFilters.register("kstem", KStemTokenFilterFactory::new);
         tokenFilters.register("standard", StandardTokenFilterFactory::new);
         tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
         tokenFilters.register("min_hash", MinHashTokenFilterFactory::new);
-        tokenFilters.register("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
         tokenFilters.register("limit", LimitTokenCountFilterFactory::new);
         tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
-        tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
         tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
-        tokenFilters.register("elision", ElisionTokenFilterFactory::new);
         tokenFilters.register("keep", requriesAnalysisSettings(KeepWordFilterFactory::new));
         tokenFilters.register("keep_types", requriesAnalysisSettings(KeepTypesFilterFactory::new));
         tokenFilters.register("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
         tokenFilters.register("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new));
-        tokenFilters.register("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
-        tokenFilters.register("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
         tokenFilters.register("arabic_stem", ArabicStemTokenFilterFactory::new);
         tokenFilters.register("brazilian_stem", BrazilianStemTokenFilterFactory::new);
         tokenFilters.register("czech_stem", CzechStemTokenFilterFactory::new);
         tokenFilters.register("dutch_stem", DutchStemTokenFilterFactory::new);
         tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
         tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
         tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
-        tokenFilters.register("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
         tokenFilters.register("arabic_normalization", ArabicNormalizationFilterFactory::new);
         tokenFilters.register("german_normalization", GermanNormalizationFilterFactory::new);
         tokenFilters.register("hindi_normalization", HindiNormalizationFilterFactory::new);
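A side effect worth noting (not part of the diff itself): core no longer registers these filters, so integration tests outside the module that still use them must load analysis-common explicitly. A hypothetical sketch of that setup, using the standard ESIntegTestCase hook; the test class name is invented:

```java
import java.util.Collection;
import java.util.Collections;

import org.elasticsearch.analysis.common.CommonAnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;

// Hypothetical test class: loads the analysis-common module so the moved
// filters (stemmer, reverse, truncate, ...) resolve at index-creation time.
public class MovedFiltersIT extends ESIntegTestCase {
    @Override
    protected Collection<Class<? extends Plugin>> nodePlugins() {
        return Collections.singleton(CommonAnalysisPlugin.class);
    }
}
```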

@@ -47,7 +47,7 @@
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory;
+import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
@@ -196,18 +196,6 @@ private void testSimpleConfiguration(Settings settings) throws IOException {
 //        assertThat(czechstemmeranalyzer.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
 //        assertThat(czechstemmeranalyzer.tokenFilters().length, equalTo(4));
 //        assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class));
-//
-//        // check dictionary decompounder
-//        analyzer = analysisService.analyzer("decompoundingAnalyzer").analyzer();
-//        assertThat(analyzer, instanceOf(CustomAnalyzer.class));
-//        CustomAnalyzer dictionaryDecompounderAnalyze = (CustomAnalyzer) analyzer;
-//        assertThat(dictionaryDecompounderAnalyze.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
-//        assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
-//        assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
-
-        Set<?> wordList = Analysis.getWordSet(null, Version.CURRENT, settings, "index.analysis.filter.dict_dec.word_list");
-        MatcherAssert.assertThat(wordList.size(), equalTo(6));
-//        MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
     }

     public void testWordListPath() throws Exception {

@@ -93,16 +93,16 @@ public void testAnalyzeWithNoIndex() throws Exception {
         assertThat(analyzeResponse.getTokens().size(), equalTo(1));
         assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test"));

-        analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").addTokenFilter("lowercase").addTokenFilter("reverse").get();
+        analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").addTokenFilter("lowercase").get();
         assertThat(analyzeResponse.getTokens().size(), equalTo(4));
         AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(0);
-        assertThat(token.getTerm(), equalTo("siht"));
+        assertThat(token.getTerm(), equalTo("this"));
         token = analyzeResponse.getTokens().get(1);
-        assertThat(token.getTerm(), equalTo("si"));
+        assertThat(token.getTerm(), equalTo("is"));
         token = analyzeResponse.getTokens().get(2);
         assertThat(token.getTerm(), equalTo("a"));
         token = analyzeResponse.getTokens().get(3);
-        assertThat(token.getTerm(), equalTo("tset"));
+        assertThat(token.getTerm(), equalTo("test"));

         analyzeResponse = client().admin().indices().prepareAnalyze("of course").setTokenizer("standard").addTokenFilter("stop").get();
         assertThat(analyzeResponse.getTokens().size(), equalTo(1));
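The deleted assertions document the reverse filter's behavior rather than a bug, so an equivalent check presumably moves into the module's own test suite (the destination isn't shown in this excerpt). A sketch of that check, assuming an integration-test context where client() is available:

```java
// Sketch: with analysis-common loaded, "reverse" still flips each token
// after lowercasing, exactly as the removed assertions expected.
AnalyzeResponse response = client().admin().indices()
        .prepareAnalyze("THIS IS A TEST")
        .setTokenizer("standard")
        .addTokenFilter("lowercase")
        .addTokenFilter("reverse")
        .get();
assertThat(response.getTokens().get(0).getTerm(), equalTo("siht")); // "this" reversed
assertThat(response.getTokens().get(1).getTerm(), equalTo("si"));   // "is" reversed
assertThat(response.getTokens().get(3).getTerm(), equalTo("tset")); // "test" reversed
```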

@@ -445,8 +445,6 @@ public void testStopwordsOnlyPhraseSuggest() throws IOException {
     public void testPrefixLength() throws IOException {
         CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
                 .put(SETTING_NUMBER_OF_SHARDS, 1)
-                .put("index.analysis.analyzer.reverse.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
                 .put("index.analysis.analyzer.body.tokenizer", "standard")
                 .putArray("index.analysis.analyzer.body.filter", "lowercase")
                 .put("index.analysis.analyzer.bigram.tokenizer", "standard")
@@ -458,7 +456,6 @@ public void testPrefixLength() throws IOException {
         XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
                 .startObject("properties")
                 .startObject("body").field("type", "text").field("analyzer", "body").endObject()
-                .startObject("body_reverse").field("type", "text").field("analyzer", "reverse").endObject()
                 .startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject()
                 .endObject()
                 .endObject().endObject();
@@ -486,8 +483,6 @@
     public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
         CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
                 .put(indexSettings())
-                .put("index.analysis.analyzer.reverse.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
                 .put("index.analysis.analyzer.body.tokenizer", "standard")
                 .putArray("index.analysis.analyzer.body.filter", "lowercase")
                 .put("index.analysis.analyzer.bigram.tokenizer", "standard")
@@ -503,10 +498,6 @@ public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
                     field("type", "text").
                     field("analyzer", "body")
                 .endObject()
-                .startObject("body_reverse").
-                    field("type", "text").
-                    field("analyzer", "reverse")
-                .endObject()
                 .startObject("bigram").
                     field("type", "text").
                     field("analyzer", "bigram")
@@ -536,7 +527,7 @@ public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
                 "Police sergeant who stops the film",
         };
         for (String line : strings) {
-            index("test", "type1", line, "body", line, "body_reverse", line, "bigram", line);
+            index("test", "type1", line, "body", line, "bigram", line);
         }
         refresh();

@@ -576,14 +567,6 @@ public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
         searchSuggest = searchSuggest( "Arthur, King of the Britons", "simple_phrase", phraseSuggest);
         assertSuggestion(searchSuggest, 0, "simple_phrase", "arthur king of the britons");

-        //test reverse suggestions with pre & post filter
-        phraseSuggest
-            .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"))
-            .addCandidateGenerator(candidateGenerator("body_reverse").minWordLength(1).suggestMode("always").preFilter("reverse")
-                .postFilter("reverse"));
-        searchSuggest = searchSuggest( "Artur, Ging of the Britons", "simple_phrase", phraseSuggest);
-        assertSuggestion(searchSuggest, 0, "simple_phrase", "arthur king of the britons");
-
         // set all mass to trigrams (not indexed)
         phraseSuggest.clearCandidateGenerators()
             .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"))
@@ -633,8 +616,6 @@ public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
     public void testSizeParam() throws IOException {
         CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
                 .put(SETTING_NUMBER_OF_SHARDS, 1)
-                .put("index.analysis.analyzer.reverse.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
                 .put("index.analysis.analyzer.body.tokenizer", "standard")
                 .putArray("index.analysis.analyzer.body.filter", "lowercase")
                 .put("index.analysis.analyzer.bigram.tokenizer", "standard")
@@ -652,10 +633,6 @@ public void testSizeParam() throws IOException {
                     .field("type", "text")
                     .field("analyzer", "body")
                 .endObject()
-                .startObject("body_reverse")
-                    .field("type", "text")
-                    .field("analyzer", "reverse")
-                .endObject()
                 .startObject("bigram")
                     .field("type", "text")
                     .field("analyzer", "bigram")
@@ -667,9 +644,9 @@
         ensureGreen();

         String line = "xorr the god jewel";
-        index("test", "type1", "1", "body", line, "body_reverse", line, "bigram", line);
+        index("test", "type1", "1", "body", line, "bigram", line);
         line = "I got it this time";
-        index("test", "type1", "2", "body", line, "body_reverse", line, "bigram", line);
+        index("test", "type1", "2", "body", line, "bigram", line);
         refresh();

         PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram")
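The deleted suggester code is the classic reverse pre/post-filter trick: body_reverse indexes reversed tokens, so a typo in a word's first characters becomes a cheap suffix edit; the candidate generator reverses the input term (preFilter), looks up candidates against the reversed field, and reverses them back (postFilter). Reconstructed from the deleted lines for reference — such a test would now need analysis-common available:

```java
// Reconstructed from the removed test: correct "Artur, Ging" -> "arthur king"
// by generating candidates from both the normal and the reversed field.
phraseSuggest
    .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"))
    .addCandidateGenerator(candidateGenerator("body_reverse").minWordLength(1).suggestMode("always")
        .preFilter("reverse")     // reverse the input term before lookup
        .postFilter("reverse"));  // reverse candidates back to normal order
```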

@@ -17,10 +17,6 @@
             },
             "my":{
                 "type":"myfilter"
-            },
-            "dict_dec":{
-                "type":"dictionary_decompounder",
-                "word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
             }
         },
         "analyzer":{
@@ -43,10 +39,6 @@
             "czechAnalyzerWithStemmer":{
                 "tokenizer":"standard",
                 "filter":["standard", "lowercase", "stop", "czech_stem"]
-            },
-            "decompoundingAnalyzer":{
-                "tokenizer":"standard",
-                "filter":["dict_dec"]
             }
         }
     }

@@ -12,9 +12,6 @@ index :
             stopwords : [stop2-1, stop2-2]
         my :
             type : myfilter
-        dict_dec :
-            type : dictionary_decompounder
-            word_list : [donau, dampf, schiff, spargel, creme, suppe]
     analyzer :
         standard :
             type : standard
@@ -34,6 +31,3 @@ index :
     czechAnalyzerWithStemmer :
         tokenizer : standard
         filter : [standard, lowercase, stop, czech_stem]
-    decompoundingAnalyzer :
-        tokenizer : standard
-        filter : [dict_dec]

@@ -107,6 +107,14 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         filters.put("ngram", NGramTokenFilterFactory::new);
         filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new);
         filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
+        filters.put("stemmer", StemmerTokenFilterFactory::new);
+        filters.put("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
+        filters.put("kstem", KStemTokenFilterFactory::new);
+        filters.put("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
+        filters.put("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
+        filters.put("reverse", ReverseTokenFilterFactory::new);
+        filters.put("elision", ElisionTokenFilterFactory::new);
+        filters.put("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
         return filters;
     }

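The requriesAnalysisSettings(...) wrapper (the misspelling is in the source) flags a provider as needing its own entry in the index analysis settings — filters such as truncate and dictionary_decompounder are unusable without configuration. The helper itself isn't shown in this excerpt; a plausible sketch, assuming AnalysisProvider's requiresAnalysisSettings() hook:

```java
// Hedged sketch: delegate creation unchanged, but report that this provider
// must be configured explicitly under index.analysis.filter.<name>.
private static <T> AnalysisProvider<T> requriesAnalysisSettings(AnalysisProvider<T> provider) {
    return new AnalysisProvider<T>() {
        @Override
        public T get(IndexSettings indexSettings, Environment environment,
                     String name, Settings settings) throws IOException {
            return provider.get(indexSettings, environment, name, settings);
        }

        @Override
        public boolean requiresAnalysisSettings() {
            return true;
        }
    };
}
```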

@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis.compound;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
@@ -33,7 +33,7 @@
  */
 public class DictionaryCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {

-    public DictionaryCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    DictionaryCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, env, name, settings);
     }
