From 02ee8a496449889d1fb01185d193a2465acb56c6 Mon Sep 17 00:00:00 2001 From: Sumanth Date: Tue, 28 Mar 2017 18:51:54 +0530 Subject: [PATCH] Remove whitespace tokens. --- .../lucene/analysis/ja/SprJapaneseAnalyzer.java | 14 ++++++-------- .../index/analysis/KuromojiAnalyzerProvider.java | 5 +++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/plugins/analysis-kuromoji/src/main/java/org/apache/lucene/analysis/ja/SprJapaneseAnalyzer.java b/plugins/analysis-kuromoji/src/main/java/org/apache/lucene/analysis/ja/SprJapaneseAnalyzer.java index 8c11277ff691d..bc0a63dd29240 100644 --- a/plugins/analysis-kuromoji/src/main/java/org/apache/lucene/analysis/ja/SprJapaneseAnalyzer.java +++ b/plugins/analysis-kuromoji/src/main/java/org/apache/lucene/analysis/ja/SprJapaneseAnalyzer.java @@ -6,6 +6,7 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.ja.dict.UserDictionary; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.index.analysis.KuromojiUserDictionarySyncUtil; import java.util.Set; @@ -18,6 +19,9 @@ public class SprJapaneseAnalyzer extends StopwordAnalyzerBase { private final Set stoptags; private volatile UserDictionary userDict; + // # Remove whitespace tokens (part of speech is defined in stoptags.txt - "symbol-space : 記号-空白") + private static final Set DEFAULT_STOP_TAGS = Sets.newHashSet("記号-空白"); + public SprJapaneseAnalyzer(UserDictionary userDict, JapaneseTokenizer.Mode mode, CharArraySet stopwords, Set stoptags, Settings settings) { super(stopwords); this.userDict = userDict; @@ -48,13 +52,7 @@ protected TokenStream normalize(String fieldName, TokenStream in) { return result1; } - @Override - public int hashCode() { - return System.identityHashCode(this); - } - - @Override - public boolean equals(Object obj) { - return this == obj; + public static Set getDefaultStopTags() { + return DEFAULT_STOP_TAGS; } } diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java index 788ed978759bb..7688fc8fb72a7 100644 --- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java +++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java @@ -24,10 +24,11 @@ import org.apache.lucene.analysis.ja.SprJapaneseAnalyzer; import org.apache.lucene.analysis.ja.dict.UserDictionary; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; -import java.util.Collections; +import java.util.Set; /** */ @@ -39,7 +40,7 @@ public KuromojiAnalyzerProvider(IndexSettings indexSettings, Environment env, St super(indexSettings, name, settings); final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings); final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings); - analyzer = new SprJapaneseAnalyzer(userDictionary, mode, CharArraySet.EMPTY_SET, Collections.emptySet(), env.settings()); + analyzer = new SprJapaneseAnalyzer(userDictionary, mode, CharArraySet.EMPTY_SET, SprJapaneseAnalyzer.getDefaultStopTags(), env.settings()); } @Override