Skip to content

Commit

Permalink
Remove whitespace tokens.
Browse files Browse the repository at this point in the history
  • Loading branch information
Sumanth committed Mar 28, 2017
1 parent 7b21079 commit 02ee8a4
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.index.analysis.KuromojiUserDictionarySyncUtil;

import java.util.Set;
Expand All @@ -18,6 +19,9 @@ public class SprJapaneseAnalyzer extends StopwordAnalyzerBase {
private final Set<String> stoptags;
private volatile UserDictionary userDict;

// Remove whitespace tokens by default: their part-of-speech tag is "記号-空白" (symbol-space), as defined in stoptags.txt.
private static final Set<String> DEFAULT_STOP_TAGS = Sets.newHashSet("記号-空白");

public SprJapaneseAnalyzer(UserDictionary userDict, JapaneseTokenizer.Mode mode, CharArraySet stopwords, Set<String> stoptags, Settings settings) {
super(stopwords);
this.userDict = userDict;
Expand Down Expand Up @@ -48,13 +52,7 @@ protected TokenStream normalize(String fieldName, TokenStream in) {
return result1;
}

@Override
public int hashCode() {
return System.identityHashCode(this);
}

@Override
public boolean equals(Object obj) {
return this == obj;
public static Set<String> getDefaultStopTags() {
return DEFAULT_STOP_TAGS;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
import org.apache.lucene.analysis.ja.SprJapaneseAnalyzer;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

import java.util.Collections;
import java.util.Set;

/**
*/
Expand All @@ -39,7 +40,7 @@ public KuromojiAnalyzerProvider(IndexSettings indexSettings, Environment env, St
super(indexSettings, name, settings);
final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings);
final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
analyzer = new SprJapaneseAnalyzer(userDictionary, mode, CharArraySet.EMPTY_SET, Collections.emptySet(), env.settings());
analyzer = new SprJapaneseAnalyzer(userDictionary, mode, CharArraySet.EMPTY_SET, SprJapaneseAnalyzer.getDefaultStopTags(), env.settings());
}

@Override
Expand Down

0 comments on commit 02ee8a4

Please sign in to comment.