Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove whitespace tokens. #8

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.index.analysis.KuromojiUserDictionarySyncUtil;

import java.util.Set;
Expand All @@ -18,6 +19,9 @@ public class SprJapaneseAnalyzer extends StopwordAnalyzerBase {
private final Set<String> stoptags;
private volatile UserDictionary userDict;

// Remove whitespace tokens: their part-of-speech tag is listed in stoptags.txt as "記号-空白" (symbol-space).
private static final Set<String> DEFAULT_STOP_TAGS = Sets.newHashSet("記号-空白");

public SprJapaneseAnalyzer(UserDictionary userDict, JapaneseTokenizer.Mode mode, CharArraySet stopwords, Set<String> stoptags, Settings settings) {
super(stopwords);
this.userDict = userDict;
Expand Down Expand Up @@ -48,13 +52,7 @@ protected TokenStream normalize(String fieldName, TokenStream in) {
return result1;
}

@Override
public int hashCode() {
return System.identityHashCode(this);
}

@Override
public boolean equals(Object obj) {
return this == obj;
/**
 * Returns the default set of part-of-speech stop tags for this analyzer.
 *
 * <p>The returned value is the shared {@code DEFAULT_STOP_TAGS} constant, which
 * is initialised with the single tag {@code "記号-空白"} (symbol-space) so that
 * whitespace tokens are filtered out.
 *
 * <p>NOTE(review): the same static set instance is handed to every caller
 * without a copy; if {@code Sets.newHashSet} yields a mutable set, callers
 * must treat the result as read-only — confirm, or wrap it in an
 * unmodifiable view.
 *
 * @return the default stop tags; never a fresh copy
 */
public static Set<String> getDefaultStopTags() {
return DEFAULT_STOP_TAGS;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
import org.apache.lucene.analysis.ja.SprJapaneseAnalyzer;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

import java.util.Collections;
import java.util.Set;

/**
*/
Expand All @@ -39,7 +40,7 @@ public KuromojiAnalyzerProvider(IndexSettings indexSettings, Environment env, St
super(indexSettings, name, settings);
final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings);
final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
analyzer = new SprJapaneseAnalyzer(userDictionary, mode, CharArraySet.EMPTY_SET, Collections.emptySet(), env.settings());
analyzer = new SprJapaneseAnalyzer(userDictionary, mode, CharArraySet.EMPTY_SET, SprJapaneseAnalyzer.getDefaultStopTags(), env.settings());
}

@Override
Expand Down