Skip to content

Commit

Permalink
Expose discard_compound_token option to kuromoji_tokenizer (#57421)
Browse files Browse the repository at this point in the history
This commit exposes the new Lucene option `discard_compound_token` to the Elasticsearch Kuromoji plugin.
  • Loading branch information
johtani authored and jimczi committed Jun 5, 2020
1 parent 0e57528 commit c75c8b6
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 5 deletions.
8 changes: 7 additions & 1 deletion docs/plugins/analysis-kuromoji.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ unknown words. It can be set to:

Extended mode outputs unigrams for unknown words. Example output:

関西, 国際, 空港
関西, 関西国際空港, 国際, 空港
ア, ブ, ラ, カ, ダ, ブ, ラ
--

Expand Down Expand Up @@ -208,6 +208,12 @@ The above `analyze` request returns the following:
}
--------------------------------------------------

`discard_compound_token`::
Whether original compound tokens should be discarded from the output in `search` and `extended` modes. Defaults to `false`.
Example output with `search` or `extended` mode and this option set to `true`:

関西, 国際, 空港


[[analysis-kuromoji-baseform]]
==== `kuromoji_baseform` token filter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,21 +41,24 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
private static final String NBEST_COST = "nbest_cost";
private static final String NBEST_EXAMPLES = "nbest_examples";
private static final String DISCARD_COMPOUND_TOKEN = "discard_compound_token";

private final UserDictionary userDictionary;
private final Mode mode;
private final String nBestExamples;
private final int nBestCost;

private boolean discartPunctuation;
private boolean discardPunctuation;
private boolean discardCompoundToken;

/**
 * Creates the factory and reads the tokenizer options from {@code settings}.
 *
 * <p>Options read here: the tokenization mode (via {@code getMode}), the user dictionary
 * (via {@code getUserDictionary}), {@code discard_punctuation} (default {@code true}),
 * {@code nbest_cost} (default {@code -1}), {@code nbest_examples} (no default) and
 * {@code discard_compound_token} (default {@code false}).
 *
 * @param indexSettings index-level settings, forwarded to the superclass
 * @param env           environment passed to {@code getUserDictionary}
 * @param name          tokenizer name, forwarded to the superclass
 * @param settings      tokenizer settings the options above are read from
 */
public KuromojiTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, settings, name);
mode = getMode(settings);
userDictionary = getUserDictionary(env, settings);
// NOTE(review): the next two lines look like the removed/added pair of this diff
// (field rename discartPunctuation -> discardPunctuation); confirm only the second remains.
discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
discardPunctuation = settings.getAsBoolean("discard_punctuation", true);
nBestCost = settings.getAsInt(NBEST_COST, -1);
nBestExamples = settings.get(NBEST_EXAMPLES);
// New option exposed by this change; compound tokens are kept unless explicitly enabled.
discardCompoundToken = settings.getAsBoolean(DISCARD_COMPOUND_TOKEN, false);
}

public static UserDictionary getUserDictionary(Environment env, Settings settings) {
Expand Down Expand Up @@ -108,7 +111,7 @@ public static JapaneseTokenizer.Mode getMode(Settings settings) {

@Override
public Tokenizer create() {
JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discardPunctuation, discardCompoundToken, mode);
int nBestCost = this.nBestCost;
if (nBestExamples != null) {
nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,17 @@ public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
}

/**
 * Checks that the {@code kuromoji_discard_compound_token} tokenizer from the test settings
 * splits "株式会社" into exactly "株式" and "会社".
 */
public void testDiscardCompoundToken() throws Exception {
    final String[] expectedTokens = {"株式", "会社"};
    final StringReader input = new StringReader("株式会社");

    // Look up the factory registered under the test settings and build a tokenizer from it.
    final Tokenizer discardingTokenizer =
        createTestAnalysis().tokenizer.get("kuromoji_discard_compound_token").create();
    discardingTokenizer.setReader(input);

    assertSimpleTSOutput(discardingTokenizer, expectedTokens);
}

private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
Path home = createTempDir();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@
"type": "kuromoji_tokenizer",
"nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/",
"nbest_cost" : "1000"
},
"kuromoji_discard_compound_token": {
"type": "kuromoji_tokenizer",
"discard_compound_token": true
}
},
"analyzer" : {
Expand All @@ -68,7 +72,7 @@
"tokenizer" : "kuromoji_tokenizer"
}
}

}
}
}

0 comments on commit c75c8b6

Please sign in to comment.