Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose Japanese completion filter to kuromoji analysis plugin #81858

Merged
merged 4 commits into from
Jan 31, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
extra.put("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
extra.put("ja_stop", JapaneseStopTokenFilterFactory::new);
extra.put("kuromoji_number", KuromojiNumberFilterFactory::new);
extra.put("kuromoji_completion", KuromojiCompletionFilterFactory::new);
return extra;
}

Expand All @@ -47,6 +48,9 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {

/**
 * Returns the analyzers contributed by this plugin, keyed by the analyzer
 * type name used in index settings.
 *
 * <p>Two analyzers are registered: {@code kuromoji}, backed by
 * {@link KuromojiAnalyzerProvider}, and {@code kuromoji_completion}, backed by
 * {@link KuromojiCompletionAnalyzerProvider}.
 *
 * @return a map from analyzer type name to the provider that builds it
 */
@Override
public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
    // A multi-entry map replaces the former single-entry singletonMap return,
    // which would otherwise make the registrations below unreachable.
    Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
    extra.put("kuromoji", KuromojiAnalyzerProvider::new);
    extra.put("kuromoji_completion", KuromojiCompletionAnalyzerProvider::new);
    return extra;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.plugin.analysis.kuromoji;

import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;

/**
 * Index analyzer provider that builds a single {@link JapaneseCompletionAnalyzer}
 * at construction time and hands out that same instance from {@link #get()}.
 */
public class KuromojiCompletionAnalyzerProvider extends AbstractIndexAnalyzerProvider<JapaneseCompletionAnalyzer> {

    /** The analyzer created once in the constructor and returned by every {@link #get()} call. */
    private final JapaneseCompletionAnalyzer completionAnalyzer;

    /**
     * Creates the provider and eagerly constructs the underlying analyzer.
     *
     * @param indexSettings settings of the index this analyzer belongs to
     * @param env           environment passed to {@code KuromojiTokenizerFactory.getUserDictionary}
     * @param name          name of the analyzer
     * @param settings      analyzer settings; read for the user dictionary and for the
     *                      completion mode via {@code KuromojiCompletionFilterFactory.getMode}
     */
    public KuromojiCompletionAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        // Resolve the dictionary first, then the mode, before building the analyzer.
        final UserDictionary dictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
        final Mode completionMode = KuromojiCompletionFilterFactory.getMode(settings);
        this.completionAnalyzer = new JapaneseCompletionAnalyzer(dictionary, completionMode);
    }

    /**
     * @return the analyzer instance built in the constructor
     */
    @Override
    public JapaneseCompletionAnalyzer get() {
        return this.completionAnalyzer;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.plugin.analysis.kuromoji;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

/**
 * Token filter factory that wraps token streams in a {@link JapaneseCompletionFilter}
 * using the mode read from the filter's {@code mode} setting.
 */
public class KuromojiCompletionFilterFactory extends AbstractTokenFilterFactory {

    /** Completion mode resolved once from the settings at construction time. */
    private final Mode mode;

    /**
     * Creates the factory and resolves the completion mode from {@code settings}.
     *
     * @param indexSettings settings of the index this filter belongs to
     * @param env           environment (not used by this factory)
     * @param name          name of the token filter
     * @param settings      filter settings; only the {@code mode} key is read here
     */
    public KuromojiCompletionFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.mode = getMode(settings);
    }

    /**
     * Reads the {@code mode} setting and maps it to a completion filter mode.
     *
     * <p>The comparison is case-insensitive. {@code "query"} selects
     * {@link Mode#QUERY}; {@code "index"}, a missing setting, and any other
     * value all resolve to {@link Mode#INDEX}.
     *
     * @param settings settings to read the {@code mode} key from
     * @return the resolved mode, never {@code null}
     */
    public static JapaneseCompletionFilter.Mode getMode(Settings settings) {
        final String configured = settings.get("mode", null);
        // equalsIgnoreCase(null) is false, so an absent setting falls through to INDEX.
        if ("query".equalsIgnoreCase(configured)) {
            return Mode.QUERY;
        }
        return Mode.INDEX;
    }

    /**
     * Wraps the given stream in a {@link JapaneseCompletionFilter} using this factory's mode.
     *
     * @param tokenStream the stream to filter
     * @return the filtered stream
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new JapaneseCompletionFilter(tokenStream, this.mode);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.Version;
Expand Down Expand Up @@ -67,10 +68,16 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
filterFactory = analysis.tokenFilter.get("kuromoji_number");
assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));

filterFactory = analysis.tokenFilter.get("kuromoji_completion");
assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));

IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
NamedAnalyzer analyzer = indexAnalyzers.get("kuromoji");
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));

analyzer = indexAnalyzers.get("kuromoji_completion");
assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));

analyzer = indexAnalyzers.get("my_analyzer");
assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));
Expand Down Expand Up @@ -225,6 +232,42 @@ public void testJapaneseStopFilterFactory() throws IOException {
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}

/**
 * Checks that the {@code kuromoji_completion_index} and {@code kuromoji_completion_query}
 * filters from the test settings are {@link KuromojiCompletionFilterFactory} instances and
 * produce the expected tokens on top of a search-mode {@link JapaneseTokenizer}.
 */
public void testCompletionFilterFactory() throws IOException {
    final TestAnalysis analysis = createTestAnalysis();

    // mode=INDEX
    final TokenFilterFactory indexFilter = analysis.tokenFilter.get("kuromoji_completion_index");
    assertThat(indexFilter, instanceOf(KuromojiCompletionFilterFactory.class));
    final String[] indexExpected = new String[] { "東京", "toukyou", "都", "to" };
    final Tokenizer indexTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    indexTokenizer.setReader(new StringReader("東京都"));
    assertSimpleTSOutput(indexFilter.create(indexTokenizer), indexExpected);

    // mode=QUERY
    final TokenFilterFactory queryFilter = analysis.tokenFilter.get("kuromoji_completion_query");
    assertThat(queryFilter, instanceOf(KuromojiCompletionFilterFactory.class));
    final String[] queryExpected = new String[] { "サッk", "sakk" };
    final Tokenizer queryTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    queryTokenizer.setReader(new StringReader("サッk"));
    assertSimpleTSOutput(queryFilter.create(queryTokenizer), queryExpected);
}

/**
 * Runs the same input through the {@code completion_index_analyzer} and
 * {@code completion_query_analyzer} defined in the test settings and checks
 * the tokens each one emits.
 */
public void testCompletionAnalyzer() throws IOException {
    final TestAnalysis analysis = createTestAnalysis();
    final String text = "ソースコード";

    // mode=INDEX
    final Analyzer indexAnalyzer = analysis.indexAnalyzers.get("completion_index_analyzer");
    try (TokenStream tokens = indexAnalyzer.tokenStream("", text)) {
        final String[] indexExpected = new String[] { "ソース", "soーsu", "コード", "koーdo" };
        assertTokenStreamContents(tokens, indexExpected);
    }

    // mode=QUERY
    final Analyzer queryAnalyzer = analysis.indexAnalyzers.get("completion_query_analyzer");
    try (TokenStream tokens = queryAnalyzer.tokenStream("", text)) {
        final String[] queryExpected = new String[] { "ソースコード", "soーsukoーdo" };
        assertTokenStreamContents(tokens, queryExpected);
    }
}

private static TestAnalysis createTestAnalysis() throws IOException {
InputStream empty_dict = KuromojiAnalysisTests.class.getResourceAsStream("empty_user_dict.txt");
InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@
"ja_stop" : {
"type": "ja_stop",
"stopwords": ["_japanese_", "スピード"]
},
"kuromoji_completion_index" : {
"type": "kuromoji_completion",
"mode": "index"
},
"kuromoji_completion_query" : {
"type": "kuromoji_completion",
"mode": "query"
}
},

Expand Down Expand Up @@ -70,6 +78,14 @@
"my_analyzer" : {
"type" : "custom",
"tokenizer" : "kuromoji_tokenizer"
},
"completion_index_analyzer" : {
"type" : "kuromoji_completion",
"mode" : "index"
},
"completion_query_analyzer" : {
"type" : "kuromoji_completion",
"mode" : "query"
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,41 @@
filter: [kuromoji_stemmer]
- length: { tokens: 1 }
- match: { tokens.0.token: サーバ }
---
"Completion analyzer":
# Step 1: create an index that defines two analyzers of type kuromoji_completion,
# one with mode "index" and one with mode "query".
- do:
indices.create:
index: kuromoji_completion_sample
body:
settings:
index:
analysis:
analyzer:
completion_index:
type: kuromoji_completion
mode: index
completion_query:
type: kuromoji_completion
mode: query

# Step 2: analyze the sample text with the index-mode analyzer.
# Four tokens are expected: each surface form followed by its romanized form.
- do:
indices.analyze:
index: kuromoji_completion_sample
body:
text: ソースコード
analyzer: completion_index
- length: { tokens: 4 }
- match: { tokens.0.token: ソース }
- match: { tokens.1.token: soーsu }
- match: { tokens.2.token: コード }
- match: { tokens.3.token: koーdo }

# Step 3: analyze the same text with the query-mode analyzer.
# Two tokens are expected: the whole input and its romanized form.
- do:
indices.analyze:
index: kuromoji_completion_sample
body:
text: ソースコード
analyzer: completion_query
- length: { tokens: 2 }
- match: { tokens.0.token: ソースコード }
- match: { tokens.1.token: soーsukoーdo }