
Commit bc2289c

danmuzijimczi authored and committed

Add nori_number token filter in analysis-nori (#53583)

This change adds the `nori_number` token filter. It also adds a `discard_punctuation` option to `nori_tokenizer` that should be used in conjunction with the new filter.

1 parent 88c5d52 · commit bc2289c

File tree

7 files changed (+222, −1 lines)

docs/plugins/analysis-nori.asciidoc

Lines changed: 109 additions & 0 deletions
@@ -54,6 +54,10 @@ It can be set to:
 가곡역 => 가곡역, 가곡, 역
 --
 
+`discard_punctuation`::
+
+Whether punctuation should be discarded from the output. Defaults to `true`.
+
 `user_dictionary`::
 +
 --
@@ -99,6 +103,7 @@ PUT nori_sample
         "nori_user_dict": {
           "type": "nori_tokenizer",
           "decompound_mode": "mixed",
+          "discard_punctuation": "false",
           "user_dictionary": "userdict_ko.txt"
         }
       },
@@ -434,3 +439,107 @@ Which responds with:
 --------------------------------------------------
 
 <1> The Hanja form is replaced by the Hangul translation.
+
+
+[[analysis-nori-number]]
+==== `nori_number` token filter
+
+The `nori_number` token filter normalizes Korean numbers
+to regular Arabic decimal numbers in half-width characters.
+
+Korean numbers are often written using a combination of Hangul and Arabic numbers with various kinds of punctuation.
+For example, 3.2천 means 3200.
+This filter does this kind of normalization, allowing a search for 3200 to match 3.2천 in text,
+and it can also be used to build range facets based on the normalized numbers, and so on.
+
+[NOTE]
+====
+Notice that this filter uses a token composition scheme and relies on punctuation tokens
+being found in the token stream.
+Please make sure your `nori_tokenizer` has `discard_punctuation` set to `false`.
+If punctuation characters, such as U+FF0E (.), are removed from the token stream,
+this filter finds the input tokens 3 and 2천 and outputs 3 and 2000 instead of 3200,
+which is likely not the intended result.
+
+If you want to remove punctuation characters from your index that are not part of normalized numbers,
+add a `stop` token filter with the punctuation you wish to remove after `nori_number` in your analyzer chain,
+as shown in the sketch after this file's changes.
+====
+
+Below are some examples of the normalizations this filter supports.
+The input is untokenized text and the result is the single term attribute emitted for the input.
+
+- 영영칠 -> 7
+- 일영영영 -> 1000
+- 삼천2백2십삼 -> 3223
+- 조육백만오천일 -> 1000006005001
+- 3.2천 -> 3200
+- 1.2만345.67 -> 12345.67
+- 4,647.100 -> 4647.1
+- 15,7 -> 157 (be aware of this weakness)
+
+For example:
+
+[source,console]
+--------------------------------------------------
+PUT nori_sample
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "analyzer": {
+          "my_analyzer": {
+            "tokenizer": "tokenizer_discard_puncuation_false",
+            "filter": [
+              "part_of_speech_stop_sp", "nori_number"
+            ]
+          }
+        },
+        "tokenizer": {
+          "tokenizer_discard_puncuation_false": {
+            "type": "nori_tokenizer",
+            "discard_punctuation": "false"
+          }
+        },
+        "filter": {
+          "part_of_speech_stop_sp": {
+            "type": "nori_part_of_speech",
+            "stoptags": ["SP"]
+          }
+        }
+      }
+    }
+  }
+}
+
+GET nori_sample/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "십만이천오백과 3.2천"
+}
+--------------------------------------------------
+
+Which results in:
+
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens" : [{
+    "token" : "102500",
+    "start_offset" : 0,
+    "end_offset" : 6,
+    "type" : "word",
+    "position" : 0
+  }, {
+    "token" : "과",
+    "start_offset" : 6,
+    "end_offset" : 7,
+    "type" : "word",
+    "position" : 1
+  }, {
+    "token" : "3200",
+    "start_offset" : 8,
+    "end_offset" : 12,
+    "type" : "word",
+    "position" : 2
+  }]
+}
+--------------------------------------------------
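As the note in the docs above recommends, punctuation that `nori_number` does not consume during normalization can be dropped by a `stop` token filter placed after it. Below is a minimal sketch of such a chain (an editor's illustration, not part of this commit; the index, tokenizer, and filter names are invented, and the stopword list is an assumption):

[source,console]
--------------------------------------------------
PUT nori_number_stop_sample
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "my_analyzer": {
            "tokenizer": "punctuation_keeping_tokenizer",
            "filter": ["nori_number", "punctuation_stop"]
          }
        },
        "tokenizer": {
          "punctuation_keeping_tokenizer": {
            "type": "nori_tokenizer",
            "discard_punctuation": "false"
          }
        },
        "filter": {
          "punctuation_stop": {
            "type": "stop",
            "stopwords": [".", ",", "!"]
          }
        }
      }
    }
  }
}
--------------------------------------------------

With this ordering, punctuation inside numbers such as 3.2천 is consumed by `nori_number` first, and only the punctuation tokens left over afterwards reach the `stop` filter.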
plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriNumberFilterFactory.java

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ko.KoreanNumberFilter;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+public class NoriNumberFilterFactory extends AbstractTokenFilterFactory {
+
+    public NoriNumberFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new KoreanNumberFilter(tokenStream);
+    }
+}
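The factory is a thin wrapper: `create` simply wraps the incoming stream in Lucene's `KoreanNumberFilter`. Below is a minimal standalone sketch of that Lucene-level behavior (an editor's illustration, not part of this commit; it mirrors the `testNoriNumber` test added further down):

[source,java]
--------------------------------------------------
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ko.KoreanNumberFilter;
import org.apache.lucene.analysis.ko.KoreanTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NoriNumberSketch {
    public static void main(String[] args) throws Exception {
        // Tokenize Korean text, then let KoreanNumberFilter compose and
        // normalize the Hangul number 십만이천오백 into 102500.
        Tokenizer tokenizer = new KoreanTokenizer();
        tokenizer.setReader(new StringReader("오늘 십만이천오백원짜리 와인 구입"));
        TokenStream stream = new KoreanNumberFilter(tokenizer);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term); // 오늘, 102500, 원, 짜리, 와인, 구입
        }
        stream.end();
        stream.close();
    }
}
--------------------------------------------------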

plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java

Lines changed: 4 additions & 1 deletion
@@ -39,11 +39,13 @@ public class NoriTokenizerFactory extends AbstractTokenizerFactory {
 
     private final UserDictionary userDictionary;
     private final KoreanTokenizer.DecompoundMode decompoundMode;
+    private final boolean discardPunctuation;
 
     public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, settings, name);
         decompoundMode = getMode(settings);
         userDictionary = getUserDictionary(env, settings);
+        discardPunctuation = settings.getAsBoolean("discard_punctuation", true);
     }
 
     public static UserDictionary getUserDictionary(Environment env, Settings settings) {
@@ -77,7 +79,8 @@ public static KoreanTokenizer.DecompoundMode getMode(Settings settings) {
 
     @Override
     public Tokenizer create() {
-        return new KoreanTokenizer(KoreanTokenizer.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, decompoundMode, false);
+        return new KoreanTokenizer(KoreanTokenizer.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, decompoundMode, false,
+            discardPunctuation);
     }
 
 }
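The new setting is read with `Settings#getAsBoolean`, which supplies the default of `true` and rejects values other than `true` or `false` (the `testNoriTokenizerInvalidDiscardPunctuationOption` test below asserts the resulting error message). A small sketch of that parsing behavior (an editor's illustration, not part of this commit):

[source,java]
--------------------------------------------------
import org.elasticsearch.common.settings.Settings;

public class DiscardPunctuationSettingSketch {
    public static void main(String[] args) {
        // An absent key falls back to the default: punctuation is discarded.
        boolean byDefault = Settings.EMPTY.getAsBoolean("discard_punctuation", true); // true

        // An explicit "false" keeps punctuation tokens in the stream.
        Settings settings = Settings.builder()
            .put("discard_punctuation", "false")
            .build();
        boolean explicit = settings.getAsBoolean("discard_punctuation", true); // false

        // Any other value, e.g. "wrong", throws IllegalArgumentException:
        // "Failed to parse value [wrong] as only [true] or [false] are allowed."
        System.out.println(byDefault + " " + explicit);
    }
}
--------------------------------------------------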

plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,7 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.NoriAnalyzerProvider;
+import org.elasticsearch.index.analysis.NoriNumberFilterFactory;
 import org.elasticsearch.index.analysis.NoriPartOfSpeechStopFilterFactory;
 import org.elasticsearch.index.analysis.NoriReadingFormFilterFactory;
 import org.elasticsearch.index.analysis.NoriTokenizerFactory;
@@ -42,6 +43,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         Map<String, AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
         extra.put("nori_part_of_speech", NoriPartOfSpeechStopFilterFactory::new);
         extra.put("nori_readingform", NoriReadingFormFilterFactory::new);
+        extra.put("nori_number", NoriNumberFilterFactory::new);
         return extra;
     }
 
plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ protected Map<String, Class<?>> getTokenFilters() {
         Map<String, Class<?>> filters = new HashMap<>(super.getTokenFilters());
         filters.put("koreanpartofspeechstop", NoriPartOfSpeechStopFilterFactory.class);
         filters.put("koreanreadingform", NoriReadingFormFilterFactory.class);
+        filters.put("koreannumber", NoriNumberFilterFactory.class);
         return filters;
     }
 }

plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java

Lines changed: 51 additions & 0 deletions
@@ -54,6 +54,9 @@ public void testDefaultsNoriAnalysis() throws IOException {
         filterFactory = analysis.tokenFilter.get("nori_readingform");
         assertThat(filterFactory, instanceOf(NoriReadingFormFilterFactory.class));
 
+        filterFactory = analysis.tokenFilter.get("nori_number");
+        assertThat(filterFactory, instanceOf(NoriNumberFilterFactory.class));
+
         IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
         NamedAnalyzer analyzer = indexAnalyzers.get("nori");
         assertThat(analyzer.analyzer(), instanceOf(KoreanAnalyzer.class));
@@ -130,6 +133,33 @@ public void testNoriTokenizer() throws Exception {
         assertTokenStreamContents(tokenizer, new String[] {"뿌리", "가", "깊", "은", "나무"});
         tokenizer.setReader(new StringReader("가늠표"));
         assertTokenStreamContents(tokenizer, new String[] {"가늠표", "가늠", "표"});
+        // discard_punctuation default(true)
+        tokenizer.setReader(new StringReader("3.2개"));
+        assertTokenStreamContents(tokenizer, new String[] {"3", "2", "개"});
+    }
+
+    public void testNoriTokenizerDiscardPunctuationOptionTrue() throws Exception {
+        Settings settings = createDiscardPunctuationOption("true");
+        TestAnalysis analysis = createTestAnalysis(settings);
+        Tokenizer tokenizer = analysis.tokenizer.get("my_tokenizer").create();
+        tokenizer.setReader(new StringReader("3.2개"));
+        assertTokenStreamContents(tokenizer, new String[] {"3", "2", "개"});
+    }
+
+    public void testNoriTokenizerDiscardPunctuationOptionFalse() throws Exception {
+        Settings settings = createDiscardPunctuationOption("false");
+        TestAnalysis analysis = createTestAnalysis(settings);
+        Tokenizer tokenizer = analysis.tokenizer.get("my_tokenizer").create();
+        tokenizer.setReader(new StringReader("3.2개"));
+        assertTokenStreamContents(tokenizer, new String[] {"3", ".", "2", "개"});
+    }
+
+    public void testNoriTokenizerInvalidDiscardPunctuationOption() {
+        String wrongOption = "wrong";
+        Settings settings = createDiscardPunctuationOption(wrongOption);
+        IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
+        assertThat(exc.getMessage(), containsString("Failed to parse value [" + wrongOption
+            + "] as only [true] or [false] are allowed."));
     }
 
     public void testNoriPartOfSpeech() throws IOException {
@@ -159,6 +189,27 @@ public void testNoriReadingForm() throws IOException {
         assertTokenStreamContents(stream, new String[] {"향가"});
     }
 
+    public void testNoriNumber() throws IOException {
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .put("index.analysis.filter.my_filter.type", "nori_number")
+            .build();
+        TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisNoriPlugin());
+        TokenFilterFactory factory = analysis.tokenFilter.get("my_filter");
+        Tokenizer tokenizer = new KoreanTokenizer();
+        tokenizer.setReader(new StringReader("오늘 십만이천오백원짜리 와인 구입"));
+        TokenStream stream = factory.create(tokenizer);
+        assertTokenStreamContents(stream, new String[] {"오늘", "102500", "원", "짜리", "와인", "구입"});
+    }
+
+    private Settings createDiscardPunctuationOption(String option) {
+        return Settings.builder()
+            .put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")
+            .put("index.analysis.tokenizer.my_tokenizer.discard_punctuation", option)
+            .build();
+    }
+
     private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
         InputStream dict = NoriAnalysisTests.class.getResourceAsStream("user_dict.txt");
         Path home = createTempDir();
plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml

Lines changed: 17 additions & 0 deletions
@@ -46,3 +46,20 @@
       filter: [nori_readingform]
   - length: { tokens: 1 }
   - match: { tokens.0.token: 향가 }
+---
+"Number filter":
+  - do:
+      indices.analyze:
+        body:
+          text: 십만이천오백과 3.2천
+          tokenizer:
+            type: nori_tokenizer
+            discard_punctuation: false
+          filter:
+            - type: nori_part_of_speech
+              stoptags: ["SP"]
+            - type: nori_number
+  - length: { tokens: 3 }
+  - match: { tokens.0.token: "102500" }
+  - match: { tokens.1.token: 과 }
+  - match: { tokens.2.token: "3200" }
