From bb635e5a9efce3afd9f92b3de759fe700f9ce62a Mon Sep 17 00:00:00 2001 From: James Rodewig Date: Mon, 21 Oct 2019 09:43:59 -0400 Subject: [PATCH] [DOCS] Reformat CJK bigram and CJK width token filter docs (#48210) --- .../cjk-bigram-tokenfilter.asciidoc | 184 ++++++++++++++++-- .../cjk-width-tokenfilter.asciidoc | 85 +++++++- 2 files changed, 249 insertions(+), 20 deletions(-) diff --git a/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc index 8ad2403f38e0a..712538ec2786c 100644 --- a/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc @@ -1,18 +1,176 @@ [[analysis-cjk-bigram-tokenfilter]] -=== CJK Bigram Token Filter +=== CJK bigram token filter +++++ +CJK bigram +++++ -The `cjk_bigram` token filter forms bigrams out of the CJK -terms that are generated by the <> -or the `icu_tokenizer` (see {plugins}/analysis-icu-tokenizer.html[`analysis-icu` plugin]). +Forms https://en.wikipedia.org/wiki/Bigram[bigrams] out of CJK (Chinese, +Japanese, and Korean) tokens. -By default, when a CJK character has no adjacent characters to form a bigram, -it is output in unigram form. If you always want to output both unigrams and -bigrams, set the `output_unigrams` flag to `true`. This can be used for a -combined unigram+bigram approach. +This filter is included in {es}'s built-in <>. It uses Lucene's +https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/cjk/CJKBigramFilter.html[CJKBigramFilter]. -Bigrams are generated for characters in `han`, `hiragana`, `katakana` and -`hangul`, but bigrams can be disabled for particular scripts with the -`ignored_scripts` parameter. All non-CJK input is passed through unmodified. + +[[analysis-cjk-bigram-tokenfilter-analyze-ex]] +==== Example + +The following <> request demonstrates how the +CJK bigram token filter works. 
+
+[source,console]
+--------------------------------------------------
+GET /_analyze
+{
+  "tokenizer" : "standard",
+  "filter" : ["cjk_bigram"],
+  "text" : "東京都は、日本の首都であり"
+}
+--------------------------------------------------
+
+The filter produces the following tokens:
+
+[source,text]
+--------------------------------------------------
+[ 東京, 京都, 都は, 日本, 本の, の首, 首都, 都で, であ, あり ]
+--------------------------------------------------
+
+/////////////////////
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens" : [
+    {
+      "token" : "東京",
+      "start_offset" : 0,
+      "end_offset" : 2,
+      "type" : "<DOUBLE>",
+      "position" : 0
+    },
+    {
+      "token" : "京都",
+      "start_offset" : 1,
+      "end_offset" : 3,
+      "type" : "<DOUBLE>",
+      "position" : 1
+    },
+    {
+      "token" : "都は",
+      "start_offset" : 2,
+      "end_offset" : 4,
+      "type" : "<DOUBLE>",
+      "position" : 2
+    },
+    {
+      "token" : "日本",
+      "start_offset" : 5,
+      "end_offset" : 7,
+      "type" : "<DOUBLE>",
+      "position" : 3
+    },
+    {
+      "token" : "本の",
+      "start_offset" : 6,
+      "end_offset" : 8,
+      "type" : "<DOUBLE>",
+      "position" : 4
+    },
+    {
+      "token" : "の首",
+      "start_offset" : 7,
+      "end_offset" : 9,
+      "type" : "<DOUBLE>",
+      "position" : 5
+    },
+    {
+      "token" : "首都",
+      "start_offset" : 8,
+      "end_offset" : 10,
+      "type" : "<DOUBLE>",
+      "position" : 6
+    },
+    {
+      "token" : "都で",
+      "start_offset" : 9,
+      "end_offset" : 11,
+      "type" : "<DOUBLE>",
+      "position" : 7
+    },
+    {
+      "token" : "であ",
+      "start_offset" : 10,
+      "end_offset" : 12,
+      "type" : "<DOUBLE>",
+      "position" : 8
+    },
+    {
+      "token" : "あり",
+      "start_offset" : 11,
+      "end_offset" : 13,
+      "type" : "<DOUBLE>",
+      "position" : 9
+    }
+  ]
+}
+--------------------------------------------------
+/////////////////////
+
+[[analysis-cjk-bigram-tokenfilter-analyzer-ex]]
+==== Add to an analyzer
+
+The following <<indices-create-index,create index API>> request uses the
+CJK bigram token filter to configure a new
+<<analysis-custom-analyzer,custom analyzer>>.
+
+[source,console]
+--------------------------------------------------
+PUT /cjk_bigram_example
+{
+  "settings" : {
+    "analysis" : {
+      "analyzer" : {
+        "standard_cjk_bigram" : {
+          "tokenizer" : "standard",
+          "filter" : ["cjk_bigram"]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+
+
+[[analysis-cjk-bigram-tokenfilter-configure-parms]]
+==== Configurable parameters
+
+`ignored_scripts`::
++
+--
+(Optional, array of character scripts)
+Array of character scripts for which to disable bigrams.
+Possible values:
+
+* `han`
+* `hangul`
+* `hiragana`
+* `katakana`
+
+All non-CJK input is passed through unmodified.
+--
+
+`output_unigrams`::
+(Optional, boolean)
+If `true`, emit tokens in both bigram and
+https://en.wikipedia.org/wiki/N-gram[unigram] form. If `false`, a CJK character
+is output in unigram form when it has no adjacent characters. Defaults to
+`false`.
+
+[[analysis-cjk-bigram-tokenfilter-customize]]
+==== Customize
+
+To customize the CJK bigram token filter, duplicate it to create the basis
+for a new custom token filter. You can modify the filter using its configurable
+parameters.
[source,console] -------------------------------------------------- @@ -30,9 +188,9 @@ PUT /cjk_bigram_example "han_bigrams_filter" : { "type" : "cjk_bigram", "ignored_scripts": [ + "hangul", "hiragana", - "katakana", - "hangul" + "katakana" ], "output_unigrams" : true } diff --git a/docs/reference/analysis/tokenfilters/cjk-width-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/cjk-width-tokenfilter.asciidoc index 21bde5509a6a1..83b3ba8dee776 100644 --- a/docs/reference/analysis/tokenfilters/cjk-width-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/cjk-width-tokenfilter.asciidoc @@ -1,12 +1,83 @@ [[analysis-cjk-width-tokenfilter]] -=== CJK Width Token Filter +=== CJK width token filter +++++ +CJK width +++++ -The `cjk_width` token filter normalizes CJK width differences: +Normalizes width differences in CJK (Chinese, Japanese, and Korean) characters +as follows: -* Folds fullwidth ASCII variants into the equivalent basic Latin -* Folds halfwidth Katakana variants into the equivalent Kana +* Folds full-width ASCII character variants into the equivalent basic Latin +characters +* Folds half-width Katakana character variants into the equivalent Kana +characters -NOTE: This token filter can be viewed as a subset of NFKC/NFKD -Unicode normalization. See the {plugins}/analysis-icu-normalization-charfilter.html[`analysis-icu` plugin] -for full normalization support. +This filter is included in {es}'s built-in <>. It uses Lucene's +https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/cjk/CJKWidthFilter.html[CJKWidthFilter]. +NOTE: This token filter can be viewed as a subset of NFKC/NFKD Unicode +normalization. See the +{plugins}/analysis-icu-normalization-charfilter.html[`analysis-icu` plugin] for +full normalization support. 
+
+[[analysis-cjk-width-tokenfilter-analyze-ex]]
+==== Example
+
+[source,console]
+--------------------------------------------------
+GET /_analyze
+{
+  "tokenizer" : "standard",
+  "filter" : ["cjk_width"],
+  "text" : "ｼｰｻｲﾄﾞﾗｲﾅｰ"
+}
+--------------------------------------------------
+
+The filter produces the following token:
+
+[source,text]
+--------------------------------------------------
+シーサイドライナー
+--------------------------------------------------
+
+/////////////////////
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens" : [
+    {
+      "token" : "シーサイドライナー",
+      "start_offset" : 0,
+      "end_offset" : 10,
+      "type" : "<KATAKANA>",
+      "position" : 0
+    }
+  ]
+}
+--------------------------------------------------
+/////////////////////
+
+[[analysis-cjk-width-tokenfilter-analyzer-ex]]
+==== Add to an analyzer
+
+The following <<indices-create-index,create index API>> request uses the
+CJK width token filter to configure a new
+<<analysis-custom-analyzer,custom analyzer>>.
+
+[source,console]
+--------------------------------------------------
+PUT /cjk_width_example
+{
+  "settings" : {
+    "analysis" : {
+      "analyzer" : {
+        "standard_cjk_width" : {
+          "tokenizer" : "standard",
+          "filter" : ["cjk_width"]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------