Document Node Analysis Components API

This functionality was introduced in PR opensearch-project/OpenSearch#10296 Signed-off-by: Lukáš Vlček <lukas.vlcek@aiven.io>
lukas-vlcek · Feb 16, 2024 · 680ba25 · 680ba25
1 parent 9c8180e
commit 680ba25
Show file tree

Hide file tree

Showing 2 changed files with 313 additions and 0 deletions.
diff --git a/_analyzers/index.md b/_analyzers/index.md
@@ -64,6 +64,317 @@ Analyzer | Analysis performed | Analyzer output
 
 If needed, you can combine tokenizers, token filters, and character filters to create a custom analyzer.
 
+Starting with OpenSearch 2.12.1, you can retrieve a comprehensive list of all available text analysis components by using the [Nodes Info API]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-info/). This can be helpful when building custom analyzers, especially when you need to recall a component's name or identify the analysis plugin to which a component belongs.
+
+Introduced 2.12.1
+{: .label .label-purple }
+
+```json
+GET /_nodes/analysis_components?pretty=true&filter_path=nodes.*.analysis_components
+```
+{% include copy-curl.html %}
+
+The following is an example response from a node that includes the `analysis-common` module (a module that is present by default):
+
+<details open markdown="block">
+  <summary>
+    Response
+  </summary>
+  {: .text-delta}
+
+```json
+{
+  "nodes" : {
+    "cZidmv5kQbWQN8M8dz9f5g" : {
+      "analysis_components" : {
+        "analyzers" : [
+          "arabic",
+          "armenian",
+          "basque",
+          "bengali",
+          "brazilian",
+          "bulgarian",
+          "catalan",
+          "chinese",
+          "cjk",
+          "czech",
+          "danish",
+          "default",
+          "dutch",
+          "english",
+          "estonian",
+          "fingerprint",
+          "finnish",
+          "french",
+          "galician",
+          "german",
+          "greek",
+          "hindi",
+          "hungarian",
+          "indonesian",
+          "irish",
+          "italian",
+          "keyword",
+          "latvian",
+          "lithuanian",
+          "norwegian",
+          "pattern",
+          "persian",
+          "portuguese",
+          "romanian",
+          "russian",
+          "simple",
+          "snowball",
+          "sorani",
+          "spanish",
+          "standard",
+          "stop",
+          "swedish",
+          "thai",
+          "turkish",
+          "whitespace"
+        ],
+        "tokenizers" : [
+          "PathHierarchy",
+          "char_group",
+          "classic",
+          "edgeNGram",
+          "edge_ngram",
+          "keyword",
+          "letter",
+          "lowercase",
+          "nGram",
+          "ngram",
+          "path_hierarchy",
+          "pattern",
+          "simple_pattern",
+          "simple_pattern_split",
+          "standard",
+          "thai",
+          "uax_url_email",
+          "whitespace"
+        ],
+        "tokenFilters" : [
+          "apostrophe",
+          "arabic_normalization",
+          "arabic_stem",
+          "asciifolding",
+          "bengali_normalization",
+          "brazilian_stem",
+          "cjk_bigram",
+          "cjk_width",
+          "classic",
+          "common_grams",
+          "concatenate_graph",
+          "condition",
+          "czech_stem",
+          "decimal_digit",
+          "delimited_payload",
+          "delimited_term_freq",
+          "dictionary_decompounder",
+          "dutch_stem",
+          "edgeNGram",
+          "edge_ngram",
+          "elision",
+          "fingerprint",
+          "flatten_graph",
+          "french_stem",
+          "german_normalization",
+          "german_stem",
+          "hindi_normalization",
+          "hunspell",
+          "hyphenation_decompounder",
+          "indic_normalization",
+          "keep",
+          "keep_types",
+          "keyword_marker",
+          "kstem",
+          "length",
+          "limit",
+          "lowercase",
+          "min_hash",
+          "multiplexer",
+          "nGram",
+          "ngram",
+          "pattern_capture",
+          "pattern_replace",
+          "persian_normalization",
+          "porter_stem",
+          "predicate_token_filter",
+          "remove_duplicates",
+          "reverse",
+          "russian_stem",
+          "scandinavian_folding",
+          "scandinavian_normalization",
+          "serbian_normalization",
+          "shingle",
+          "snowball",
+          "sorani_normalization",
+          "standard",
+          "stemmer",
+          "stemmer_override",
+          "stop",
+          "synonym",
+          "synonym_graph",
+          "trim",
+          "truncate",
+          "unique",
+          "uppercase",
+          "word_delimiter",
+          "word_delimiter_graph"
+        ],
+        "charFilters" : [
+          "html_strip",
+          "mapping",
+          "pattern_replace"
+        ],
+        "normalizers" : [
+          "lowercase"
+        ],
+        "plugins" : [
+          {
+            "name" : "analysis-common",
+            "classname" : "org.opensearch.analysis.common.CommonAnalysisModulePlugin",
+            "analyzers" : [
+              "arabic",
+              "armenian",
+              "basque",
+              "bengali",
+              "brazilian",
+              "bulgarian",
+              "catalan",
+              "chinese",
+              "cjk",
+              "czech",
+              "danish",
+              "dutch",
+              "english",
+              "estonian",
+              "fingerprint",
+              "finnish",
+              "french",
+              "galician",
+              "german",
+              "greek",
+              "hindi",
+              "hungarian",
+              "indonesian",
+              "irish",
+              "italian",
+              "latvian",
+              "lithuanian",
+              "norwegian",
+              "pattern",
+              "persian",
+              "portuguese",
+              "romanian",
+              "russian",
+              "snowball",
+              "sorani",
+              "spanish",
+              "swedish",
+              "thai",
+              "turkish"
+            ],
+            "tokenizers" : [
+              "PathHierarchy",
+              "char_group",
+              "classic",
+              "edgeNGram",
+              "edge_ngram",
+              "keyword",
+              "letter",
+              "lowercase",
+              "nGram",
+              "ngram",
+              "path_hierarchy",
+              "pattern",
+              "simple_pattern",
+              "simple_pattern_split",
+              "thai",
+              "uax_url_email",
+              "whitespace"
+            ],
+            "tokenFilters" : [
+              "apostrophe",
+              "arabic_normalization",
+              "arabic_stem",
+              "asciifolding",
+              "bengali_normalization",
+              "brazilian_stem",
+              "cjk_bigram",
+              "cjk_width",
+              "classic",
+              "common_grams",
+              "concatenate_graph",
+              "condition",
+              "czech_stem",
+              "decimal_digit",
+              "delimited_payload",
+              "delimited_term_freq",
+              "dictionary_decompounder",
+              "dutch_stem",
+              "edgeNGram",
+              "edge_ngram",
+              "elision",
+              "fingerprint",
+              "flatten_graph",
+              "french_stem",
+              "german_normalization",
+              "german_stem",
+              "hindi_normalization",
+              "hyphenation_decompounder",
+              "indic_normalization",
+              "keep",
+              "keep_types",
+              "keyword_marker",
+              "kstem",
+              "length",
+              "limit",
+              "lowercase",
+              "min_hash",
+              "multiplexer",
+              "nGram",
+              "ngram",
+              "pattern_capture",
+              "pattern_replace",
+              "persian_normalization",
+              "porter_stem",
+              "predicate_token_filter",
+              "remove_duplicates",
+              "reverse",
+              "russian_stem",
+              "scandinavian_folding",
+              "scandinavian_normalization",
+              "serbian_normalization",
+              "snowball",
+              "sorani_normalization",
+              "stemmer",
+              "stemmer_override",
+              "synonym",
+              "synonym_graph",
+              "trim",
+              "truncate",
+              "unique",
+              "uppercase",
+              "word_delimiter",
+              "word_delimiter_graph"
+            ],
+            "charFilters" : [
+              "html_strip",
+              "mapping",
+              "pattern_replace"
+            ],
+            "hunspellDictionaries" : [ ]
+          }
+        ]
+      }
+    }
+  }
+}
+```
+</details>
+
 ## Text analysis at indexing time and query time
 
 OpenSearch performs text analysis on text fields when you index a document and when you send a search request. Depending on the time of text analysis, the analyzers used for it are classified as follows:

diff --git a/_api-reference/nodes-apis/nodes-info.md b/_api-reference/nodes-apis/nodes-info.md
@@ -69,6 +69,7 @@ plugins | Information about installed plugins and modules.
 ingest | Information about ingest pipelines and available ingest processors.
 aggregations | Information about available [aggregations]({{site.url}}{{site.baseurl}}/opensearch/aggregations).
 indices | Static index settings configured at the node level.
+analysis_components | Information about available [text analysis]({{site.url}}{{site.baseurl}}/analyzers/) components.
 
 ## Query parameters
 
@@ -162,6 +163,7 @@ plugins | Information about the installed plugins, including name, version, Open
 modules | Information about the modules, including name, version, OpenSearch version, Java version, description, class name, custom folder name, a list of extended plugins, and `has_native_controller`, which specifies whether the plugin has a native controller process. Modules are different from plugins because modules are loaded into OpenSearch automatically, while plugins have to be installed manually.
 ingest | Information about ingest pipelines and processors.
 aggregations | Information about the available aggregation types.
+analysis_components | Information about available [text analysis]({{site.url}}{{site.baseurl}}/analyzers/) components.
 
 
 ## Required permissions