From 01ba71bc1cf84fda52bef9cc7a84327d4604f97f Mon Sep 17 00:00:00 2001 From: Naarcha-AWS Date: Wed, 8 Mar 2023 14:44:16 -0600 Subject: [PATCH 1/3] Manual Backport for #2946 Signed-off-by: Naarcha-AWS --- _api-reference/alias.md | 3 + .../analyze-apis/perform-text-analysis.md | 12 +- _api-reference/cat/cat-allocation.md | 4 +- _api-reference/cat/cat-cluster_manager.md | 14 +- _api-reference/cat/cat-indices.md | 2 +- _api-reference/cat/cat-nodeattrs.md | 4 +- _api-reference/cat/cat-nodes.md | 8 +- _api-reference/cat/cat-pending-tasks.md | 4 +- _api-reference/cat/cat-plugins.md | 4 +- _api-reference/cat/cat-repositories.md | 4 +- _api-reference/cat/cat-segments.md | 2 +- _api-reference/cat/cat-shards.md | 4 +- _api-reference/cat/cat-snapshots.md | 2 +- _api-reference/cat/cat-templates.md | 4 +- _api-reference/cat/cat-thread-pool.md | 4 +- .../cluster-api/cluster-awareness.md | 130 ++ .../cluster-api/cluster-decommission.md | 86 ++ _api-reference/cluster-api/cluster-health.md | 115 +- .../cluster-api/cluster-settings.md | 9 +- _api-reference/cluster-api/cluster-stats.md | 6 +- _api-reference/cluster-api/index.md | 2 +- .../common-parameters.md | 2 + _api-reference/count.md | 2 +- _api-reference/document-apis/bulk.md | 15 +- .../index-apis/clear-index-cache.md | 6 +- _api-reference/index-apis/dangling-index.md | 4 +- _api-reference/index-apis/get-settings.md | 3 +- _api-reference/index-apis/put-mapping.md | 4 +- _api-reference/index-apis/shrink-index.md | 20 +- _api-reference/index-apis/update-settings.md | 2 +- _api-reference/multi-search.md | 2 +- _api-reference/nodes-apis/index.md | 16 +- _api-reference/nodes-apis/nodes-info.md | 8 +- _api-reference/nodes-apis/nodes-stats.md | 261 ++-- _api-reference/nodes-apis/nodes-usage.md | 2 +- .../popular-api.md | 2 + _api-reference/rank-eval.md | 2 +- _api-reference/reload-search-analyzer.md | 2 +- .../script-apis/create-stored-script.md | 14 +- _api-reference/script-apis/delete-script.md | 8 +- 
_api-reference/script-apis/exec-script.md | 1 - .../script-apis/exec-stored-script.md | 8 +- .../script-apis/get-script-contexts.md | 8 +- .../script-apis/get-script-language.md | 2 +- .../script-apis/get-stored-script.md | 10 +- _api-reference/search.md | 2 +- _api-reference/snapshots/create-repository.md | 4 +- _api-reference/snapshots/create-snapshot.md | 15 +- .../snapshots/delete-snapshot-repository.md | 2 +- _api-reference/snapshots/delete-snapshot.md | 2 +- .../snapshots/get-snapshot-repository.md | 8 +- .../snapshots/get-snapshot-status.md | 6 +- _api-reference/snapshots/get-snapshot.md | 6 +- _api-reference/snapshots/restore-snapshot.md | 9 +- .../snapshots/verify-snapshot-repository.md | 12 +- _api-reference/tasks.md | 57 +- {_opensearch => _api-reference}/units.md | 2 + .../data-prepper/data-prepper-reference.md | 439 ------- _clients/data-prepper/get-started.md | 63 - _clients/data-prepper/index.md | 15 - _clients/data-prepper/pipelines.md | 286 ----- _clients/javascript/helpers.md | 202 ---- _clients/javascript/index.md | 356 ------ _config.yml | 41 +- .../alias.md | 2 + .../autocomplete.md | 2 + .../binary.md | 2 + .../boolean.md | 2 + .../completion.md | 4 +- .../date.md | 5 +- .../geo-point.md | 15 + .../geo-shape.md | 46 +- .../geographic.md | 2 + .../index.md | 1 + .../ip.md | 2 + .../join.md | 2 + .../keyword.md | 2 + {_opensearch => _field-types}/mappings.md | 81 +- .../nested.md | 16 +- .../numeric.md | 2 + .../object-fields.md | 0 .../object.md | 2 + .../percolator.md | 2 + .../range.md | 15 +- .../rank.md | 2 + .../search-as-you-type.md | 2 + .../string.md | 2 + .../text.md | 2 + .../token-count.md | 2 + _field-types/xy-point.md | 105 ++ _field-types/xy-shape.md | 403 +++++++ _field-types/xy.md | 28 + {_opensearch => _im-plugin}/data-streams.md | 6 + {_opensearch => _im-plugin}/index-alias.md | 54 +- _im-plugin/index-rollups/index.md | 514 ++++++-- _im-plugin/index-rollups/rollup-api.md | 2 +- .../index-templates.md | 4 +- _im-plugin/index.md 
| 277 ++++- _im-plugin/ism/api.md | 147 +++ _im-plugin/ism/error-prevention/api.md | 156 +++ _im-plugin/ism/error-prevention/index.md | 70 ++ .../ism/error-prevention/resolutions.md | 229 ++++ _im-plugin/ism/index.md | 10 +- _im-plugin/ism/managedindexes.md | 2 +- _im-plugin/ism/policies.md | 122 ++ {_opensearch => _im-plugin}/reindex-data.md | 4 +- _im-plugin/security.md | 2 +- _opensearch/index-data.md | 271 ----- _opensearch/logs.md | 174 --- _opensearch/ux.md | 1069 ----------------- .../aggregations}/aggregations.md | 5 +- .../aggregations}/bucket-agg.md | 214 +++- _query-dsl/aggregations/geohexgrid-agg.md | 377 ++++++ .../aggregations}/metric-agg.md | 8 +- .../aggregations}/pipeline-agg.md | 5 +- _query-dsl/analyzers/language-analyzers.md | 43 + .../analyzers/refresh-analyzer.md | 6 +- .../analyzers}/text-analyzers.md | 46 +- .../query-dsl/compound/bool.md | 26 +- .../query-dsl/compound/index.md | 3 + .../query-dsl/full-text/index.md | 139 ++- .../query-dsl/full-text/query-string.md | 3 + .../query-dsl/geo-and-xy/geo-bounding-box.md | 3 + .../query-dsl/geo-and-xy/index.md | 3 + _query-dsl/query-dsl/geo-and-xy/xy.md | 438 +++++++ .../query-dsl/index.md | 4 +- .../query-dsl/query-filter-context.md | 1 + .../query-dsl/span-query.md | 3 + .../query-dsl/term-vs-full-text.md | 1 + {_opensearch => _query-dsl}/query-dsl/term.md | 8 +- _search-plugins/async/security.md | 2 +- _search-plugins/knn/api.md | 6 +- _search-plugins/knn/approximate-knn.md | 102 +- _search-plugins/knn/filter-search-knn.md | 649 ++++++++++ _search-plugins/knn/index.md | 2 +- _search-plugins/knn/jni-libraries.md | 7 +- _search-plugins/knn/knn-index.md | 96 +- _search-plugins/knn/knn-score-script.md | 2 +- _search-plugins/knn/painless-functions.md | 2 +- _search-plugins/knn/performance-tuning.md | 2 +- _search-plugins/knn/settings.md | 2 +- _search-plugins/neural-search.md | 204 ++++ _search-plugins/point-in-time-api.md | 272 +++++ _search-plugins/point-in-time.md | 159 +++ 
_search-plugins/querqy/index.md | 67 +- _search-plugins/search-relevance/index.md | 157 +++ .../search-template.md | 11 + .../searching-data}/autocomplete.md | 2 + .../searching-data}/did-you-mean.md | 0 .../searching-data}/highlight.md | 2 + .../searching-data}/index.md | 2 +- .../searching-data}/paginate.md | 2 + .../searching-data}/sort.md | 2 + _search-plugins/sql/full-text.md | 2 +- {_clients => _tools}/cli.md | 4 +- {_clients => _tools}/grafana.md | 2 +- .../index.md | 55 +- _tools/k8s-operator.md | 147 +++ .../logstash/advanced-config.md | 0 .../logstash/common-filters.md | 0 .../logstash/execution-model.md | 0 {_clients => _tools}/logstash/index.md | 2 +- .../logstash/read-from-opensearch.md | 0 .../logstash/ship-to-opensearch.md | 12 +- 164 files changed, 5982 insertions(+), 3548 deletions(-) create mode 100644 _api-reference/cluster-api/cluster-awareness.md create mode 100644 _api-reference/cluster-api/cluster-decommission.md rename {_opensearch => _api-reference}/common-parameters.md (98%) rename {_opensearch => _api-reference}/popular-api.md (98%) rename {_opensearch => _api-reference}/units.md (97%) delete mode 100644 _clients/data-prepper/data-prepper-reference.md delete mode 100644 _clients/data-prepper/get-started.md delete mode 100644 _clients/data-prepper/index.md delete mode 100644 _clients/data-prepper/pipelines.md delete mode 100644 _clients/javascript/helpers.md delete mode 100644 _clients/javascript/index.md rename {_opensearch/supported-field-types => _field-types}/alias.md (97%) rename {_opensearch/supported-field-types => _field-types}/autocomplete.md (91%) rename {_opensearch/supported-field-types => _field-types}/binary.md (95%) rename {_opensearch/supported-field-types => _field-types}/boolean.md (98%) rename {_opensearch/supported-field-types => _field-types}/completion.md (98%) rename {_opensearch/supported-field-types => _field-types}/date.md (99%) rename {_opensearch/supported-field-types => _field-types}/geo-point.md (88%) rename 
{_opensearch/supported-field-types => _field-types}/geo-shape.md (86%) rename {_opensearch/supported-field-types => _field-types}/geographic.md (90%) rename {_opensearch/supported-field-types => _field-types}/index.md (99%) rename {_opensearch/supported-field-types => _field-types}/ip.md (98%) rename {_opensearch/supported-field-types => _field-types}/join.md (99%) rename {_opensearch/supported-field-types => _field-types}/keyword.md (98%) rename {_opensearch => _field-types}/mappings.md (69%) rename {_opensearch/supported-field-types => _field-types}/nested.md (95%) rename {_opensearch/supported-field-types => _field-types}/numeric.md (98%) rename {_opensearch/supported-field-types => _field-types}/object-fields.md (100%) rename {_opensearch/supported-field-types => _field-types}/object.md (98%) rename {_opensearch/supported-field-types => _field-types}/percolator.md (97%) rename {_opensearch/supported-field-types => _field-types}/range.md (90%) rename {_opensearch/supported-field-types => _field-types}/rank.md (99%) rename {_opensearch/supported-field-types => _field-types}/search-as-you-type.md (98%) rename {_opensearch/supported-field-types => _field-types}/string.md (91%) rename {_opensearch/supported-field-types => _field-types}/text.md (99%) rename {_opensearch/supported-field-types => _field-types}/token-count.md (98%) create mode 100644 _field-types/xy-point.md create mode 100644 _field-types/xy-shape.md create mode 100644 _field-types/xy.md rename {_opensearch => _im-plugin}/data-streams.md (98%) rename {_opensearch => _im-plugin}/index-alias.md (71%) rename {_opensearch => _im-plugin}/index-templates.md (99%) create mode 100644 _im-plugin/ism/error-prevention/api.md create mode 100644 _im-plugin/ism/error-prevention/index.md create mode 100644 _im-plugin/ism/error-prevention/resolutions.md rename {_opensearch => _im-plugin}/reindex-data.md (99%) delete mode 100644 _opensearch/index-data.md delete mode 100644 _opensearch/logs.md delete mode 100644 
_opensearch/ux.md rename {_opensearch => _query-dsl/aggregations}/aggregations.md (98%) rename {_opensearch => _query-dsl/aggregations}/bucket-agg.md (86%) create mode 100644 _query-dsl/aggregations/geohexgrid-agg.md rename {_opensearch => _query-dsl/aggregations}/metric-agg.md (99%) rename {_opensearch => _query-dsl/aggregations}/pipeline-agg.md (99%) create mode 100644 _query-dsl/analyzers/language-analyzers.md rename _im-plugin/refresh-analyzer/index.md => _query-dsl/analyzers/refresh-analyzer.md (87%) rename {_opensearch/query-dsl => _query-dsl/analyzers}/text-analyzers.md (73%) rename {_opensearch => _query-dsl}/query-dsl/compound/bool.md (73%) rename {_opensearch => _query-dsl}/query-dsl/compound/index.md (93%) rename {_opensearch => _query-dsl}/query-dsl/full-text/index.md (63%) rename {_opensearch => _query-dsl}/query-dsl/full-text/query-string.md (98%) rename {_opensearch => _query-dsl}/query-dsl/geo-and-xy/geo-bounding-box.md (98%) rename {_opensearch => _query-dsl}/query-dsl/geo-and-xy/index.md (96%) create mode 100644 _query-dsl/query-dsl/geo-and-xy/xy.md rename {_opensearch => _query-dsl}/query-dsl/index.md (98%) rename {_opensearch => _query-dsl}/query-dsl/query-filter-context.md (98%) rename {_opensearch => _query-dsl}/query-dsl/span-query.md (94%) rename {_opensearch => _query-dsl}/query-dsl/term-vs-full-text.md (99%) rename {_opensearch => _query-dsl}/query-dsl/term.md (95%) create mode 100644 _search-plugins/knn/filter-search-knn.md create mode 100644 _search-plugins/neural-search.md create mode 100644 _search-plugins/point-in-time-api.md create mode 100644 _search-plugins/point-in-time.md create mode 100644 _search-plugins/search-relevance/index.md rename {_opensearch => _search-plugins}/search-template.md (97%) rename {_opensearch/search => _search-plugins/searching-data}/autocomplete.md (99%) rename {_opensearch/search => _search-plugins/searching-data}/did-you-mean.md (100%) rename {_opensearch/search => 
_search-plugins/searching-data}/highlight.md (99%) rename {_opensearch/search => _search-plugins/searching-data}/index.md (98%) rename {_opensearch/search => _search-plugins/searching-data}/paginate.md (99%) rename {_opensearch/search => _search-plugins/searching-data}/sort.md (99%) rename {_clients => _tools}/cli.md (98%) rename {_clients => _tools}/grafana.md (95%) rename {_clients/agents-and-ingestion-tools => _tools}/index.md (55%) create mode 100644 _tools/k8s-operator.md rename {_clients => _tools}/logstash/advanced-config.md (100%) rename {_clients => _tools}/logstash/common-filters.md (100%) rename {_clients => _tools}/logstash/execution-model.md (100%) rename {_clients => _tools}/logstash/index.md (99%) rename {_clients => _tools}/logstash/read-from-opensearch.md (100%) rename {_clients => _tools}/logstash/ship-to-opensearch.md (86%) diff --git a/_api-reference/alias.md b/_api-reference/alias.md index 5aa153bb98..349f4ac635 100644 --- a/_api-reference/alias.md +++ b/_api-reference/alias.md @@ -32,6 +32,7 @@ POST _aliases ] } ``` +{% include copy-curl.html %} ## Path and HTTP methods @@ -77,3 +78,5 @@ search_routing | String | Assigns a custom value to a shard only for search oper "acknowledged": true } ``` + +For more alias API operations, see [Index aliases]({{site.url}}{{site.baseurl}}/opensearch/index-alias/). \ No newline at end of file diff --git a/_api-reference/analyze-apis/perform-text-analysis.md b/_api-reference/analyze-apis/perform-text-analysis.md index 73d59c62ad..cca29a3db7 100644 --- a/_api-reference/analyze-apis/perform-text-analysis.md +++ b/_api-reference/analyze-apis/perform-text-analysis.md @@ -29,7 +29,7 @@ Although you can issue an analyzer request via both `GET` and `POST` requests, t You can include the following optional path parameter in your request. -Parameter | Data Type | Description +Parameter | Data type | Description :--- | :--- | :--- index | String | Index that is used to derive the analyzer. 
@@ -37,7 +37,7 @@ index | String | Index that is used to derive the analyzer. You can include the following optional query parameters in your request. -Field | Data Type | Description +Field | Data type | Description :--- | :--- | :--- analyzer | String | The name of the analyzer to apply to the `text` field. The analyzer can be built in or configured in the index.

If `analyzer` is not specified, the analyze API uses the analyzer defined in the mapping of the `field` field.

If the `field` field is not specified, the analyze API uses the default analyzer for the index.

If no index is specified or the index does not have a default analyzer, the analyze API uses the standard analyzer. attributes | Array of Strings | Array of token attributes for filtering the output of the `explain` field. @@ -50,7 +50,7 @@ tokenizer | String | Tokenizer for converting the `text` field into tokens. The following query parameter is required. -Field | Data Type | Description +Field | Data type | Description :--- | :--- | :--- text | String or Array of Strings | Text to analyze. If you provide an array of strings, the text is analyzed as a multi-value field. @@ -656,14 +656,14 @@ The preceding request is an index API rather than an analyze API. See [DYNAMIC I The text analysis endpoints return the following response fields. -Field | Data Type | Description +Field | Data type | Description :--- | :--- | :--- tokens | Array | Array of tokens derived from the `text`. See [token object](#token-object). detail | Object | Details about the analysis and each token. Included only when you request token details. See [detail object](#detail-object). #### Token object -Field | Data Type | Description +Field | Data type | Description :--- | :--- | :--- token | String | The token's text. start_offset | Integer | The token's starting position within the original text string. Offsets are zero-based. @@ -673,7 +673,7 @@ position | Integer | The token's position within the `tokens` array. #### Detail object -Field | Data Type | Description +Field | Data type | Description :--- | :--- | :--- custom_analyzer | Boolean | Whether the analyzer applied to the text is custom or built in. charfilters | Array | List of character filters applied to the text. 
diff --git a/_api-reference/cat/cat-allocation.md b/_api-reference/cat/cat-allocation.md index f6184c6b7b..96a9395053 100644 --- a/_api-reference/cat/cat-allocation.md +++ b/_api-reference/cat/cat-allocation.md @@ -50,8 +50,8 @@ In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-refe Parameter | Type | Description :--- | :--- | :--- bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster_manager node. Default is 30 seconds. ## Response diff --git a/_api-reference/cat/cat-cluster_manager.md b/_api-reference/cat/cat-cluster_manager.md index b629d2e35b..2508fce675 100644 --- a/_api-reference/cat/cat-cluster_manager.md +++ b/_api-reference/cat/cat-cluster_manager.md @@ -1,40 +1,40 @@ --- layout: default -title: CAT master +title: CAT cluster manager parent: CAT API nav_order: 30 has_children: false --- -# CAT master +# CAT cluster_manager Introduced 1.0 {: .label .label-purple } -The CAT master operation lists information that helps identify the elected master node. +The CAT cluster manager operation lists information that helps identify the elected cluster manager node. ## Example ``` -GET _cat/master?v +GET _cat/cluster_manager?v ``` {% include copy-curl.html %} ## Path and HTTP methods ``` -GET _cat/master +GET _cat/cluster_manager ``` ## URL parameters -All CAT master URL parameters are optional. +All CAT cluster manager URL parameters are optional. 
In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-reference/cat/index), you can specify the following parameters: Parameter | Type | Description :--- | :--- | :--- -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. ## Response ```json diff --git a/_api-reference/cat/cat-indices.md b/_api-reference/cat/cat-indices.md index 45ac04e2e4..32c983bf07 100644 --- a/_api-reference/cat/cat-indices.md +++ b/_api-reference/cat/cat-indices.md @@ -52,7 +52,7 @@ Parameter | Type | Description bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). health | String | Limit indices based on their health status. Supported values are `green`, `yellow`, and `red`. include_unloaded_segments | Boolean | Whether to include information from segments not loaded into memory. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. pri | Boolean | Whether to return information only from the primary shards. Default is false. time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). expand_wildcards | Enum | Expands wildcard expressions to concrete indices. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. 
diff --git a/_api-reference/cat/cat-nodeattrs.md b/_api-reference/cat/cat-nodeattrs.md index 8c3046abcc..ebfb58cb42 100644 --- a/_api-reference/cat/cat-nodeattrs.md +++ b/_api-reference/cat/cat-nodeattrs.md @@ -34,8 +34,8 @@ In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-refe Parameter | Type | Description :--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. ## Response diff --git a/_api-reference/cat/cat-nodes.md b/_api-reference/cat/cat-nodes.md index 6fff3bc032..ace8a287e2 100644 --- a/_api-reference/cat/cat-nodes.md +++ b/_api-reference/cat/cat-nodes.md @@ -13,7 +13,7 @@ Introduced 1.0 The CAT nodes operation lists node-level information, including node roles and load metrics. -A few important node metrics are `pid`, `name`, `master`, `ip`, `port`, `version`, `build`, `jdk`, along with `disk`, `heap`, `ram`, and `file_desc`. +A few important node metrics are `pid`, `name`, `cluster_manager`, `ip`, `port`, `version`, `build`, `jdk`, along with `disk`, `heap`, `ram`, and `file_desc`. ## Example @@ -38,8 +38,8 @@ Parameter | Type | Description :--- | :--- | :--- bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). full_id | Boolean | If true, return the full node ID. If false, return the shortened node ID. Defaults to false. -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. 
-master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). include_unloaded_segments | Boolean | Whether to include information from segments not loaded into memory. Default is false. @@ -47,6 +47,6 @@ include_unloaded_segments | Boolean | Whether to include information from segmen ## Response ```json -ip | heap.percent | ram.percent | cpu load_1m | load_5m | load_15m | node.role | node.roles | master | name +ip | heap.percent | ram.percent | cpu load_1m | load_5m | load_15m | node.role | node.roles | cluster_manager | name 10.11.1.225 | 31 | 32 | 0 | 0.00 | 0.00 | di | data,ingest,ml | - | data-e5b89ad7 ``` diff --git a/_api-reference/cat/cat-pending-tasks.md b/_api-reference/cat/cat-pending-tasks.md index e3dc2c7e92..fa9a581017 100644 --- a/_api-reference/cat/cat-pending-tasks.md +++ b/_api-reference/cat/cat-pending-tasks.md @@ -34,8 +34,8 @@ In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-refe Parameter | Type | Description :--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. 
time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). ## Response diff --git a/_api-reference/cat/cat-plugins.md b/_api-reference/cat/cat-plugins.md index 99dcf29b65..2f53512775 100644 --- a/_api-reference/cat/cat-plugins.md +++ b/_api-reference/cat/cat-plugins.md @@ -34,8 +34,8 @@ In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-refe Parameter | Type | Description :--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster_manager node. Default is 30 seconds. ## Response diff --git a/_api-reference/cat/cat-repositories.md b/_api-reference/cat/cat-repositories.md index 490f62b0b6..997b993cdd 100644 --- a/_api-reference/cat/cat-repositories.md +++ b/_api-reference/cat/cat-repositories.md @@ -34,8 +34,8 @@ In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-refe Parameter | Type | Description :--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster_manager node. Default is 30 seconds. 
## Response diff --git a/_api-reference/cat/cat-segments.md b/_api-reference/cat/cat-segments.md index 453d2ed46d..131a7a74a5 100644 --- a/_api-reference/cat/cat-segments.md +++ b/_api-reference/cat/cat-segments.md @@ -49,7 +49,7 @@ In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-refe Parameter | Type | Description :--- | :--- | :--- bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/).. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. ## Response diff --git a/_api-reference/cat/cat-shards.md b/_api-reference/cat/cat-shards.md index 49efa2d579..d07127c46e 100644 --- a/_api-reference/cat/cat-shards.md +++ b/_api-reference/cat/cat-shards.md @@ -49,8 +49,8 @@ In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-refe Parameter | Type | Description :--- | :--- | :--- bytes | Byte size | Specify the units for byte size. For example, `7kb` or `6gb`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster_manager node. Default is 30 seconds. time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). 
diff --git a/_api-reference/cat/cat-snapshots.md b/_api-reference/cat/cat-snapshots.md index 3adc7e5ebe..fcdd3c61ba 100644 --- a/_api-reference/cat/cat-snapshots.md +++ b/_api-reference/cat/cat-snapshots.md @@ -34,7 +34,7 @@ In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-refe Parameter | Type | Description :--- | :--- | :--- -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. time | Time | Specify the units for time. For example, `5d` or `7h`. For more information, see [Supported units]({{site.url}}{{site.baseurl}}/opensearch/units/). diff --git a/_api-reference/cat/cat-templates.md b/_api-reference/cat/cat-templates.md index 90232b42a5..f361ae26f4 100644 --- a/_api-reference/cat/cat-templates.md +++ b/_api-reference/cat/cat-templates.md @@ -42,8 +42,8 @@ In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-refe Parameter | Type | Description :--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. 
## Response diff --git a/_api-reference/cat/cat-thread-pool.md b/_api-reference/cat/cat-thread-pool.md index e40501d82e..9a48b0d017 100644 --- a/_api-reference/cat/cat-thread-pool.md +++ b/_api-reference/cat/cat-thread-pool.md @@ -47,8 +47,8 @@ In addition to the [common URL parameters]({{site.url}}{{site.baseurl}}/api-refe Parameter | Type | Description :--- | :--- | :--- -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +local | Boolean | Whether to return information from the local node only instead of from the cluster_manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster_manager node. Default is 30 seconds. ## Response diff --git a/_api-reference/cluster-api/cluster-awareness.md b/_api-reference/cluster-api/cluster-awareness.md new file mode 100644 index 0000000000..0c0fd49dcd --- /dev/null +++ b/_api-reference/cluster-api/cluster-awareness.md @@ -0,0 +1,130 @@ +--- +layout: default +title: Cluster routing and awareness +nav_order: 20 +parent: Cluster APIs +has_children: false +redirect_from: + - /api-reference/cluster-awareness/ +--- + +# Cluster routing and awareness + +You can use weights per awareness attribute to control the distribution of search or HTTP traffic across zones. This is commonly used for zonal deployments, heterogeneous instances, and routing traffic away from zones during zonal failure. + +## Path and HTTP methods + +``` +PUT /_cluster/routing/awareness/<attribute>/weights +GET /_cluster/routing/awareness/<attribute>/weights?local +GET /_cluster/routing/awareness/<attribute>/weights +``` + +## Path parameters + +Parameter | Type | Description +:--- | :--- | :--- +attribute | String | The name of the awareness attribute, usually `zone`.
The attribute name must match the values listed in the request body when assigning weights to zones. + +## Request body parameters + +Parameter | Type | Description +:--- | :--- | :--- +weights | JSON object | Assigns weights to attributes within the request body of the PUT request. Weights can be set in any ratio, for example, 2:3:5. In a 2:3:5 ratio with 3 zones, for every 100 requests sent to the cluster, each zone would receive either 20, 30, or 50 search requests in a random order. When assigned a weight of `0`, the zone does not receive any search traffic. +_version | String | Implements optimistic concurrency control (OCC) through versioning. The parameter uses simple versioning, such as `1`, and increments upward based on each subsequent modification. This allows any servers from which a request originates to validate whether or not a zone has been modified. + + +In the following example request body, `zone_1` and `zone_2` receive 50 requests each, whereas `zone_3` is prevented from receiving requests: + +``` +{ + "weights": + { + "zone_1": "5", + "zone_2": "5", + "zone_3": "0" + }, + "_version" : 1 +} +``` + +## Example: Weighted round robin search + +The following example request creates a round robin shard allocation for search traffic by using an undefined ratio: + +### Request + +```json +PUT /_cluster/routing/awareness/zone/weights +{ + "weights": + { + "zone_1": "1", + "zone_2": "1", + "zone_3": "0" + }, + "_version" : 1 +} +``` +{% include copy-curl.html %} + +### Response + +``` +{ + "acknowledged": true +} +``` + + +## Example: Getting weights for all zones + +The following example request gets weights for all zones.
+ +### Request + +```json +GET /_cluster/routing/awareness/zone/weights +``` +{% include copy-curl.html %} + +### Response + +OpenSearch responds with the weight of each zone: + +```json +{ + "weights": + { + + "zone_1": "1.0", + "zone_2": "1.0", + "zone_3": "0.0" + }, + "_version":1 +} +``` + +## Example: Deleting weights + +You can remove your weight ratio for each zone using the `DELETE` method. + +### Request + +```json +DELETE /_cluster/routing/awareness/zone/weights +``` +{% include copy-curl.html %} + +### Response + +```json +{ + "_version":1 +} +``` + +## Next steps + +- For more information about zone decommissioning, see [Cluster decommission]({{site.url}}{{site.baseurl}}/api-reference/cluster-decommission/). +- For more information about allocation awareness, see [Cluster formation]({{site.url}}{{site.baseurl}}/opensearch/cluster/#advanced-step-6-configure-shard-allocation-awareness-or-forced-awareness). diff --git a/_api-reference/cluster-api/cluster-decommission.md b/_api-reference/cluster-api/cluster-decommission.md new file mode 100644 index 0000000000..e64e2675fe --- /dev/null +++ b/_api-reference/cluster-api/cluster-decommission.md @@ -0,0 +1,86 @@ +--- +layout: default +title: Cluster decommission +nav_order: 30 +parent: Cluster APIs +has_children: false +redirect_from: + - /api-reference/cluster-decommission/ +--- + +# Cluster decommission + +The cluster decommission operation adds support for decommissioning based on awareness. It greatly benefits multi-zone deployments, where awareness attributes, such as `zones`, can aid in applying new upgrades to a cluster in a controlled fashion. This is especially useful during outages, in which case, you can decommission the unhealthy zone to prevent replication requests from stalling and prevent your request backlog from becoming too large. + +For more information about allocation awareness, see [Shard allocation awareness]({{site.url}}{{site.baseurl}}/opensearch/cluster/#shard-allocation-awareness). 
+ + +## HTTP and Path methods + +``` +PUT /_cluster/decommission/awareness/{awareness_attribute_name}/{awareness_attribute_value} +GET /_cluster/decommission/awareness/{awareness_attribute_name}/_status +DELETE /_cluster/decommission/awareness +``` + +## URL parameters + +Parameter | Type | Description +:--- | :--- | :--- +awareness_attribute_name | String | The name of the awareness attribute, usually `zone`. +awareness_attribute_value | String | The value of the awareness attribute. For example, if you have shards allocated in two different zones, you can give each zone a value of `zone-a` or `zone-b`. The cluster decommission operation decommissions the zone listed in the method. + + +## Example: Decommissioning and recommissioning a zone + +You can use the following example requests to decommission and recommission a zone: + +### Request + +The following example request decommissions `zone-a`: + +```json +PUT /_cluster/decommission/awareness/zone/zone-a +``` +{% include copy-curl.html %} + +If you want to recommission a decommissioned zone, you can use the `DELETE` method: + +```json +DELETE /_cluster/decommission/awareness +``` +{% include copy-curl.html %} + +### Response + + +```json +{ + "acknowledged": true +} +``` + +## Example: Getting zone decommission status + +The following example request returns the decommission status of all zones. + +### Request + +```json +GET /_cluster/decommission/awareness/zone/_status +``` +{% include copy-curl.html %} + +### Response + +```json +{ + "zone-1": "INIT | DRAINING | IN_PROGRESS | SUCCESSFUL | FAILED" +} +``` + + +## Next steps + +- For more information about zone awareness and weight, see [Cluster awareness]({{site.url}}{{site.baseurl}}/api-reference/cluster-awareness/). +- For more information about allocation awareness, see [Cluster formation]({{site.url}}{{site.baseurl}}/opensearch/cluster/#advanced-step-6-configure-shard-allocation-awareness-or-forced-awareness). 
diff --git a/_api-reference/cluster-api/cluster-health.md b/_api-reference/cluster-api/cluster-health.md index fac9afa439..9d8fa2f540 100644 --- a/_api-reference/cluster-api/cluster-health.md +++ b/_api-reference/cluster-api/cluster-health.md @@ -40,9 +40,10 @@ The following table lists the available query parameters. All query parameters a Parameter | Type | Description :--- | :--- | :--- expand_wildcards | Enum | Expands wildcard expressions to concrete indexes. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. -level | Enum | The level of detail for returned health information. Supported values are `cluster`, `indices`, and `shards`. Default is `cluster`. -local | Boolean | Whether to return information from the local node only instead of from the master node. Default is false. -master_timeout | Time | The amount of time to wait for a connection to the master node. Default is 30 seconds. +level | Enum | The level of detail for returned health information. Supported values are `cluster`, `indices`, `shards`, and `awareness_attributes`. Default is `cluster`. +awareness_attribute | String | The name of the awareness attribute, for which to return cluster health (for example, `zone`). Applicable only if `level` is set to `awareness_attributes`. +local | Boolean | Whether to return information from the local node only instead of from the cluster manager node. Default is false. +cluster_manager_timeout | Time | The amount of time to wait for a connection to the cluster manager node. Default is 30 seconds. timeout | Time | The amount of time to wait for a response. If the timeout expires, the request fails. Default is 30 seconds. wait_for_active_shards | String | Wait until the specified number of shards is active before returning a response. `all` for all shards. Default is `0`. wait_for_nodes | String | Wait for N number of nodes. Use `12` for exact match, `>12` and `<12` for range. 
@@ -50,6 +51,7 @@ wait_for_events | Enum | Wait until all currently queued events with the given p wait_for_no_relocating_shards | Boolean | Whether to wait until there are no relocating shards in the cluster. Default is false. wait_for_no_initializing_shards | Boolean | Whether to wait until there are no initializing shards in the cluster. Default is false. wait_for_status | Enum | Wait until the cluster health reaches the specified status or better. Supported values are `green`, `yellow`, and `red`. +weights | JSON object | Assigns weights to attributes within the request body of the PUT request. Weights can be set in any ratio, for example, 2:3:5. In a 2:3:5 ratio with three zones, for every 100 requests sent to the cluster, each zone would receive either 20, 30, or 50 search requests in a random order. When assigned a weight of `0`, the zone does not receive any search traffic. #### Example request @@ -58,6 +60,7 @@ The following example request retrieves cluster health for all indexes in the cl ```json GET _cluster/health ``` +{% include copy-curl.html %} #### Example response @@ -94,7 +97,7 @@ The following table lists all response fields. |status | String | The cluster health status, which represents the state of shard allocation in the cluster. May be `green`, `yellow`, or `red`. | |number_of_nodes | Integer | The number of nodes in the cluster. | |number_of_data_nodes | Integer | The number of data nodes in the cluster. | -|discovered_master | Boolean | Specifies whether the master node is discovered. | +|discovered_cluster_manager | Boolean | Specifies whether the cluster manager is discovered. | |active_primary_shards | Integer | The number of active primary shards. | |active_shards | Integer | The total number of active shards, including primary and replica shards. | |relocating_shards | Integer | The number of relocating shards. | @@ -105,8 +108,108 @@ The following table lists all response fields. 
|number_of_in_flight_fetch | Integer | The number of unfinished fetches. | |task_max_waiting_in_queue_millis | Integer | The maximum wait time for all tasks waiting to be performed, in milliseconds. | |active_shards_percent_as_number | Double | The percentage of active shards in the cluster. | +|awareness_attributes | Object | Contains cluster health information for each awareness attribute. | + +## Returning cluster health by awareness attribute + +To check cluster health by awareness attribute (for example, zone or rack), specify `awareness_attributes` in the `level` query parameter: + +```json +GET _cluster/health?level=awareness_attributes +``` +{% include copy-curl.html %} + +The response contains cluster health metrics partitioned by awareness attribute: + +```json +{ + "cluster_name": "runTask", + "status": "green", + "timed_out": false, + "number_of_nodes": 3, + "number_of_data_nodes": 3, + "discovered_master": true, + "discovered_cluster_manager": true, + "active_primary_shards": 0, + "active_shards": 0, + "relocating_shards": 0, + "initializing_shards": 0, + "unassigned_shards": 0, + "delayed_unassigned_shards": 0, + "number_of_pending_tasks": 0, + "number_of_in_flight_fetch": 0, + "task_max_waiting_in_queue_millis": 0, + "active_shards_percent_as_number": 100, + "awareness_attributes": { + "zone": { + "zone-3": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + }, + "zone-1": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + }, + "zone-2": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + } + }, + "rack": { + "rack-3": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + }, + "rack-1": { + "active_shards": 0, 
+ "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + }, + "rack-2": { + "active_shards": 0, + "initializing_shards": 0, + "relocating_shards": 0, + "unassigned_shards": 0, + "data_nodes": 1, + "weight": 1 + } + } + } +} +``` + +If you're interested in a particular awareness attribute, you can include the name of the awareness attribute as a query parameter: + +```json +GET _cluster/health?level=awareness_attributes&awareness_attribute=zone +``` +{% include copy-curl.html %} + +In response to the preceding request, OpenSearch returns cluster health information only for the `zone` awareness attribute. + +The unassigned shard information will be accurate only if you [enable replica count enforcement]({{site.url}}{{site.baseurl}}/opensearch/cluster#replica-count-enforcement) and [configure forced awareness]({{site.url}}{{site.baseurl}}/opensearch/cluster#forced-awareness) for the awareness attribute either before cluster start or after cluster start but before any indexing requests. If you enable replica enforcement after the cluster receives indexing requests, the unassigned shard information may be inaccurate. If you don't configure replica count enforcement and forced awareness, the `unassigned_shards` field will contain -1. +{: .warning} ## Required permissions -If you use the security plugin, make sure you have the appropriate permissions: -`cluster:monitor/health`. \ No newline at end of file +If you use the Security plugin, make sure you have the appropriate permissions: +`cluster:monitor/health`. 
diff --git a/_api-reference/cluster-api/cluster-settings.md b/_api-reference/cluster-api/cluster-settings.md index 5b37e8dc12..9d533f4f5a 100644 --- a/_api-reference/cluster-api/cluster-settings.md +++ b/_api-reference/cluster-api/cluster-settings.md @@ -3,7 +3,8 @@ layout: default title: Cluster settings nav_order: 50 parent: Cluster APIs -has_children: false +redirect_from: + - /api-reference/cluster-settings/ --- # Cluster settings @@ -27,7 +28,7 @@ Parameter | Data type | Description :--- | :--- | :--- flat_settings | Boolean | Whether to return settings in the flat form, which can improve readability, especially for heavily nested settings. For example, the flat form of `"cluster": { "max_shards_per_node": 500 }` is `"cluster.max_shards_per_node": "500"`. include_defaults (GET only) | Boolean | Whether to include default settings as part of the response. This parameter is useful for identifying the names and current values of settings you want to update. -master_timeout | Time unit | The amount of time to wait for a response from the master node. Default is `30 seconds`. +cluster_manager_timeout | Time unit | The amount of time to wait for a response from the cluster manager node. Default is `30 seconds`. timeout (PUT only) | Time unit | The amount of time to wait for a response from the cluster. Default is `30 seconds`. #### Example request @@ -103,7 +104,7 @@ The following request field parameters are compatible with the cluster API. | cluster.blocks.read_only_allow_delete | Boolean | Similar to `cluster.blocks.read_only` but allows you to delete indexes. | | cluster.max_shards_per_node | Integer | Limits the total number of primary and replica shards for the cluster. The limit is calculated as follows: `cluster.max_shards_per_node` multiplied by the number of non-frozen data nodes. Shards for closed indexes do not count toward this limit. Default is `1000`. 
| | cluster.persistent_tasks.allocation.enable | String | Enables or disables allocation for persistent tasks:

`all` – Allows persistent tasks to be assigned to nodes.

`none` – No allocations are allowed for persistent tasks. This does not affect persistent tasks already running.

Default is `all`. | -| cluster.persistent_tasks.allocation.recheck_interval | Time unit | The master node automatically checks whether or not persistent tasks need to be assigned when the cluster state changes in a significant way. There are other factors, such as memory usage, that will affect whether or not persistent tasks are assigned to nodes but do not otherwise cause the cluster state to change. This setting defines how often assignment checks are performed in response to these factors. Default is `30 seconds`, with a minimum of `10 seconds` being required. | +| cluster.persistent_tasks.allocation.recheck_interval | Time unit | The cluster manager automatically checks whether or not persistent tasks need to be assigned when the cluster state changes in a significant way. There are other factors, such as memory usage, that will affect whether or not persistent tasks are assigned to nodes but do not otherwise cause the cluster state to change. This setting defines how often assignment checks are performed in response to these factors. Default is `30 seconds`, with a minimum of `10 seconds` being required. | #### Example request @@ -119,7 +120,7 @@ PUT _cluster/settings ``` {% include copy-curl.html %} -For more information about transient settings, persistent settings, and precedence, see [OpenSearch configuration]({{site.url}}{{site.baseurl}}/opensearch/configuration/). +For more information about transient settings, persistent settings, and precedence, see [OpenSearch configuration]({{site.url}}{{site.baseurl}}/install-and-configure/configuration/). #### Example response diff --git a/_api-reference/cluster-api/cluster-stats.md b/_api-reference/cluster-api/cluster-stats.md index 1c6fab89c3..9e56b17807 100644 --- a/_api-reference/cluster-api/cluster-stats.md +++ b/_api-reference/cluster-api/cluster-stats.md @@ -17,7 +17,7 @@ The cluster stats API operation returns statistics about your cluster. 
## Examples ```json -GET _cluster/stats/nodes/_master +GET _cluster/stats/nodes/_cluster_manager ``` {% include copy-curl.html %} @@ -38,7 +38,7 @@ Parameter | Type | Description <node-filters> | List | A comma-separated list of [node filters]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/index/#node-filters) that OpenSearch uses to filter results. - Although the `master` node is now called `master` for version 2.0, we retained the `master` field for backwards compatibility. If you have a node that has either a `master` role or a `master` role, the `count` increases for both fields by 1. To see an example node count increase, see the Response sample. + Although the `master` node is now called `cluster_manager` for version 2.0, we retained the `master` field for backwards compatibility. If you have a node that has either a `master` role or a `cluster_manager` role, the `count` increases for both fields by 1. To see an example node count increase, see the Response sample. {: .note } ## Response @@ -225,7 +225,7 @@ Parameter | Type | Description "data": 1, "ingest": 1, "master": 1, - "master": 1, + "cluster_manager": 1, "remote_cluster_client": 1 }, "versions": [ diff --git a/_api-reference/cluster-api/index.md b/_api-reference/cluster-api/index.md index cc45db3bb3..9ce04695f6 100644 --- a/_api-reference/cluster-api/index.md +++ b/_api-reference/cluster-api/index.md @@ -9,4 +9,4 @@ redirect_from: # Cluster APIs -The cluster APIs allow you to manage your cluster. You can use them to check cluster health, modify settings, retrieve statistics, and more. \ No newline at end of file +The cluster APIs allow you to manage your cluster. You can use them to check cluster health, modify settings, retrieve statistics, and more. 
diff --git a/_opensearch/common-parameters.md b/_api-reference/common-parameters.md similarity index 98% rename from _opensearch/common-parameters.md rename to _api-reference/common-parameters.md index b96bd95a24..eb0fae8cf7 100644 --- a/_opensearch/common-parameters.md +++ b/_api-reference/common-parameters.md @@ -2,6 +2,8 @@ layout: default title: Common REST Parameters nav_order: 93 +redirect_from: + - /opensearch/common-parameters/ --- # Common REST parameters diff --git a/_api-reference/count.md b/_api-reference/count.md index d2fefb523b..fd9f3113ca 100644 --- a/_api-reference/count.md +++ b/_api-reference/count.md @@ -1,7 +1,7 @@ --- layout: default title: Count -nav_order: 20 +nav_order: 21 --- # Count diff --git a/_api-reference/document-apis/bulk.md b/_api-reference/document-apis/bulk.md index 83c343093d..e7bd87324f 100644 --- a/_api-reference/document-apis/bulk.md +++ b/_api-reference/document-apis/bulk.md @@ -103,13 +103,26 @@ All actions support the same metadata: `_index`, `_id`, and `_require_alias`. If - Update - This action updates existing documents and returns an error if the document doesn't exist. The next line must include a full or partial JSON document, depending on how much of the document you want to update. It can also include a script or upsert for more complex document updates. + This action updates existing documents and returns an error if the document doesn't exist. The next line must include a full or partial JSON document, depending on how much of the document you want to update. ```json { "update": { "_index": "movies", "_id": "tt0816711" } } { "doc" : { "title": "World War Z" } } ``` + + It can also include a script or upsert for more complex document updates. 
+ - Script + ```json + { "update": { "_index": "movies", "_id": "tt0816711" } } + { "script" : { "source": "ctx._source.title = \"World War Z\"" } } + ``` + + - Upsert + ```json + { "update": { "_index": "movies", "_id": "tt0816711" } } + { "doc" : { "title": "World War Z" }, "doc_as_upsert": true } + ``` ## Response diff --git a/_api-reference/index-apis/clear-index-cache.md b/_api-reference/index-apis/clear-index-cache.md index f1973b3059..56f3e2e981 100644 --- a/_api-reference/index-apis/clear-index-cache.md +++ b/_api-reference/index-apis/clear-index-cache.md @@ -15,7 +15,7 @@ If you use the security plugin, you must have the `manage index` privileges. ### Path parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | target | String | Comma-delimited list of data streams, indexes, and index aliases to which cache clearing will be applied. Wildcard expressions (`*`) are supported. To target all data streams and indexes in a cluster, omit this parameter or use `_all` or `*`. Optional. | @@ -24,7 +24,7 @@ If you use the security plugin, you must have the `manage index` privileges. All query parameters are optional. -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | allow_no_indices | Boolean | Whether to ignore wildcards, index aliases, or `_all` target (`target` path parameter) values that don’t match any indexes. If `false`, the request returns an error if any wildcard expression, index alias, or `_all` target value doesn't match any indexes. This behavior also applies if the request targets include other open indexes. For example, a request where the target is `fig*,app*` returns an error if an index starts with `fig` but no index starts with `app`. Defaults to `true`. | | expand_wildcards | String | Determines the index types that wildcard expressions can expand to. Accepts multiple values separated by a comma, such as `open,hidden`. Valid values are:

`all` -- Expand to open, closed, and hidden indexes.

`open` -- Expand only to open indexes.

`closed` -- Expand only to closed indexes

`hidden` -- Expand to include hidden indexes. Must be combined with `open`, `closed`, or `both`.

`none` -- Expansions are not accepted.

Defaults to `open`. | @@ -111,7 +111,7 @@ The `POST /books,hockey/_cache/clear` request returns the following fields: The `POST /books,hockey/_cache/clear` request returns the following response fields: -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | _shards | Object | Shard information. | | total | Integer | Total number of shards. | diff --git a/_api-reference/index-apis/dangling-index.md b/_api-reference/index-apis/dangling-index.md index 5703c50dea..653e818959 100644 --- a/_api-reference/index-apis/dangling-index.md +++ b/_api-reference/index-apis/dangling-index.md @@ -41,11 +41,11 @@ index-uuid | UUID of index. Query parameters are optional. -Query parameter | Data Type | Description +Query parameter | Data type | Description :--- | :--- | :--- accept_data_loss | Boolean | Must be set to `true` for an `import` or `delete` because Opensearch is unaware of where the dangling index data came from. timeout | Time units | The amount of time to wait for a response. If no response is received in the defined time period, an error is returned. Default is `30` seconds. -master_timeout | Time units | The amount of time to wait for the connection to the master node. If no response is received in the defined time period, an error is returned. Default is `30` seconds. +master_timeout | Time units | The amount of time to wait for the connection to the cluster manager. If no response is received in the defined time period, an error is returned. Default is `30` seconds. ## Examples diff --git a/_api-reference/index-apis/get-settings.md b/_api-reference/index-apis/get-settings.md index ab03453181..6fa558a760 100644 --- a/_api-reference/index-apis/get-settings.md +++ b/_api-reference/index-apis/get-settings.md @@ -21,6 +21,7 @@ GET /sample-index1/_settings ## Path and HTTP methods ``` +GET /_settings GET //_settings GET //_settings/ ``` @@ -29,7 +30,7 @@ GET //_settings/ All update settings parameters are optional. 
-Parameter | Data Type | Description +Parameter | Data type | Description :--- | :--- | :--- <target-index> | String | The index to get settings from. Can be a comma-separated list to get settings from multiple indexes, or use `_all` to return settings from all indexes within the cluster. <setting> | String | Filter to return specific settings. diff --git a/_api-reference/index-apis/put-mapping.md b/_api-reference/index-apis/put-mapping.md index ef7a128209..7bf9b243e7 100644 --- a/_api-reference/index-apis/put-mapping.md +++ b/_api-reference/index-apis/put-mapping.md @@ -50,13 +50,13 @@ PUT /sample-index/_mapping?ignore_unavailable The following table defines the put mapping query parameters: -Parameter | Data Type | Description +Parameter | Data type | Description :--- | :--- | :--- allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are `all` (match all indexes), `open` (match open indexes), `closed` (match closed indexes), `hidden` (match hidden indexes), and `none` (do not accept wildcard expressions), which must be used with `open`, `closed`, or both. Default is `open`. ignore_unavailable | Boolean | If true, OpenSearch does not include missing or closed indexes in the response. ignore_malformed | Boolean | Use this parameter with the `ip_range` data type to specify that OpenSearch should ignore malformed fields. If `true`, OpenSearch does not include entries that do not match the IP range specified in the index in the response. The default is `false`. -master_timeout | Time | How long to wait for a connection to the master node. Default is `30s`. +cluster_manager_timeout | Time | How long to wait for a connection to the cluster manager node. Default is `30s`. timeout | Time | How long to wait for the response to return. Default is `30s`. 
write_index_only | Boolean | Whether OpenSearch should apply mapping updates only to the write index. diff --git a/_api-reference/index-apis/shrink-index.md b/_api-reference/index-apis/shrink-index.md index a8bb503729..1df9269949 100644 --- a/_api-reference/index-apis/shrink-index.md +++ b/_api-reference/index-apis/shrink-index.md @@ -29,6 +29,7 @@ POST /my-old-index/_shrink/my-new-index ``` POST //_shrink/ +PUT //_shrink/ ``` When creating new indices with this operation, remember that OpenSearch indices have the following naming restrictions: @@ -56,6 +57,21 @@ timeout | Time | How long to wait for the request to return a response. Default You can use the request body to configure some index settings for the target index. All fields are optional. Field | Type | Description -alias | Object | Sets an alias for the target index. Can have the fields `filter`, `index_routing`, `is_hidden`, `is_write_index`, `routing`, and `search_routing`. See [Index Aliases]({{site.url}}{{site.baseurl}}/api-reference/alias/#request-body). +:--- | :--- | :--- +alias | Object | Sets an alias for the target index. Can have the fields `filter`, `index_routing`, `is_hidden`, `is_write_index`, `routing`, or `search_routing`. See [Index Aliases]({{site.url}}{{site.baseurl}}/api-reference/alias/#request-body). settings | Object | Index settings you can apply to your target index. See [Index Settings]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/#index-settings). -max_primary_shard_size | Bytes | Sets the maximum size of a primary shard in the target index. For example, if this field is set to 100 GB, and the source index's primary shards total to 300 GB, then the target index has 3 primary shards of 100 GB each. +[max_shard_size](#the-max_shard_size-parameter) | Bytes | Specifies the maximum size of a primary shard in the target index. Because `max_shard_size` conflicts with the `index.number_of_shards` setting, you cannot set both of them at the same time. 
+ +### The `max_shard_size` parameter + +The `max_shard_size` parameter specifies the maximum size of a primary shard in the target index. OpenSearch uses `max_shard_size` and the total storage for all primary shards in the source index to calculate the number of primary shards and their size for the target index. + +The primary shard count of the target index is the smallest factor of the source index's primary shard count for which the shard size does not exceed `max_shard_size`. For example, if the source index has 8 primary shards, they occupy a total of 400 GB of storage, and the `max_shard_size` is equal to 150 GB, OpenSearch calculates the number of primary shards in the target index using the following algorithm: + +1. Calculate the minimum number of primary shards as 400/150, rounded to the nearest whole integer. The minimum number of primary shards is 3. +1. Calculate the number of primary shards as the smallest factor of 8 that is greater than 3. The number of primary shards is 4. + +The maximum number of primary shards for the target index is equal to the number of primary shards in the source index because the shrink operation is used to reduce the primary shard count. As an example, consider a source index with 5 primary shards that occupy a total of 600 GB of storage. If `max_shard_size` is 100 GB, the minimum number of primary shards is 600/100, which is 6. However, because the number of primary shards in the source index is smaller than 6, the number of primary shards in the target index is set to 5. + +The minimum number of primary shards for the target index is 1. +{: .note} \ No newline at end of file diff --git a/_api-reference/index-apis/update-settings.md b/_api-reference/index-apis/update-settings.md index 4bea50b63a..fd507fd19d 100644 --- a/_api-reference/index-apis/update-settings.md +++ b/_api-reference/index-apis/update-settings.md @@ -36,7 +36,7 @@ PUT //_settings All update settings parameters are optional. 
-Parameter | Data Type | Description +Parameter | Data type | Description :--- | :--- | :--- allow_no_indices | Boolean | Whether to ignore wildcards that don’t match any indexes. Default is `true`. expand_wildcards | String | Expands wildcard expressions to different indexes. Combine multiple values with commas. Available values are `all` (match all indexes), `open` (match open indexes), `closed` (match closed indexes), `hidden` (match hidden indexes), and `none` (do not accept wildcard expressions), which must be used with `open`, `closed`, or both. Default is `open`. diff --git a/_api-reference/multi-search.md b/_api-reference/multi-search.md index eb9dd7c0d0..b895f0c5f0 100644 --- a/_api-reference/multi-search.md +++ b/_api-reference/multi-search.md @@ -59,7 +59,7 @@ All multi-search URL parameters are optional. Some can also be applied per-searc Parameter | Type | Description | Supported in metadata line :--- | :--- | :--- allow_no_indices | Boolean | Whether to ignore wildcards that don't match any indices. Default is `true`. | Yes -cancel_after_time_interval | Time | The time after which the search request will be canceled. Supported at both parent and child request levels. The order of precedence is:
1. Child-level parameter
2. Parent-level parameter
3. [Cluster setting]({{site.url}}{{site.baseurl}}/api-reference/cluster-api/cluster-settings).
Default is -1. | Yes +cancel_after_time_interval | Time | The time after which the search request will be canceled. Supported at both parent and child request levels. The order of precedence is:
1. Child-level parameter
2. Parent-level parameter
3. [Cluster setting]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings).
Default is -1. | Yes css_minimize_roundtrips | Boolean | Whether OpenSearch should try to minimize the number of network round trips between the coordinating node and remote clusters (only applicable to cross-cluster search requests). Default is `true`. | No expand_wildcards | Enum | Expands wildcard expressions to concrete indices. Combine multiple values with commas. Supported values are `all`, `open`, `closed`, `hidden`, and `none`. Default is `open`. | Yes ignore_unavailable | Boolean | If an index from the indices list doesn’t exist, whether to ignore it rather than fail the query. Default is `false`. | Yes diff --git a/_api-reference/nodes-apis/index.md b/_api-reference/nodes-apis/index.md index fd3749fee2..c6574f9661 100644 --- a/_api-reference/nodes-apis/index.md +++ b/_api-reference/nodes-apis/index.md @@ -31,11 +31,11 @@ Parameter | Type | Description Node filters support several node resolution mechanisms: -- Predefined constants: `_local`, `_master`, or `_all`. +- Predefined constants: `_local`, `_cluster_manager`, or `_all`. - An exact match for `nodeID` - A simple case-sensitive wildcard pattern matching for `node-name`, `host-name`, or `host-IP-address`. - Node roles where the `` value is set either to `true` or `false`: - - `master:` refers to all master-eligible nodes. + - `cluster_manager:` refers to all cluster manager-eligible nodes. - `data:` refers to all data nodes. - `ingest:` refers to all ingest nodes. - `voting_only:` refers to all voting-only nodes. @@ -45,10 +45,10 @@ Node filters support several node resolution mechanisms: Resolution mechanisms are applied sequentially in the order specified by the client. Each mechanism specification can either add or remove nodes. 
-To get statistics from the elected master node only, use the following query : +To get statistics from the elected cluster manager node only, use the following query: ```json -GET /_nodes/_master/stats +GET /_nodes/_cluster_manager/stats ``` {% include copy-curl.html %} @@ -63,16 +63,16 @@ GET /_nodes/data:true/stats The order of resolution mechanisms is applied sequentially, and each can add or remove nodes. The following examples yield different results. -To get statistics from all the nodes except the master node, use the following query: +To get statistics from all the nodes except the cluster manager node, use the following query: ```json -GET /_nodes/_all,master:false/stats +GET /_nodes/_all,cluster_manager:false/stats ``` {% include copy-curl.html %} -However, if you switch the resolution mechanisms, the result will include all the cluster nodes, including the master node: +However, if you switch the resolution mechanisms, the result will include all the cluster nodes, including the cluster manager node: ```json -GET /_nodes/master:false,_all/stats +GET /_nodes/cluster_manager:false,_all/stats ``` {% include copy-curl.html %} \ No newline at end of file diff --git a/_api-reference/nodes-apis/nodes-info.md b/_api-reference/nodes-apis/nodes-info.md index ce54c5192c..c36bdfde2c 100644 --- a/_api-reference/nodes-apis/nodes-info.md +++ b/_api-reference/nodes-apis/nodes-info.md @@ -25,7 +25,7 @@ GET /_nodes ``` {% include copy-curl.html %} -To get thread pool information about the master node only, use the following query: +To get thread pool information about the cluster manager node only, use the following query: ```json GET /_nodes/master:true/thread_pool @@ -56,7 +56,7 @@ The following table lists all available metric groups. Metric | Description :--- |:---- -settings | A node's settings. 
This is a combination of the default settings, custom settings from the [configuration file]({{site.url}}{{site.baseurl}}/opensearch/configuration/#configuration-file), and dynamically [updated settings]({{site.url}}{{site.baseurl}}/opensearch/configuration/#update-cluster-settings-using-the-api). +settings | A node's settings. This is a combination of the default settings, custom settings from the [configuration file]({{site.url}}{{site.baseurl}}/install-and-configure/configuration/#configuration-file), and dynamically [updated settings]({{site.url}}{{site.baseurl}}/install-and-configure/configuration/#update-cluster-settings-using-the-api). os | Static information about the host OS, including version, processor architecture, and available/allocated processors. process | Contains the process ID. jvm | Detailed static information about the running JVM, including arguments. @@ -79,10 +79,10 @@ timeout | Time | Sets the time limit for node response. Default value is `30s`. #### Example request -The following query requests the `process` and `transport` metrics from the master node: +The following query requests the `process` and `transport` metrics from the cluster manager node: ```json -GET /_nodes/master:true/process,transport +GET /_nodes/cluster_manager:true/process,transport ``` {% include copy-curl.html %} diff --git a/_api-reference/nodes-apis/nodes-stats.md b/_api-reference/nodes-apis/nodes-stats.md index d11e1cc31b..ca6f9797e9 100644 --- a/_api-reference/nodes-apis/nodes-stats.md +++ b/_api-reference/nodes-apis/nodes-stats.md @@ -122,7 +122,7 @@ GET _nodes/stats/ "host" : "127.0.0.1", "ip" : "127.0.0.1:9300", "roles" : [ - "master", + "cluster_manager", "data", "ingest", "remote_cluster_client" @@ -161,19 +161,22 @@ GET _nodes/stats/ "current" : 0 }, "search" : { - "open_contexts" : 0, - "query_total" : 194, - "query_time_in_millis" : 467, - "query_current" : 0, - "fetch_total" : 194, - "fetch_time_in_millis" : 143, - "fetch_current" : 0, - "scroll_total" : 
0, - "scroll_time_in_millis" : 0, - "scroll_current" : 0, - "suggest_total" : 0, - "suggest_time_in_millis" : 0, - "suggest_current" : 0 + "open_contexts": 4, + "query_total": 194, + "query_time_in_millis": 467, + "query_current": 0, + "fetch_total": 194, + "fetch_time_in_millis": 143, + "fetch_current": 0, + "scroll_total": 0, + "scroll_time_in_millis": 0, + "scroll_current": 0, + "point_in_time_total": 0, + "point_in_time_time_in_millis": 0, + "point_in_time_current": 0, + "suggest_total": 0, + "suggest_time_in_millis": 0, + "suggest_current": 0 }, "merges" : { "current" : 0, @@ -578,7 +581,7 @@ GET _nodes/stats/ The following table lists all response fields. -| Field | Data Type | Description | +| Field | Data type | Description | | :--- | :--- | :--- | | _nodes | Object | Statistics about the nodes that are returned. | | _nodes.total | Integer | The total number of nodes for this request. | @@ -591,14 +594,14 @@ The following table lists all response fields. The `nodes` object contains all nodes that are returned by the request, along with their IDs. Each node has the following properties. -Field | Data Type | Description +Field | Data type | Description :--- | :--- | :--- timestamp | Integer | The time the nodes statistics were collected, in milliseconds since the epoch. name | String | The name of the node. transport_address | IP address | The host and port of the transport layer that is used by nodes in a cluster to communicate internally. host | IP address | The network host of the node. ip | IP address | The IP address and port of the node. -roles | Array | The roles of the node (for example, `master`, `data`, or `ingest`). +roles | Array | The roles of the node (for example, `cluster_manager`, `data`, or `ingest`). attributes | Object | The attributes of the node (for example, `shard_indexing_pressure_enabled`). [indices](#indices) | Object | Index statistics for each index that has shards on the node. 
[os](#os) | Object | Statistics about the OS for the node. @@ -618,6 +621,7 @@ http.total_opened | Integer | The total number of HTTP connections the node has [adaptive_selection](#adaptive_selection) | Object | Statistics about adaptive selections for the node. [indexing_pressure](#indexing_pressure) | Object | Statistics related to the node's indexing pressure. [shard_indexing_pressure](#shard_indexing_pressure) | Object | Statistics related to indexing pressure at the shard level. +[search_backpressure]({{site.url}}{{site.baseurl}}/opensearch/search-backpressure#search-backpressure-stats-api) | Object | Statistics related to search backpressure. ### `indices` @@ -626,113 +630,116 @@ The `indices` object contains the index statistics for each index with shards on Field | Field type | Description :--- | :--- | :--- docs | Object | Document statistics for all primary shards that exist on the node. -docs.
    count | Integer | The number of documents reported by Lucene. Excludes deleted documents and recently indexed documents that are not yet assigned to a segment. Nested documents are counted separately. -docs.
    deleted | Integer | The number of deleted documents reported by Lucene. Excludes recent deletion operations that have not yet affect the segment. +docs.count | Integer | The number of documents reported by Lucene. Excludes deleted documents and recently indexed documents that are not yet assigned to a segment. Nested documents are counted separately. +docs.deleted | Integer | The number of deleted documents reported by Lucene. Excludes recent deletion operations that have not yet affected the segment. store | Object | Statistics about the shard sizes of the shards on the node. -store.
    size_in_bytes | Integer | Total size of all shards on the node. -store.
    reserved_in_bytes | Integer | The predicted number of bytes the shard store will grow to be because of activities such as restoring snapshots and peer recoveries. +store.size_in_bytes | Integer | Total size of all shards on the node. +store.reserved_in_bytes | Integer | The predicted number of bytes the shard store will grow to be because of activities such as restoring snapshots and peer recoveries. indexing | Object | Statistics about indexing operations for the node. -indexing.
    index_total | Integer | The total number of indexing operations on the node. -indexing.
    index_time_in_millis | Integer | The total time for all indexing operations, in milliseconds. -indexing.
    index_current | Integer | The number of indexing operations that are currently running. -indexing.
    index_failed | Integer | The number of indexing operations that have failed. -indexing.
    delete_total | Integer | The total number of deletions. -indexing.
    delete_time_in_millis | Integer | The total time for all deletion operations, in milliseconds. -indexing.
    delete_current | Integer | The number of deletion operations that are currently running. -indexing.
    noop_update_total | Integer | The total number of noop operations. -indexing.
    is_throttled | Boolean | Specifies whether any operations were throttled. -indexing.
    throttle_time_in_millis | Integer | The total time for throttling operations, in milliseconds. +indexing.index_total | Integer | The total number of indexing operations on the node. +indexing.index_time_in_millis | Integer | The total time for all indexing operations, in milliseconds. +indexing.index_current | Integer | The number of indexing operations that are currently running. +indexing.index_failed | Integer | The number of indexing operations that have failed. +indexing.delete_total | Integer | The total number of deletions. +indexing.delete_time_in_millis | Integer | The total time for all deletion operations, in milliseconds. +indexing.delete_current | Integer | The number of deletion operations that are currently running. +indexing.noop_update_total | Integer | The total number of noop operations. +indexing.is_throttled | Boolean | Specifies whether any operations were throttled. +indexing.throttle_time_in_millis | Integer | The total time for throttling operations, in milliseconds. get | Object | Statistics about the get operations for the node. -get.
    total | Integer | The total number of get operations. -get.
    time_in_millis | Integer | The total time for all get operations, in milliseconds. -get.
    exists_total | Integer | The total number of successful get operations. -get.
    exists_time_in_millis | Integer | The total time for all successful get operations, in milliseconds. -get.
    missing_total | Integer | The number of failed get operations. -get.
    missing_time_in_millis | Integer | The total time for all failed get operations, in milliseconds. -get.
    current | Integer | The number of get operations that are currently running. +get.total | Integer | The total number of get operations. +get.time_in_millis | Integer | The total time for all get operations, in milliseconds. +get.exists_total | Integer | The total number of successful get operations. +get.exists_time_in_millis | Integer | The total time for all successful get operations, in milliseconds. +get.missing_total | Integer | The number of failed get operations. +get.missing_time_in_millis | Integer | The total time for all failed get operations, in milliseconds. +get.current | Integer | The number of get operations that are currently running. search | Object | Statistics about the search operations for the node. -search.
    open_contexts | Integer | The number of open search contexts. -search.
    query_total | Integer | The total number of query operations. -search.
    query_time_in_millis | Integer | The total time for all query operations, in milliseconds. -search.
    query_current | Integer | The number of query operations that are currently running. -search.
    fetch_total | Integer | The total number of fetch operations. -search.
    fetch_time_in_millis | Integer | The total time for all fetch operations, in milliseconds. -search.
    fetch_current | Integer | The number of fetch operations that are currently running. -search.
    scroll_total | Integer | The total number of scroll operations. -search.
    scroll_time_in_millis | Integer | The total time for all scroll operations, in milliseconds. -search.
    scroll_current | Integer | The number of scroll operations that are currently running. -search.
    suggest_total | Integer | The total number of suggest operations. -search.
    suggest_time_in_millis | Integer | The total time for all suggest operations, in milliseconds. -search.
    suggest_current | Integer | The number of suggest operations that are currently running. +search.point_in_time_total | Integer | The total number of Point in Time contexts that have been created (completed and active) since the node last restarted. +search.point_in_time_time_in_millis | Integer | The amount of time that Point in Time contexts have been held open since the node last restarted, in milliseconds. +search.point_in_time_current | Integer | The number of Point in Time contexts currently open. +search.open_contexts | Integer | The number of open search contexts. +search.query_total | Integer | The total number of query operations. +search.query_time_in_millis | Integer | The total time for all query operations, in milliseconds. +search.query_current | Integer | The number of query operations that are currently running. +search.fetch_total | Integer | The total number of fetch operations. +search.fetch_time_in_millis | Integer | The total time for all fetch operations, in milliseconds. +search.fetch_current | Integer | The number of fetch operations that are currently running. +search.scroll_total | Integer | The total number of scroll operations. +search.scroll_time_in_millis | Integer | The total time for all scroll operations, in milliseconds. +search.scroll_current | Integer | The number of scroll operations that are currently running. +search.suggest_total | Integer | The total number of suggest operations. +search.suggest_time_in_millis | Integer | The total time for all suggest operations, in milliseconds. +search.suggest_current | Integer | The number of suggest operations that are currently running. merges | Object | Statistics about merge operations for the node. -merges.
    current | Integer | The number of merge operations that are currently running. -merges.
    current_docs | Integer | The number of document merges that are currently running. -merges.
    current_size_in_bytes | Integer | The memory size, in bytes, that is used to perform current merge operations. -merges.
    total | Integer | The total number of merge operations. -merges.
    total_time_in_millis | Integer | The total time for merges, in milliseconds. -merges.
    total_docs | Integer | The total number of documents that have been merged. -merges.
    total_size_in_bytes | Integer | The total size of all merged documents, in bytes. -merges.
    total_stopped_time_in_millis | Integer | The total time spent on stopping merge operations, in milliseconds. -merges.
    total_throttled_time_in_millis | Integer | The total time spent on throttling merge operations, in milliseconds. -merges.
    total_auto_throttle_in_bytes | Integer | The total size of automatically throttled merge operations, in bytes. +merges.current | Integer | The number of merge operations that are currently running. +merges.current_docs | Integer | The number of document merges that are currently running. +merges.current_size_in_bytes | Integer | The memory size, in bytes, that is used to perform current merge operations. +merges.total | Integer | The total number of merge operations. +merges.total_time_in_millis | Integer | The total time for merges, in milliseconds. +merges.total_docs | Integer | The total number of documents that have been merged. +merges.total_size_in_bytes | Integer | The total size of all merged documents, in bytes. +merges.total_stopped_time_in_millis | Integer | The total time spent on stopping merge operations, in milliseconds. +merges.total_throttled_time_in_millis | Integer | The total time spent on throttling merge operations, in milliseconds. +merges.total_auto_throttle_in_bytes | Integer | The total size of automatically throttled merge operations, in bytes. refresh | Object | Statistics about refresh operations for the node. -refresh.
    total | Integer | The total number of refresh operations. -refresh.
    total_time_in_millis | Integer | The total time for all refresh operations, in milliseconds. -refresh.
    external_total | Integer | The total number of external refresh operations. -refresh.
    external_total_time_in_millis | Integer | The total time for all external refresh operations, in milliseconds. -refresh.
    listeners | Integer | The number of refresh listeners. +refresh.total | Integer | The total number of refresh operations. +refresh.total_time_in_millis | Integer | The total time for all refresh operations, in milliseconds. +refresh.external_total | Integer | The total number of external refresh operations. +refresh.external_total_time_in_millis | Integer | The total time for all external refresh operations, in milliseconds. +refresh.listeners | Integer | The number of refresh listeners. flush | Object | Statistics about flush operations for the node. -flush.
    total | Integer | The total number of flush operations. -flush.
    periodic | Integer | The total number of periodic flush operations. -flush.
    total_time_in_millis | Integer | The total time for all flush operations, in milliseconds. +flush.total | Integer | The total number of flush operations. +flush.periodic | Integer | The total number of periodic flush operations. +flush.total_time_in_millis | Integer | The total time for all flush operations, in milliseconds. warmer | Object | Statistics about the index warming operations for the node. -warmer.
    current | Integer | The number of current index warming operations. -warmer.
    total | Integer | The total number of index warming operations. -warmer.
    total_time_in_millis | Integer | The total time for all index warming operations, in milliseconds. +warmer.current | Integer | The number of current index warming operations. +warmer.total | Integer | The total number of index warming operations. +warmer.total_time_in_millis | Integer | The total time for all index warming operations, in milliseconds. query_cache | Statistics about query cache operations for the node. -query_cache.
    memory_size_in_bytes | Integer | The amount of memory used for the query cache for all shards in the node. -query_cache.
    total_count | Integer | The total number of hits, misses, and cached queries in the query cache. -query_cache.
    hit_count | Integer | The total number of hits in the query cache. -query_cache.
    miss_count | Integer | The total number of misses in the query cache. -query_cache.
    cache_size | Integer | The size of the query cache, in bytes. -query_cache.
    cache_count | Integer | The number of queries in the query cache. -query_cache.
    evictions | Integer | The number of evictions in the query cache. +query_cache.memory_size_in_bytes | Integer | The amount of memory used for the query cache for all shards in the node. +query_cache.total_count | Integer | The total number of hits, misses, and cached queries in the query cache. +query_cache.hit_count | Integer | The total number of hits in the query cache. +query_cache.miss_count | Integer | The total number of misses in the query cache. +query_cache.cache_size | Integer | The size of the query cache, in bytes. +query_cache.cache_count | Integer | The number of queries in the query cache. +query_cache.evictions | Integer | The number of evictions in the query cache. fielddata | Object | Statistics about the field data cache for all shards in the node. -fielddata.
    memory_size_in_bytes | Integer | The total amount of memory used for the field data cache for all shards in the node. -fielddata.
    evictions | Integer | The number of evictions in the field data cache. -fielddata.
    fields | Object | Contains all field data fields. +fielddata.memory_size_in_bytes | Integer | The total amount of memory used for the field data cache for all shards in the node. +fielddata.evictions | Integer | The number of evictions in the field data cache. +fielddata.fields | Object | Contains all field data fields. completion | Object | Statistics about completions for all shards in the node. -completion.
    size_in_bytes | Integer | The total amount of memory used for completion for all shards in the node, in bytes. -completion.
    fields | Object | Contains completion fields. +completion.size_in_bytes | Integer | The total amount of memory used for completion for all shards in the node, in bytes. +completion.fields | Object | Contains completion fields. segments | Object | Statistics about segments for all shards in the node. -segments.
    count | Integer | The total number of segments. -segments.
    memory_in_bytes | Integer | The total amount of memory, in bytes. -segments.
    terms_memory_in_bytes | Integer | The total amount of memory used for terms, in bytes. -segments.
    stored_fields_memory_in_bytes | Integer | The total amount of memory used for stored fields, in bytes. -segments.
    term_vectors_memory_in_bytes | Integer | The total amount of memory used for term vectors, in bytes. -segments.
    norms_memory_in_bytes | Integer | The total amount of memory used for normalization factors, in bytes. -segments.
    points_memory_in_bytes | Integer | The total amount of memory used for points, in bytes. -segments.
    doc_values_memory_in_bytes | Integer | The total amount of memory used for doc values, in bytes. -segments.
    index_writer_memory_in_bytes | Integer | The total amount of memory used by all index writers, in bytes. -segments.
    version_map_memory_in_bytes | Integer | The total amount of memory used by all version maps, in bytes. -segments.
    fixed_bit_set_memory_in_bytes | Integer | The total amount of memory used by fixed bit sets, in bytes. Fixed bit sets are used for nested objects and join fields. -segments.
    max_unsafe_auto_id_timestamp | Integer | The timestamp for the most recently retired indexing request, in milliseconds since the epoch. -segments.
    file_sizes | Integer | Statistics about the size of the segment files. +segments.count | Integer | The total number of segments. +segments.memory_in_bytes | Integer | The total amount of memory, in bytes. +segments.terms_memory_in_bytes | Integer | The total amount of memory used for terms, in bytes. +segments.stored_fields_memory_in_bytes | Integer | The total amount of memory used for stored fields, in bytes. +segments.term_vectors_memory_in_bytes | Integer | The total amount of memory used for term vectors, in bytes. +segments.norms_memory_in_bytes | Integer | The total amount of memory used for normalization factors, in bytes. +segments.points_memory_in_bytes | Integer | The total amount of memory used for points, in bytes. +segments.doc_values_memory_in_bytes | Integer | The total amount of memory used for doc values, in bytes. +segments.index_writer_memory_in_bytes | Integer | The total amount of memory used by all index writers, in bytes. +segments.version_map_memory_in_bytes | Integer | The total amount of memory used by all version maps, in bytes. +segments.fixed_bit_set_memory_in_bytes | Integer | The total amount of memory used by fixed bit sets, in bytes. Fixed bit sets are used for nested objects and join fields. +segments.max_unsafe_auto_id_timestamp | Integer | The timestamp for the most recently retired indexing request, in milliseconds since the epoch. +segments.file_sizes | Integer | Statistics about the size of the segment files. translog | Object | Statistics about transaction log operations for the node. -translog.
    operations | Integer | The number of translog operations. -translog.
    size_in_bytes | Integer | The size of the translog, in bytes. -translog.
    uncommitted_operations | Integer | The number of uncommitted translog operations. -translog.
    uncommitted_size_in_bytes | Integer | The size of uncommitted translog operations, in bytes. -translog.
    earliest_last_modified_age | Integer | The earliest last modified age for the translog. +translog.operations | Integer | The number of translog operations. +translog.size_in_bytes | Integer | The size of the translog, in bytes. +translog.uncommitted_operations | Integer | The number of uncommitted translog operations. +translog.uncommitted_size_in_bytes | Integer | The size of uncommitted translog operations, in bytes. +translog.earliest_last_modified_age | Integer | The earliest last modified age for the translog. request_cache | Object | Statistics about the request cache for the node. -request_cache.
    memory_size_in_bytes | Integer | The memory size used by the request cache, in bytes. -request_cache.
    evictions | Integer | The number of request cache evictions. -request_cache.
    hit_count | Integer | The number of request cache hits. -request_cache.
    miss_count | Integer | The number of request cache misses. +request_cache.memory_size_in_bytes | Integer | The memory size used by the request cache, in bytes. +request_cache.evictions | Integer | The number of request cache evictions. +request_cache.hit_count | Integer | The number of request cache hits. +request_cache.miss_count | Integer | The number of request cache misses. recovery | Object | Statistics about recovery operations for the node. -recovery.
    current_as_source | Integer | The number of recovery operations that have used an index shard as a source. -recovery.
    current_as_target | Integer | The number of recovery operations that have used an index shard as a target. -recovery.
    throttle_time_in_millis | Integer | The delay of recovery operations due to throttling, in milliseconds. +recovery.current_as_source | Integer | The number of recovery operations that have used an index shard as a source. +recovery.current_as_target | Integer | The number of recovery operations that have used an index shard as a target. +recovery.throttle_time_in_millis | Integer | The delay of recovery operations due to throttling, in milliseconds. ### `os` @@ -749,19 +756,19 @@ cpu.load_average.1m | Float | The load average for the system for the time perio cpu.load_average.5m | Float | The load average for the system for the time period of five minutes. cpu.load_average.15m | Float | The load average for the system for the time period of 15 minutes. cpu.mem | Object | Statistics about memory usage for the node. -cpu.mem.    total_in_bytes | Integer | The total amount of physical memory, in bytes. -cpu.mem.    free_in_bytes | Integer | The total amount of free physical memory, in bytes. -cpu.mem.    used_in_bytes | Integer | The total amount of used physical memory, in bytes. -cpu.mem.    free_percent | Integer | The percentage of memory that is free. -cpu.mem.    used_percent | Integer | The percentage of memory that is used. +cpu.mem.total_in_bytes | Integer | The total amount of physical memory, in bytes. +cpu.mem.free_in_bytes | Integer | The total amount of free physical memory, in bytes. +cpu.mem.used_in_bytes | Integer | The total amount of used physical memory, in bytes. +cpu.mem.free_percent | Integer | The percentage of memory that is free. +cpu.mem.used_percent | Integer | The percentage of memory that is used. cpu.swap | Object | Statistics about swap space for the node. -cpu.swap.    total_in_bytes | Integer | The total amount of swap space, in bytes. +cpu.swap.total_in_bytes | Integer | The total amount of swap space, in bytes. cpu.swap.free_in_bytes | Integer | The total amount of free swap space, in bytes. -cpu.swap.    
used_in_bytes | Integer | The total amount of used swap space, in bytes. +cpu.swap.used_in_bytes | Integer | The total amount of used swap space, in bytes. cpu.cgroup | Object | Contains cgroup statistics for the node. Returned for Linux only. cpu.cgroup.cpuacct | Object | Statistics about the cpuacct control group for the node. -cpu.cgroup.    cpu | Object | Statistics about the CPU control group for the node. -cpu.cgroup.    memory | Object | Statistics about the memory control group for the node. +cpu.cgroup.cpu | Object | Statistics about the CPU control group for the node. +cpu.cgroup.memory | Object | Statistics about the memory control group for the node. ### `process` @@ -948,12 +955,12 @@ The `indexing_pressure` object contains the indexing pressure statistics and has Field | Field type | Description :--- | :--- | :--- memory | Object | Statistics related to memory consumption for the indexing load. -memory.
    current | Object | Statistics related to memory consumption for the current indexing load. -memory.
    current.
        combined_coordinating_and_primary_in_bytes | Integer | The total memory used by indexing requests in the coordinating or primary stages, in bytes. A node can reuse the coordinating memory if the primary stage is run locally, so the total memory does not necessarily equal the sum of the coordinating and primary stage memory usage. -memory.
    current.
        coordinating_in_bytes | The total memory consumed by indexing requests in the coordinating stage, in bytes. -memory.
    current.
        primary_in_bytes | Integer | The total memory consumed by indexing requests in the primary stage, in bytes. -memory.
    current.
        replica_in_bytes | Integer | The total memory consumed by indexing requests in the replica stage, in bytes. -memory.
    current.
        all_in_bytes | Integer | The total memory consumed by indexing requests in the coordinating, primary, or replica stages. +memory.current | Object | Statistics related to memory consumption for the current indexing load. +memory.current.combined_coordinating_and_primary_in_bytes | Integer | The total memory used by indexing requests in the coordinating or primary stages, in bytes. A node can reuse the coordinating memory if the primary stage is run locally, so the total memory does not necessarily equal the sum of the coordinating and primary stage memory usage. +memory.current.coordinating_in_bytes | Integer | The total memory consumed by indexing requests in the coordinating stage, in bytes. +memory.current.primary_in_bytes | Integer | The total memory consumed by indexing requests in the primary stage, in bytes. +memory.current.replica_in_bytes | Integer | The total memory consumed by indexing requests in the replica stage, in bytes. +memory.current.all_in_bytes | Integer | The total memory consumed by indexing requests in the coordinating, primary, or replica stages. ### `shard_indexing_pressure` @@ -963,9 +970,9 @@ Field | Field type | Description :--- | :--- | :--- [stats]({{site.url}}{{site.baseurl}}/opensearch/stats-api/) | Object | Statistics about shard indexing pressure. total_rejections_breakup_shadow_mode | Object | If running in shadow mode, the `total_rejections_breakup_shadow_mode` object contains statistics about the request rejection criteria of all shards in the node. -total_rejections_breakup_shadow_mode.
    node_limits | Integer | The total number of rejections due to the node memory limit. When all shards reach the memory limit assigned to the node (for example, 10% of heap size), the shard is unable to take in more traffic on the node, and the indexing request is rejected. -total_rejections_breakup_shadow_mode.
    no_successful_request_limits | Integer | The total number of rejections when the node occupancy level is breaching its soft limit and the shard has multiple outstanding requests that are waiting to be executed. In this case, additional indexing requests are rejected until the system recovers. -total_rejections_breakup_shadow_mode.
    throughput_degradation_limits | Integer | The total number of rejections when the node occupancy level is breaching its soft limit and there is a constant deterioration in the request turnaround at the shard level. In this case, additional indexing requests are rejected until the system recovers. +total_rejections_breakup_shadow_mode.node_limits | Integer | The total number of rejections due to the node memory limit. When all shards reach the memory limit assigned to the node (for example, 10% of heap size), the shard is unable to take in more traffic on the node, and the indexing request is rejected. +total_rejections_breakup_shadow_mode.no_successful_request_limits | Integer | The total number of rejections when the node occupancy level is breaching its soft limit and the shard has multiple outstanding requests that are waiting to be executed. In this case, additional indexing requests are rejected until the system recovers. +total_rejections_breakup_shadow_mode.throughput_degradation_limits | Integer | The total number of rejections when the node occupancy level is breaching its soft limit and there is a constant deterioration in the request turnaround at the shard level. In this case, additional indexing requests are rejected until the system recovers. enabled | Boolean | Specifies whether the shard indexing pressure feature is turned on for the node. enforced | Boolean | If true, the shard indexing pressure runs in enforced mode (there are rejections). If false, the shard indexing pressure runs in shadow mode (there are no rejections, but statistics are recorded and can be retrieved in the `total_rejections_breakup_shadow_mode` object). Only applicable if shard indexing pressure is enabled. 
diff --git a/_api-reference/nodes-apis/nodes-usage.md b/_api-reference/nodes-apis/nodes-usage.md index e8385ec7e8..ef65b914f5 100644 --- a/_api-reference/nodes-apis/nodes-usage.md +++ b/_api-reference/nodes-apis/nodes-usage.md @@ -34,7 +34,7 @@ You can include the following optional query parameters in your request. Parameter | Type | Description :--- | :---| :--- timeout | Time | Sets the time limit for a response from the node. Default is `30s`. -master_timeout | Time | Sets the time limit for a response from the master node. Default is `30s`. +cluster_manager_timeout | Time | Sets the time limit for a response from the cluster manager. Default is `30s`. #### Example request diff --git a/_opensearch/popular-api.md b/_api-reference/popular-api.md similarity index 98% rename from _opensearch/popular-api.md rename to _api-reference/popular-api.md index 00b4e3ebf2..181a64a277 100644 --- a/_opensearch/popular-api.md +++ b/_api-reference/popular-api.md @@ -2,6 +2,8 @@ layout: default title: Popular APIs nav_order: 96 +redirect_from: + - /opensearch/popular-api/ --- # Popular APIs diff --git a/_api-reference/rank-eval.md b/_api-reference/rank-eval.md index f6493b378b..db12693b8e 100644 --- a/_api-reference/rank-eval.md +++ b/_api-reference/rank-eval.md @@ -19,7 +19,7 @@ POST /_rank_eval Query parameters are optional. -Parameter | Data Type | Description +Parameter | Data type | Description :--- | :--- | :--- ignore_unavailable | Boolean | Defaults to `false`. When set to `false` the response body will return an error if an index is closed or missing. allow_no_indices | Boolean | Defaults to `true`. When set to `false` the response body will return an error if a wildcard expression points to indexes that are closed or missing. 
diff --git a/_api-reference/reload-search-analyzer.md b/_api-reference/reload-search-analyzer.md index 74407561b7..a07267d619 100644 --- a/_api-reference/reload-search-analyzer.md +++ b/_api-reference/reload-search-analyzer.md @@ -19,7 +19,7 @@ GET //_reload_search_analyzers Request body parameters are optional. -Field Type | Data Type | Description +Field Type | Data type | Description :--- | :--- | :--- allow_no_indices | Boolean | When set to `false`, an error is returned for indexes that are closed or missing and match any wildcard expression. Default is set to `true`. expand_wildcards | String | Allows you to set the wildcards that can be matched to a type of index. Available options are `open`, `closed`, `all`, `none`, and `hidden`. Default is set to `open`. diff --git a/_api-reference/script-apis/create-stored-script.md b/_api-reference/script-apis/create-stored-script.md index a5a45b6287..e782be23b9 100644 --- a/_api-reference/script-apis/create-stored-script.md +++ b/_api-reference/script-apis/create-stored-script.md @@ -13,12 +13,12 @@ For additional information about Painless scripting, see: * [k-NN Painless Scripting extensions]({{site.url}}{{site.baseurl}}/search-plugins/knn/painless-functions/). -* [k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/inswz/). +* [k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/). ### Path parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | script-id | String | Stored script or search template ID. Must be unique across the cluster. Required. | @@ -26,21 +26,21 @@ For additional information about Painless scripting, see: All parameters are optional. -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | context | String | Context in which the script or search template is to run. To prevent errors, the API immediately compiles the script or template in this context. 
| -| master_timeout | Time | Amount of time to wait for a connection to the master. Defaults to 30 seconds. | +| cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager. Defaults to 30 seconds. | | timeout | Time | The period of time to wait for a response. If a response is not received before the timeout value, the request fails and returns an error. Defaults to 30 seconds.| ### Request fields -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | script | Object | Defines the script or search template, its parameters, and its language. See *Script object* below. | *Script object* -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | lang | String | Scripting language. Required. | | source | String or Object | Required.

For scripts, a string with the contents of the script.

For search templates, an object that defines the search template. Supports the same parameters as the [Search]({{site.url}}{{site.baseurl}}/api-reference/search) API request body. Search templates also support Mustache variables. | @@ -110,6 +110,6 @@ To determine whether the script was successfully created, use the [Get stored sc ### Response fields -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | acknowledged | Boolean | whether the request was received. | \ No newline at end of file diff --git a/_api-reference/script-apis/delete-script.md b/_api-reference/script-apis/delete-script.md index 633162b478..a9cb3ca9c0 100644 --- a/_api-reference/script-apis/delete-script.md +++ b/_api-reference/script-apis/delete-script.md @@ -13,15 +13,15 @@ Deletes a stored script Path parameters are optional. -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | script-id | String | ID of script to delete. | ### Query parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- -| master_timeout | Time | Amount of time to wait for a connection to the master node. Optional, defaults to `30s`. | +| cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager. Optional, defaults to `30s`. | | timeout | Time | The period of time to wait for a response. If a response is not received before the timeout value, the request will be dropped. #### Example request @@ -49,6 +49,6 @@ To determine whether the stored script was successfully deleted, use the [Get st The request returns the following response fields: -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | acknowledged | Boolean | Whether the delete script request was received. 
| \ No newline at end of file diff --git a/_api-reference/script-apis/exec-script.md b/_api-reference/script-apis/exec-script.md index 70080411af..18e18478da 100644 --- a/_api-reference/script-apis/exec-script.md +++ b/_api-reference/script-apis/exec-script.md @@ -2,7 +2,6 @@ layout: default title: Execute Painless script parent: Script APIs -grand_parent: REST API reference nav_order: 7 --- diff --git a/_api-reference/script-apis/exec-stored-script.md b/_api-reference/script-apis/exec-stored-script.md index 2433eb3dae..8fcaf8a350 100644 --- a/_api-reference/script-apis/exec-stored-script.md +++ b/_api-reference/script-apis/exec-stored-script.md @@ -13,7 +13,7 @@ OpenSearch provides several ways to run a script; the following sections show ho ### Request fields -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | query | Object | A filter that specifies documents to process. | | script_fields | Object | Fields to include in output. | @@ -104,7 +104,7 @@ The `GET books/_search` request returns the following fields: ### Response fields -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | took | Integer | How long the operation took in milliseconds. | | timed_out | Boolean | Whether the operation timed out. | @@ -113,7 +113,7 @@ The `GET books/_search` request returns the following fields: #### Hits object -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | total | Object | Total number of documents processed and their relationship to the `match` request field. | | max_score | Double | Highest relevance score returned from all the hits. | @@ -121,7 +121,7 @@ The `GET books/_search` request returns the following fields: #### Document object -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | _index | String | Index that contains the document. | | _id | String | Document ID. 
| diff --git a/_api-reference/script-apis/get-script-contexts.md b/_api-reference/script-apis/get-script-contexts.md index cec9be829c..f9ca8052c8 100644 --- a/_api-reference/script-apis/get-script-contexts.md +++ b/_api-reference/script-apis/get-script-contexts.md @@ -549,20 +549,20 @@ The `GET _script_context` request returns the following fields: The `GET _script_context` request returns the following response fields: -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | contexts | List | A list of all contexts. See [Script object](#script-context). | #### Script context -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | name | String | The context name. | | methods | List | List of the context's allowable methods. See [Script object](#context-methods). | #### Context methods -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | name | String | Method name. | | name | String | Type that the method returns (`boolean`, `object`, `number`, and so on). | @@ -570,7 +570,7 @@ The `GET _script_context` request returns the following response fields: #### Method parameters -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | type | String | Parameter data type. | | name | String | Parameter name. | \ No newline at end of file diff --git a/_api-reference/script-apis/get-script-language.md b/_api-reference/script-apis/get-script-language.md index 32d8bc1f71..6fb566b751 100644 --- a/_api-reference/script-apis/get-script-language.md +++ b/_api-reference/script-apis/get-script-language.md @@ -91,7 +91,7 @@ The `GET _script_language` request returns the available contexts for each langu The request contains the following response fields. 
-Field | Data Type | Description | +Field | Data type | Description | :--- | :--- | :--- types_allowed | List of strings | The types of scripts that are enabled, determined by the `script.allowed_types` setting. May contain `inline` and/or `stored`. language_contexts | List of objects | A list of objects, each of which maps a supported language to its available contexts. diff --git a/_api-reference/script-apis/get-stored-script.md b/_api-reference/script-apis/get-stored-script.md index ad958205b7..374222623e 100644 --- a/_api-reference/script-apis/get-stored-script.md +++ b/_api-reference/script-apis/get-stored-script.md @@ -11,15 +11,15 @@ Retrieves a stored script. ### Path parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | script | String | Stored script or search template name. Required.| ### Query parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- -| master_timeout | Time | Amount of time to wait for a connection to the master node. Optional, defaults to `30s`. | +| cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager. Optional, defaults to `30s`. | #### Example request @@ -55,7 +55,7 @@ The `GET _scripts/my-first-script` request returns the following fields: The `GET _scripts/my-first-script` request returns the following response fields: -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | _id | String | The script's name. | | found | Boolean | The requested script exists and was retrieved. | @@ -63,7 +63,7 @@ The `GET _scripts/my-first-script` request returns the following response fields #### Script object -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | lang | String | The script's language. | | source | String | The script's body. 
| \ No newline at end of file diff --git a/_api-reference/search.md b/_api-reference/search.md index 5f7bff1a8e..4422afa50f 100644 --- a/_api-reference/search.md +++ b/_api-reference/search.md @@ -45,7 +45,7 @@ allow_partial_search_results | Boolean | Whether to return partial results if th analyzer | String | Analyzer to use in the query string. analyze_wildcard | Boolean | Whether the update operation should include wildcard and prefix queries in the analysis. Default is false. batched_reduce_size | Integer | How many shard results to reduce on a node. Default is 512. -cancel_after_time_interval | Time | The time after which the search request will be canceled. Request-level parameter takes precedence over cancel_after_time_interval [cluster setting]({{site.url}}{{site.baseurl}}/api-reference/cluster-api/cluster-settings). Default is -1. +cancel_after_time_interval | Time | The time after which the search request will be canceled. Request-level parameter takes precedence over cancel_after_time_interval [cluster setting]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings). Default is -1. ccs_minimize_roundtrips | Boolean | Whether to minimize roundtrips between a node and remote clusters. Default is true. default_operator | String | Indicates whether the default operator for a string query should be AND or OR. Default is OR. df | String | The default field in case a field prefix is not provided in the query string. 
diff --git a/_api-reference/snapshots/create-repository.md b/_api-reference/snapshots/create-repository.md index a0672f8caa..b531c7dba3 100644 --- a/_api-reference/snapshots/create-repository.md +++ b/_api-reference/snapshots/create-repository.md @@ -11,8 +11,8 @@ You can register a new repository in which to store snapshots or update informat There are two types of snapshot repositories: -* File system (`fs`): For instructions on creating an `fs` repository, see [Register repository shared file system]({{site.url}}{{site.baseurl}}/availability-and-recovery/snapshots/snapshot-restore/#shared-file-system). -* Amazon Simple Storage Service (Amazon S3) bucket (`s3`): For instructions on creating an `s3` repository, see [Register repository Amazon S3]({{site.url}}{{site.baseurl}}/availability-and-recovery/snapshots/snapshot-restore/#amazon-s3). +* File system (`fs`): For instructions on creating an `fs` repository, see [Register repository shared file system]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore/#shared-file-system). +* Amazon Simple Storage Service (Amazon S3) bucket (`s3`): For instructions on creating an `s3` repository, see [Register repository Amazon S3]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore/#amazon-s3). For instructions on creating a repository, see [Register repository]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#register-repository). diff --git a/_api-reference/snapshots/create-snapshot.md b/_api-reference/snapshots/create-snapshot.md index f70a83da6a..e41dd5d0a3 100644 --- a/_api-reference/snapshots/create-snapshot.md +++ b/_api-reference/snapshots/create-snapshot.md @@ -13,16 +13,23 @@ Creates a snapshot within an existing repository. * To view a list of your repositories, see [Get snapshot repository]({{site.url}}{{site.baseurl}}/api-reference/snapshots/get-snapshot-repository). 
+### Path and HTTP methods + +```json +PUT /_snapshot/<repository>/<snapshot> +POST /_snapshot/<repository>/<snapshot> +``` + ### Path parameters -Parameter | Data Type | Description +Parameter | Data type | Description :--- | :--- | :--- repository | String | Repostory name to contain the snapshot. | snapshot | String | Name of Snapshot to create. | ### Query parameters -Parameter | Data Type | Description +Parameter | Data type | Description :--- | :--- | :--- wait_for_completion | Boolean | Whether to wait for snapshot creation to complete before continuing. If you include this parameter, the snapshot definition is returned after completion. | @@ -30,7 +37,7 @@ wait_for_completion | Boolean | Whether to wait for snapshot creation to comple The request body is optional. -Field | Data Type | Description +Field | Data type | Description :--- | :--- | :--- `indices` | String | The indices you want to include in the snapshot. You can use `,` to create a list of indices, `*` to specify an index pattern, and `-` to exclude certain indices. Don't put spaces between items. Default is all indices. `ignore_unavailable` | Boolean | If an index from the `indices` list doesn't exist, whether to ignore it rather than fail the snapshot. Default is false. @@ -118,7 +125,7 @@ The snapshot definition is returned. #### Response fields -| Field | Data Type | Description | +| Field | Data type | Description | | :--- | :--- | :--- | | snapshot | string | Snapshot name. | | uuid | string | Snapshot's universally unique identifier (UUID). | diff --git a/_api-reference/snapshots/delete-snapshot-repository.md b/_api-reference/snapshots/delete-snapshot-repository.md index 5dfafbdc24..5f4d446244 100644 --- a/_api-reference/snapshots/delete-snapshot-repository.md +++ b/_api-reference/snapshots/delete-snapshot-repository.md @@ -15,7 +15,7 @@ nav_order: 3 ### Path parameters -Parameter | Data Type | Description +Parameter | Data type | Description :--- | :--- | :--- repository | String | Repository to delete.
| diff --git a/_api-reference/snapshots/delete-snapshot.md b/_api-reference/snapshots/delete-snapshot.md index 58214174e5..4dd25c49f1 100644 --- a/_api-reference/snapshots/delete-snapshot.md +++ b/_api-reference/snapshots/delete-snapshot.md @@ -17,7 +17,7 @@ Deletes a snapshot from a repository. ### Path parameters -Parameter | Data Type | Description +Parameter | Data type | Description :--- | :--- | :--- repository | String | Repostory that contains the snapshot. | snapshot | String | Snapshot to delete. | diff --git a/_api-reference/snapshots/get-snapshot-repository.md b/_api-reference/snapshots/get-snapshot-repository.md index 14a7059d2f..2ebe6921f7 100644 --- a/_api-reference/snapshots/get-snapshot-repository.md +++ b/_api-reference/snapshots/get-snapshot-repository.md @@ -16,16 +16,16 @@ You can also get details about a snapshot during and after snapshot creation. Se ### Path parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | | :--- | :--- | :--- | | repository | String | A comma-separated list of snapshot repository names to retrieve. Wildcard (`*`) expressions are supported including combining wildcards with exclude patterns starting with `-`. | ### Query parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | local | Boolean | Whether to get information from the local node. Optional, defaults to `false`.| -| master_timeout | Time | Amount of time to wait for a connection to the master node. Optional, defaults to 30 seconds. | +| cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager. Optional, defaults to 30 seconds. | #### Example request @@ -54,7 +54,7 @@ Upon success, the response returns repositry information.
This sample is for an ### Response fields -| Field | Data Type | Description | +| Field | Data type | Description | | :--- | :--- | :--- | | type | string | Bucket type: `fs` (file system) or `s3` (s3 bucket) | | bucket | string | S3 bucket name. | diff --git a/_api-reference/snapshots/get-snapshot-status.md b/_api-reference/snapshots/get-snapshot-status.md index d2096221f5..0cd7886aec 100644 --- a/_api-reference/snapshots/get-snapshot-status.md +++ b/_api-reference/snapshots/get-snapshot-status.md @@ -18,7 +18,7 @@ If you use the security plugin, you must have the `monitor_snapshot`, `create_sn Path parameters are optional. -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | repository | String | Repository containing the snapshot. | | snapshot | String | Snapshot to return. | @@ -36,7 +36,7 @@ Using the API to return state for other than currently running snapshots can be ### Request fields -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | ignore_unavailable | Boolean | How to handles requests for unavailable snapshots. If `false`, the request returns an error for unavailable snapshots. If `true`, the request ignores unavailable snapshots, such as those that are corrupted or temporarily cannot be returned. Defaults to `false`.| @@ -369,7 +369,7 @@ The `GET _snapshot/my-opensearch-repo/my-first-snapshot/_status` request returns ### Response fields -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | repository | String | Name of repository that contains the snapshot. | | snapshot | String | Snapshot name. | diff --git a/_api-reference/snapshots/get-snapshot.md b/_api-reference/snapshots/get-snapshot.md index 288c6402e2..1b875e0964 100644 --- a/_api-reference/snapshots/get-snapshot.md +++ b/_api-reference/snapshots/get-snapshot.md @@ -11,14 +11,14 @@ Retrieves information about a snapshot. 
### Path parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | | :--- | :--- | :--- | | repository | String | The repository that contains the snapshot to retrieve. | | snapshot | String | Snapshot to retrieve. ### Query parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | verbose | Boolean | Whether to show all, or just basic snapshot information. If `true`, returns all information. If `false`, omits information like start/end times, failures, and shards. Optional, defaults to `true`.| | ignore_unavailable | Boolean | How to handle snapshots that are unavailable (corrupted or otherwise temporarily can't be returned). If `true` and the snapshot is unavailable, the request does not return the snapshot. If `false` and the snapshot is unavailable, the request returns an error. Optional, defaults to `false`.| @@ -73,7 +73,7 @@ Upon success, the response returns snapshot information: ```` ### Response fields -| Field | Data Type | Description | +| Field | Data type | Description | | :--- | :--- | :--- | | snapshot | string | Snapshot name. | | uuid | string | Snapshot's universally unique identifier (UUID). | diff --git a/_api-reference/snapshots/restore-snapshot.md b/_api-reference/snapshots/restore-snapshot.md index c65bf05d99..1322933e73 100644 --- a/_api-reference/snapshots/restore-snapshot.md +++ b/_api-reference/snapshots/restore-snapshot.md @@ -16,17 +16,16 @@ Restores a snapshot of a cluster or specified data streams and indices. If open indices with the same name that you want to restore already exist in the cluster, you must close, delete, or rename the indices. See [Sample Request](#example-request) for information about renaming an index. See [Close index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/close-index) for information about closing an index. 
{: .note} - ### Path parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- repository | String | Repository containing the snapshot to restore. | | snapshot | String | Snapshot to restore. | ### Query parameters -Parameter | Data Type | Description +Parameter | Data type | Description :--- | :--- | :--- wait_for_completion | Boolean | Whether to wait for snapshot restoration to complete before continuing. | @@ -34,7 +33,7 @@ wait_for_completion | Boolean | Whether to wait for snapshot restoration to com All request body parameters are optional. -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | ignore_unavailable | Boolean | How to handle data streams or indices that are missing or closed. If `false`, the request returns an error for any data stream or index that is missing or closed. If `true`, the request ignores data streams and indices in indices that are missing or closed. Defaults to `false`. | | ignore_index_settings | Boolean | A comma-delimited list of index settings that you don't want to restore from a snapshot. | @@ -91,7 +90,7 @@ Except for the snapshot name, all properties are empty or `0`. This is because a ### Response fields -| Field | Data Type | Description | +| Field | Data type | Description | | :--- | :--- | :--- | | snapshot | string | Snapshot name. | | indices | array | Indices in the snapshot. | diff --git a/_api-reference/snapshots/verify-snapshot-repository.md b/_api-reference/snapshots/verify-snapshot-repository.md index e18d0e6726..a84f4325a6 100644 --- a/_api-reference/snapshots/verify-snapshot-repository.md +++ b/_api-reference/snapshots/verify-snapshot-repository.md @@ -19,15 +19,15 @@ If you use the security plugin, you must have the `manage cluster` privilege. Path parameters are optional. 
-| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- | repository | String | Name of repository to verify. | ### Query parameters -| Parameter | Data Type | Description | +| Parameter | Data type | Description | :--- | :--- | :--- -| master_timeout | Time | Amount of time to wait for a connection to the master node. Optional, defaults to `30s`. | +| cluster_manager_timeout | Time | Amount of time to wait for a connection to the cluster manager. Optional, defaults to `30s`. | | timeout | Time | The period of time to wait for a response. If a response is not received before the timeout value, the request fails and returns an error. Defaults to `30s`. | #### Example request @@ -35,14 +35,14 @@ Path parameters are optional. The following request verifies that the my-opensearch-repo is functional: ````json -POST /_snapshot/my-opensearch-repo/_verify?timeout=0s&master_timeout=50s +POST /_snapshot/my-opensearch-repo/_verify?timeout=0s&cluster_manager_timeout=50s ```` #### Example response The example that follows corresponds to the request above in the [Example request](#example-request) section. -The `POST /_snapshot/my-opensearch-repo/_verify?timeout=0s&master_timeout=50s` request returns the following fields: +The `POST /_snapshot/my-opensearch-repo/_verify?timeout=0s&cluster_manager_timeout=50s` request returns the following fields: ````json { @@ -70,6 +70,6 @@ In the preceding sample, one node is connected to the snapshot repository. If mo ### Response fields -| Field | Data Type | Description | +| Field | Data type | Description | :--- | :--- | :--- | nodes | Object | A list (not an array) of nodes connected to the snapshot repository. Each node itself is a property where the node ID is the key and the name has an ID (Object) and a name (String).
| \ No newline at end of file diff --git a/_api-reference/tasks.md b/_api-reference/tasks.md index 260799d89c..54dbe62a22 100644 --- a/_api-reference/tasks.md +++ b/_api-reference/tasks.md @@ -51,6 +51,7 @@ Note that if a task finishes running, it won't be returned as part of your reque "running_time_in_nanos": 994000, "cancellable": false, "headers": {} + } }, "Mgqdm0r9SEGClWxp_RbnaQ:17413": { "node": "Mgqdm0r9SEGClWxp_RbnaQ", @@ -145,6 +146,61 @@ GET /_tasks?nodes=opensearch-node1 } ``` +The following request will return detailed information about active search tasks: + +**Sample Request** + +```bash +curl -XGET "localhost:9200/_tasks?actions=*search&detailed" +``` +{% include copy.html %} + +**Sample Response** + +```json +{ + "nodes" : { + "CRqNwnEeRXOjeTSYYktw-A" : { + "name" : "runTask-0", + "transport_address" : "127.0.0.1:9300", + "host" : "127.0.0.1", + "ip" : "127.0.0.1:9300", + "roles" : [ + "cluster_manager", + "data", + "ingest", + "remote_cluster_client" + ], + "attributes" : { + "testattr" : "test", + "shard_indexing_pressure_enabled" : "true" + }, + "tasks" : { + "CRqNwnEeRXOjeTSYYktw-A:677" : { + "node" : "CRqNwnEeRXOjeTSYYktw-A", + "id" : 677, + "type" : "transport", + "action" : "indices:data/read/search", + "description" : "indices[], search_type[QUERY_THEN_FETCH], source[{\"query\":{\"query_string\":}}]", + "start_time_in_millis" : 1660106254525, + "running_time_in_nanos" : 1354236, + "cancellable" : true, + "cancelled" : false, + "headers" : { }, + "resource_stats" : { + "total" : { + "cpu_time_in_nanos" : 0, + "memory_in_bytes" : 0 + } + } + } + } + } + } +} + +``` + ## Task canceling After getting a list of tasks, you can cancel all cancelable tasks with the following request: @@ -233,7 +289,6 @@ content-length: 768 } } ``` - This operation supports the same parameters as the `tasks` operation.
The following example shows how you can associate `X-Opaque-Id` with specific tasks: ```bash diff --git a/_opensearch/units.md b/_api-reference/units.md similarity index 97% rename from _opensearch/units.md rename to _api-reference/units.md index 0e098cfe39..1b9a3d07c9 100644 --- a/_opensearch/units.md +++ b/_api-reference/units.md @@ -2,6 +2,8 @@ layout: default title: Supported units nav_order: 90 +redirect_from: + - /opensearch/units/ --- # Supported units diff --git a/_clients/data-prepper/data-prepper-reference.md b/_clients/data-prepper/data-prepper-reference.md deleted file mode 100644 index f1472083fe..0000000000 --- a/_clients/data-prepper/data-prepper-reference.md +++ /dev/null @@ -1,439 +0,0 @@ ---- -layout: default -title: Configuration reference -parent: Data Prepper -nav_order: 3 ---- - -# Data Prepper configuration reference - -This page lists all supported Data Prepper server, sources, buffers, processors, and sinks, along with their associated options. For example configuration files, see [Data Prepper]({{site.url}}{{site.baseurl}}/clients/data-prepper/pipelines/). - -## Data Prepper server options - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -ssl | No | Boolean | Indicates whether TLS should be used for server APIs. Defaults to true. -keyStoreFilePath | No | String | Path to a .jks or .p12 keystore file. Required if `ssl` is true. -keyStorePassword | No | String | Password for keystore. Optional, defaults to empty string. -privateKeyPassword | No | String | Password for private key within keystore. Optional, defaults to empty string. -serverPort | No | Integer | Port number to use for server APIs. Defaults to 4900 -metricRegistries | No | List | Metrics registries for publishing the generated metrics. Currently supports Prometheus and CloudWatch. Defaults to Prometheus. 
- -## General pipeline options - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -workers | No | Integer | Essentially the number of application threads. As a starting point for your use case, try setting this value to the number of CPU cores on the machine. Default is 1. -delay | No | Integer | Amount of time in milliseconds workers wait between buffer read attempts. Default is 3,000. - - -## Sources - -Sources define where your data comes from. - - -### otel_trace_source - -Source for the OpenTelemetry Collector. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -port | No | Integer | The port OTel trace source is running on. Default is `21890`. -request_timeout | No | Integer | The request timeout in milliseconds. Default is `10_000`. -health_check_service | No | Boolean | Enables a gRPC health check service under `grpc.health.v1/Health/Check`. Default is `false`. -proto_reflection_service | No | Boolean | Enables a reflection service for Protobuf services (see [gRPC reflection](https://github.com/grpc/grpc/blob/master/doc/server-reflection.md) and [gRPC Server Reflection Tutorial](https://github.com/grpc/grpc-java/blob/master/documentation/server-reflection-tutorial.md) docs). Default is `false`. -unframed_requests | No | Boolean | Enable requests not framed using the gRPC wire protocol. -thread_count | No | Integer | The number of threads to keep in the ScheduledThreadPool. Default is `200`. -max_connection_count | No | Integer | The maximum allowed number of open connections. Default is `500`. -ssl | No | Boolean | Enables connections to the OTel source port over TLS/SSL. Defaults to `true`. -sslKeyCertChainFile | Conditionally | String | File-system path or AWS S3 path to the security certificate (e.g. `"config/demo-data-prepper.crt"` or `"s3://my-secrets-bucket/demo-data-prepper.crt"`). Required if `ssl` is set to `true`. -sslKeyFile | Conditionally | String | File-system path or AWS S3 path to the security key (e.g. 
`"config/demo-data-prepper.key"` or `"s3://my-secrets-bucket/demo-data-prepper.key"`). Required if `ssl` is set to `true`. -useAcmCertForSSL | No | Boolean | Whether to enable TLS/SSL using certificate and private key from AWS Certificate Manager (ACM). Default is `false`. -acmCertificateArn | Conditionally | String | Represents the ACM certificate ARN. ACM certificate take preference over S3 or local file system certificate. Required if `useAcmCertForSSL` is set to `true`. -awsRegion | Conditionally | String | Represents the AWS region to use ACM or S3. Required if `useAcmCertForSSL` is set to `true` or `sslKeyCertChainFile` and `sslKeyFile` are AWS S3 paths. -authentication | No | Object | An authentication configuration. By default, an unauthenticated server is created for the pipeline. This parameter uses pluggable authentication for HTTPS. To use basic authentication, define the `http_basic` plugin with a `username` and `password`. To provide customer authentication, use or create a plugin that implements [GrpcAuthenticationProvider](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/armeria-common/src/main/java/com/amazon/dataprepper/armeria/authentication/GrpcAuthenticationProvider.java). -record_type | No | String | A string represents the supported record data type that is written into the buffer plugin. Value options are `otlp` or `event`. Default is `otlp`. -`otlp` | No | String | Otel-trace-source writes each incoming `ExportTraceServiceRequest` request as record data type into the buffer. -`event` | No | String | Otel-trace-source decodes each incoming `ExportTraceServiceRequest` request into a collection of Data Prepper internal spans serving as buffer items. To achieve better performance in this mode, we recommend setting buffer capacity proportional to the estimated number of spans in the incoming request payload. - -### http_source - -This is a source plugin that supports HTTP protocol. 
Currently ONLY support Json UTF-8 codec for incoming request, e.g. `[{"key1": "value1"}, {"key2": "value2"}]`. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -port | No | Integer | The port the source is running on. Default is `2021`. Valid options are between `0` and `65535`. -request_timeout | No | Integer | The request timeout in millis. Default is `10_000`. -thread_count | No | Integer | The number of threads to keep in the ScheduledThreadPool. Default is `200`. -max_connection_count | No | Integer | The maximum allowed number of open connections. Default is `500`. -max_pending_requests | No | Integer | The maximum number of allowed tasks in ScheduledThreadPool work queue. Default is `1024`. -authentication | No | Object | An authentication configuration. By default, this creates an unauthenticated server for the pipeline. This uses pluggable authentication for HTTPS. To use basic authentication define the `http_basic` plugin with a `username` and `password`. To provide customer authentication, use or create a plugin that implements [ArmeriaHttpAuthenticationProvider](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/armeria-common/src/main/java/com/amazon/dataprepper/armeria/authentication/ArmeriaHttpAuthenticationProvider.java). - -### otel_metrics_source - -Source for the OpenTelemetry Collector for collecting metric data. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -port | No | Integer | The port OTel metrics source is running on. Default is `21891`. -request_timeout | No | Integer | The request timeout in milliseconds. Default is `10_000`. -health_check_service | No | Boolean | Enables a gRPC health check service under `grpc.health.v1/Health/Check`. Default is `false`. 
-proto_reflection_service | No | Boolean | Enables a reflection service for Protobuf services (see [gRPC reflection](https://github.com/grpc/grpc/blob/master/doc/server-reflection.md) and [gRPC Server Reflection Tutorial](https://github.com/grpc/grpc-java/blob/master/documentation/server-reflection-tutorial.md) docs). Default is `false`. -unframed_requests | No | Boolean | Enable requests not framed using the gRPC wire protocol. -thread_count | No | Integer | The number of threads to keep in the ScheduledThreadPool. Default is `200`. -max_connection_count | No | Integer | The maximum allowed number of open connections. Default is `500`. -ssl | No | Boolean | Enables connections to the OTel source port over TLS/SSL. Defaults to `true`. -sslKeyCertChainFile | Conditionally | String | File-system path or AWS S3 path to the security certificate (e.g. `"config/demo-data-prepper.crt"` or `"s3://my-secrets-bucket/demo-data-prepper.crt"`). Required if `ssl` is set to `true`. -sslKeyFile | Conditionally | String | File-system path or AWS S3 path to the security key (e.g. `"config/demo-data-prepper.key"` or `"s3://my-secrets-bucket/demo-data-prepper.key"`). Required if `ssl` is set to `true`. -useAcmCertForSSL | No | Boolean | Whether to enable TLS/SSL using certificate and private key from AWS Certificate Manager (ACM). Default is `false`. -acmCertificateArn | Conditionally | String | Represents the ACM certificate ARN. ACM certificate take preference over S3 or local file system certificates. Required if `useAcmCertForSSL` is set to `true`. -awsRegion | Conditionally | String | Represents the AWS Region to use ACM or S3. Required if `useAcmCertForSSL` is set to `true` or `sslKeyCertChainFile` and `sslKeyFile` are AWS S3 paths. -authentication | No | Object | An authentication configuration. By default, an unauthenticated server is created for the pipeline. This uses pluggable authentication for HTTPS. 
To use basic authentication, define the `http_basic` plugin with a `username` and `password`. To provide customer authentication, use or create a plugin that implements [GrpcAuthenticationProvider](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/armeria-common/src/main/java/com/amazon/dataprepper/armeria/authentication/GrpcAuthenticationProvider.java). - - -### s3 - -This is a source plugin that reads events from [Amazon Simple Storage Service](https://aws.amazon.com/s3/) (Amazon S3) objects. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -notification_type | Yes | String | Must be `sqs` -compression | No | String | The compression algorithm to apply: `none`, `gzip`, or `automatic`. Default is `none`. -codec | Yes | Codec | The codec to apply. Must be either `newline` or `json`. -sqs | Yes | sqs | The [Amazon Simple Queue Service](https://aws.amazon.com/sqs/) (Amazon SQS) configuration. See [sqs](#s3-source-sqs) for details. -aws | Yes | aws | The AWS configuration. See [aws](#s3-source-aws) for details. -on_error | No | String | Determines how to handle errors in Amazon SQS. Can be either `retain_messages` or `delete_messages`. If `retain_messages`, then Data Prepper will leave the message in the SQS queue and try again. This is recommended for dead-letter queues. If `delete_messages`, then Data Prepper will delete failed messages. Default is `retain_messages`. -buffer_timeout | No | Duration | The timeout for writing events to the Data Prepper buffer. Any events that the S3 Source cannot write to the buffer in this time will be discarded. Default is 10 seconds. -records_to_accumulate | No | Integer | The number of messages that accumulate before writing to the buffer. Default is 100. -disable_bucket_ownership_validation | No | Boolean | If `true`, then the S3 Source will not attempt to validate that the bucket is owned by the expected account. 
The only expected account is the same account that owns the SQS queue. Defaults to `false`. - -#### sqs - -The following are configure usage of Amazon SQS in the S3 Source plugin. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -queue_url | Yes | String | The URL of the Amazon SQS queue from which messages are received. -maximum_messages | No | Integer | The maximum number of messages to receive from the SQS queue in any single request. Default is `10`. -visibility_timeout | No | Duration | The visibility timeout to apply to messages read from the SQS queue. This should be set to the amount of time that Data Prepper may take to read all the S3 objects in a batch. Default is `30s`. -wait_time | No | Duration | The time to wait for long polling on the SQS API. Default is `20s`. -poll_delay | No | Duration | A delay to place between reading and processing a batch of SQS messages and making a subsequent request. Default is `0s`. - - -#### aws - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -region | No | String | The AWS Region to use for credentials. Defaults to [standard SDK behavior to determine the Region](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/region-selection.html). -sts_role_arn | No | String | The AWS Security Token Service (AWS STS) role to assume for requests to Amazon SQS and Amazon S3. Defaults to null, which will use the [standard SDK behavior for credentials](https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/credentials.html). - - -### file - -Source for flat file input. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -path | Yes | String | Path to the input file (e.g. `logs/my-log.log`). -format | No | String | Format of each line in the file. Valid options are `json` or `plain`. Default is `plain`. -record_type | No | String | The record type to store. Valid options are `string` or `event`. Default is `string`. 
If you would like to use the file source for log analytics use cases like grok, set this option to `event`. - -### pipeline - -Source for reading from another pipeline. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -name | Yes | String | Name of the pipeline to read from. - - -### stdin - -Source for console input. Can be useful for testing. No options. - - -## Buffers - -Buffers store data as it passes through the pipeline. If you implement a custom buffer, it can be memory-based (better performance) or disk-based (larger). - - -### bounded_blocking - -The default buffer. Memory-based. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -buffer_size | No | Integer | The maximum number of records the buffer accepts. Default is 512. -batch_size | No | Integer | The maximum number of records the buffer drains after each read. Default is 8. - - -## Processors - -Processors perform some action on your data: filter, transform, enrich, etc. - -Prior to Data Prepper 1.3, Processors were named Preppers. Starting in Data Prepper 1.3, the term Prepper is deprecated in favor of Processor. Data Prepper will continue to support the term "Prepper" until 2.0, where it will be removed. -{: .note } - - -### otel_trace_raw_prepper - -Converts OpenTelemetry data to OpenSearch-compatible JSON documents and fills in trace group related fields in those JSON documents. It requires `record_type` to be set as `otlp` in `otel_trace_source`. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -trace_flush_interval | No | Integer | Represents the time interval in seconds to flush all the descendant spans without any root span. Default is 180. - -### otel_trace_raw - -This processor is a Data Prepper event record type compatible version of `otel_trace_raw_prepper` that fills in trace group related fields into all incoming Data Prepper span records. It requires `record_type` to be set as `event` in `otel_trace_source`. 
- -Option | Required | Type | Description -:--- | :--- | :--- | :--- -trace_flush_interval | No | Integer | Represents the time interval in seconds to flush all the descendant spans without any root span. Default is 180. - -### service_map_stateful - -Uses OpenTelemetry data to create a distributed service map for visualization in OpenSearch Dashboards. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -window_duration | No | Integer | Represents the fixed time window in seconds to evaluate service-map relationships. Default is 180. - -### peer_forwarder - -Forwards ExportTraceServiceRequests via gRPC to other Data Prepper instances. Required for operating Data Prepper in a clustered deployment. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -time_out | No | Integer | Forwarded request timeout in seconds. Defaults to 3 seconds. -span_agg_count | No | Integer | Batch size for number of spans per request. Defaults to 48. -target_port | No | Integer | The destination port to forward requests to. Defaults to `21890`. -discovery_mode | No | String | Peer discovery mode to be used. Allowable values are `static`, `dns`, and `aws_cloud_map`. Defaults to `static`. -static_endpoints | No | List | List containing string endpoints of all Data Prepper instances. -domain_name | No | String | Single domain name to query DNS against. Typically used by creating multiple DNS A Records for the same domain. -ssl | No | Boolean | Indicates whether to use TLS. Default is true. -awsCloudMapNamespaceName | Conditionally | String | Name of your CloudMap Namespace. Required if `discovery_mode` is set to `aws_cloud_map`. -awsCloudMapServiceName | Conditionally | String | Service name within your CloudMap Namespace. Required if `discovery_mode` is set to `aws_cloud_map`. -sslKeyCertChainFile | Conditionally | String | Represents the SSL certificate chain file path or AWS S3 path. S3 path example `s3:///`. Required if `ssl` is set to `true`. 
-useAcmCertForSSL | No | Boolean | Enables TLS/SSL using certificate and private key from AWS Certificate Manager (ACM). Default is `false`. -awsRegion | Conditionally | String | Represents the AWS Region to use ACM, S3, or CloudMap. Required if `useAcmCertForSSL` is set to `true` or `sslKeyCertChainFile` and `sslKeyFile` are AWS S3 paths. -acmCertificateArn | Conditionally | String | Represents the ACM certificate ARN. ACM certificate take preference over S3 or local file system certificate. Required if `useAcmCertForSSL` is set to `true`. - -### string_converter - -Converts string to uppercase or lowercase. Mostly useful as an example if you want to develop your own processor. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -upper_case | No | Boolean | Whether to convert to uppercase (`true`) or lowercase (`false`). - -### aggregate - -Groups events together based on the keys provided and performs a action on each group. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -identification_keys | Yes | List | A unordered list by which to group Events. Events with the same values for these keys are put into the same group. If an Event does not contain one of the `identification_keys`, then the value of that key is considered to be equal to `null`. At least one identification_key is required. (e.g. `["sourceIp", "destinationIp", "port"]`). -action | Yes | AggregateAction | The action to be performed for each group. One of the available Aggregate Actions must be provided or you can create custom aggregate actions. `remove_duplicates` and `put_all` are available actions. For more information, see [creating custom aggregate actions](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/aggregate-processor#creating-new-aggregate-actions). -group_duration | No | String | The amount of time that a group should exist before it is concluded automatically. 
Supports ISO_8601 notation strings ("PT20.345S", "PT15M", etc.) as well as simple notation for seconds (`"60s"`) and milliseconds (`"1500ms"`). Default value is `180s`. - -### date - -Adds a default timestamp to the event or parses timestamp fields, and converts it to ISO 8601 format, which can be used as event timestamp. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -match | Conditionally | List | List of `key` and `patterns` where patterns is a list. The list of match can have exactly one `key` and `patterns`. There is no default value. This option cannot be defined at the same time as `from_time_received`. Include multiple date processors in your pipeline if both options should be used. -from_time_received | Conditionally | Boolean | A boolean that is used for adding default timestamp to event data from event metadata which is the time when source receives the event. Default value is `false`. This option cannot be defined at the same time as `match`. Include multiple date processors in your pipeline if both options should be used. -destination | No | String | Field to store the timestamp parsed by date processor. It can be used with both `match` and `from_time_received`. Default value is `@timestamp`. -source_timezone | No | String | Time zone used to parse dates. It is used in case zone or offset cannot be extracted from the value. If zone or offset are part of the value, then timezone is ignored. Find all the available timezones [the list of database time zones](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List) in the "TZ database name" column. -destination_timezone | No | String | Timezone used for storing timestamp in `destination` field. The available timezone values are the same as `source_timestamp`. -locale | No | String | Locale is used for parsing dates. It's commonly used for parsing month names(`MMM`). 
It can have language, country and variant fields using IETF BCP 47 or String representation of [Locale](https://docs.oracle.com/javase/8/docs/api/java/util/Locale.html) object. For example `en-US` for IETF BCP 47 and `en_US` for string representation of Locale. Full list of locale fields which includes language, country and variant can be found [the language subtag registry](https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry). Default value is `Locale.ROOT`. - -### drop_events - -Drops all the events that are passed into this processor. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -drop_when | Yes | String | Accepts a Data Prepper Expression string following the [Data Prepper Expression Syntax](https://github.com/opensearch-project/data-prepper/blob/main/docs/expression_syntax.md). Configuring `drop_events` with `drop_when: true` drops all the events received. -handle_failed_events | No | Enum | Specifies how exceptions are handled when an exception occurs while evaluating an event. Default value is `drop`, which drops the event so it doesn't get sent to OpenSearch. Available options are `drop`, `drop_silently`, `skip`, `skip_silently`. For more information, see [handle_failed_events](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/drop-events-processor#handle_failed_events). - -### grok_prepper - -Takes unstructured data and utilizes pattern matching to structure and extract important keys and make data more structured and queryable. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -match | No | Map | Specifies which keys to match specific patterns against. Default is an empty body. -keep_empty_captures | No | Boolean | Enables preserving `null` captures. Default value is `false`. -named_captures_only | No | Boolean | enables whether to keep only named captures. Default value is `true`. 
-break_on_match | No | Boolean | Specifies whether to match all patterns or stop once the first successful match is found. Default is `true`. -keys_to_overwrite | No | List | Specifies which existing keys are to be overwritten if there is a capture with the same key value. Default is `[]`. -pattern_definitions | No | Map | Allows for custom pattern use inline. Default value is an empty body. -patterns_directories | No | List | Specifies the path of directories that contain customer pattern files. Default value is an empty list. -pattern_files_glob | No | String | Specifies which pattern files to use from the directories specified for `pattern_directories`. Default is `*`. -target_key | No | String | Specifies a parent level key to store all captures. Default value is `null`. -timeout_millis | No | Integer | Maximum amount of time that should take place for the matching. Setting to `0` disables the timeout. Default value is `30,000`. - -### key_value - -Takes in a field and parses it into key/value pairs. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -source | No | String | The key in the event that is parsed. Default value is `message`. -destination | No | String | The key where to output the parsed source to. Doing so overwrites the value of the key if it exists. Default value is `parsed_message` -field_delimiter_regex | Conditionally | String | A regex specifying the delimiter between key/value pairs. Special regex characters such as `[` and `]` must be escaped using `\\`. This option cannot be defined at the same time as `field_split_characters`. -field_split_characters | Conditionally | String | A string of characters to split between key/value pairs. Special regex characters such as `[` and `]` must be escaped using `\\`. Default value is `&`. This option cannot be defined at the same time as `field_delimiter_regex`. -key_value_delimiter_regex| Conditionally | String | A regex specifying the delimiter between a key and a value. 
Special regex characters such as `[` and `]` must be escaped using `\\`. There is no default value. This option cannot be defined at the same time as `value_split_characters`. -value_split_characters | Conditionally | String | A string of characters to split between keys and values. Special regex characters such as `[` and `]` must be escaped using `\\`. Default value is `=`. This option cannot be defined at the same time as `key_value_delimiter_regex`. -non_match_value | No | String | When a key/value cannot be successfully split, the key/value is be placed in the key field and the specified value in the value field. Default value is `null`. -prefix | No | String | A prefix given to all keys. Default value is empty string. -delete_key_regex | No | String | A regex used to delete characters from the key. Special regex characters such as `[` and `]` must be escaped using `\\`. There is no default value. -delete_value_regex | No | String | A regex used to delete characters from the value. Special regex characters such as `[` and `]` must be escaped using `\\`. There is no default value. - -### add_entries - -Adds an entry to event. `add_entries` is part of [mutate event](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-event-processors#mutate-event-processors) processors. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -entries | Yes | List | List of events to be added. Valid entries are `key`, `value`, and `overwrite_if_key_exists`. -key | N/A | N/A | Key of the new event to be added. -value | N/A | N/A | Value of the new entry to be added. Valid data types are strings, booleans, numbers, null, nested objects, and arrays containing the aforementioned data types. -overwrite_if_key_exists | No | Boolean | If true, the existing value gets overwritten if the key already exists within the event. Default is `false`. - -### copy_values - -Copy values within an event. 
`copy_values` is part of [mutate event](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-event-processors#mutate-event-processors) processors. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -entries | Yes | List | List of entries to be copied. Valid values are `from_key`, `to_key`, and `overwrite_if_key_exists`. -from_key | N/A | N/A | The key of the entry to be copied. -to_key | N/A | N/A | The key of the new entry to be added. -overwrite_if_to_key_exists | No | Boolean | If true, the existing value gets overwritten if the key already exists within the event. Default is `false`. - - -### delete_entries - -Delete entries in an event. `delete_entries` is part of [mutate event](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-event-processors#mutate-event-processors) processors. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -with_keys | Yes | List | An array of keys of the entries to be deleted. - -### rename_keys - -Rename keys in an event. `rename_keys` is part of [mutate event](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-event-processors#mutate-event-processors) processors. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -entries | Yes | List | List of entries. Valid values are `from_key`, `to_key`, and `overwrite_if_key_exists`. Renaming occurs in the order defined. -from_key | N/A | N/A | The key of the entry to be renamed. -to_key | N/A | N/A | The new key of the entry. -overwrite_if_to_key_exists | No | Boolean | If true, the existing value gets overwritten if `to_key` already exists in the event. - -### substitute_string - -Matches a key's value against a regular expression and replaces all matches with a replacement string. 
`substitute_string` is part of [mutate string](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-string-processors#mutate-string-processors) processors. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -entries | Yes | List | List of entries. Valid values are `source`, `from`, and `to`. -source | N/A | N/A | The key to modify. -from | N/A | N/A | The Regex String to be replaced. Special regex characters such as `[` and `]` must be escaped using `\\` when using double quotes and `\ ` when using single quotes. See [Java Patterns](https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/util/regex/Pattern.html) for more information. -to | N/A | N/A | The String to be substituted for each match of `from`. - -### split_string - -Splits a field into an array using a delimiter character. `split_string` is part of [mutate string](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-string-processors#mutate-string-processors) processors. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -entries | Yes | List | List of entries. Valid values are `source`, `delimiter`, and `delimiter_regex`. -source | N/A | N/A | The key to split. -delimiter | No | N/A | The separator character responsible for the split. Cannot be defined at the same time as `delimiter_regex`. At least `delimiter` or `delimiter_regex` must be defined. -delimiter_regex | No | N/A | The regex string responsible for the split. Cannot be defined at the same time as `delimiter`. At least `delimiter` or `delimiter_regex` must be defined. - -### uppercase_string - -Converts a string to its uppercase counterpart. `uppercase_string` is part of [mutate string](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-string-processors#mutate-string-processors) processors. 
- -Option | Required | Type | Description -:--- | :--- | :--- | :--- -with_keys | Yes | List | A list of keys to convert to uppercase. - -### lowercase_string - -Converts a string to its lowercase counterpart. `lowercase_string` is part of [mutate string](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-string-processors#mutate-string-processors) processors. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -with_keys | Yes | List | A list of keys to convert to lowercase. - -### trim_string - -Strips whitespace from the beginning and end of a key. `trim_string` is part of [mutate string](https://github.com/opensearch-project/data-prepper/tree/main/data-prepper-plugins/mutate-string-processors#mutate-string-processors) processors. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -with_keys | Yes | List | A list of keys to trim the whitespace from. - -## Sinks - -Sinks define where Data Prepper writes your data to. - - -### opensearch - -Sink for an OpenSearch cluster. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -hosts | Yes | List | List of OpenSearch hosts to write to (e.g. `["https://localhost:9200", "https://remote-cluster:9200"]`). -cert | No | String | Path to the security certificate (e.g. `"config/root-ca.pem"`) if the cluster uses the OpenSearch security plugin. -username | No | String | Username for HTTP basic authentication. -password | No | String | Password for HTTP basic authentication. -aws_sigv4 | No | Boolean | default false. Whether to use IAM signing to connect to an Amazon OpenSearch Service domain. For your access key, secret key, and optional session token, Data Prepper uses the default credential chain (environment variables, Java system properties, `~/.aws/credential`, etc.). -aws_region | No | String | AWS region (e.g. `"us-east-1"`) for the domain if you are connecting to Amazon OpenSearch Service. 
-aws_sts_role_arn | No | String | IAM role which the sink plugin assumes to sign request to Amazon OpenSearch Service. If not provided the plugin uses the default credentials. -socket_timeout | No | Integer | the timeout in milliseconds for waiting for data (or, put differently, a maximum period inactivity between two consecutive data packets). A timeout value of zero is interpreted as an infinite timeout. If this timeout value is either negative or not set, the underlying Apache HttpClient would rely on operating system settings for managing socket timeouts. -connect_timeout | No | Integer | The timeout in milliseconds used when requesting a connection from the connection manager. A timeout value of zero is interpreted as an infinite timeout. If this timeout value is either negative or not set, the underlying Apache HttpClient would rely on operating system settings for managing connection timeouts. -insecure | No | Boolean | Whether to verify SSL certificates. If set to true, CA certificate verification is disabled and insecure HTTP requests are sent instead. Default is `false`. -proxy | No | String | The address of a [forward HTTP proxy server](https://en.wikipedia.org/wiki/Proxy_server). The format is "<host name or IP>:<port>". Examples: "example.com:8100", "http://example.com:8100", "112.112.112.112:8100". Port number cannot be omitted. -trace_analytics_raw | No | Boolean | Deprecated in favor of `index_type`. Whether to export as trace data to the `otel-v1-apm-span-*` index pattern (alias `otel-v1-apm-span`) for use with the Trace Analytics OpenSearch Dashboards plugin. Default is `false`. -trace_analytics_service_map | No | Boolean | Deprecated in favor of `index_type`. Whether to export as trace data to the `otel-v1-apm-service-map` index for use with the service map component of the Trace Analytics OpenSearch Dashboards plugin. Default is `false`. -index | No | String | Name of the index to export to. 
Only required if you don't use the `trace-analytics-raw` or `trace-analytics-service-map` presets. In other words, this parameter is applicable and required only if index_type is explicitly `custom` or defaults to `custom`. -index_type | No | String | This index type tells the Sink plugin what type of data it is handling. Valid values: `custom`, `trace-analytics-raw`, `trace-analytics-service-map`, `management-disabled`. Default is `custom`. -template_file | No | String | Path to a JSON [index template]({{site.url}}{{site.baseurl}}/opensearch/index-templates/) file (e.g. `/your/local/template-file.json` if you do not use the `trace_analytics_raw` or `trace_analytics_service_map`.) See [otel-v1-apm-span-index-template.json](https://github.com/opensearch-project/data-prepper/blob/main/data-prepper-plugins/opensearch/src/main/resources/otel-v1-apm-span-index-template.json) for an example. -document_id_field | No | String | The field from the source data to use for the OpenSearch document ID (e.g. `"my-field"`) if you don't use the `trace_analytics_raw` or `trace_analytics_service_map` presets. -dlq_file | No | String | The path to your preferred dead letter queue file (e.g. `/your/local/dlq-file`). Data Prepper writes to this file when it fails to index a document on the OpenSearch cluster. -bulk_size | No | Integer (long) | The maximum size (in MiB) of bulk requests to the OpenSearch cluster. Values below 0 indicate an unlimited size. If a single document exceeds the maximum bulk request size, Data Prepper sends it individually. Default is 5. -ism_policy_file | No | String | The absolute file path for an ISM (Index State Management) policy JSON file. This policy file is effective only when there is no built-in policy file for the index type. For example, `custom` index type is currently the only one without a built-in policy file, thus it would use the policy file here if it's provided through this parameter. 
For more information, see [ISM policies]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/). -number_of_shards | No | Integer | The number of primary shards that an index should have on the destination OpenSearch server. This parameter is effective only when `template_file` is either explicitly provided in Sink configuration or built-in. If this parameter is set, it would override the value in index template file. For more information, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). -number_of_replicas | No | Integer | The number of replica shards each primary shard should have on the destination OpenSearch server. For example, if you have 4 primary shards and set number_of_replicas to 3, the index has 12 replica shards. This parameter is effective only when `template_file` is either explicitly provided in Sink configuration or built-in. If this parameter is set, it would override the value in index template file. For more information, see [Create index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/create-index/). - -### file - -Sink for flat file output. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -path | Yes | String | Path for the output file (e.g. `logs/my-transformed-log.log`). - - -### pipeline - -Sink for writing to another pipeline. - -Option | Required | Type | Description -:--- | :--- | :--- | :--- -name | Yes | String | Name of the pipeline to write to. - - -### stdout - -Sink for console output. Can be useful for testing. No options. diff --git a/_clients/data-prepper/get-started.md b/_clients/data-prepper/get-started.md deleted file mode 100644 index 11ef4ea96e..0000000000 --- a/_clients/data-prepper/get-started.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -layout: default -title: Get Started -parent: Data Prepper -nav_order: 1 ---- - -# Get started with Data Prepper - -Data Prepper is an independent component, not an OpenSearch plugin, that converts data for use with OpenSearch. 
It's not bundled with the all-in-one OpenSearch installation packages. - -## 1. Install Data Prepper - -To use the Docker image, pull it like any other image: - -```bash -docker pull opensearchproject/data-prepper:latest -``` - -## 2. Define a pipeline - -Create a Data Prepper pipeline file, `pipelines.yaml`, with the following configuration: - -```yml -simple-sample-pipeline: - workers: 2 - delay: "5000" - source: - random: - sink: - - stdout: -``` - -## 3. Start Data Prepper - -Run the following command with your pipeline configuration YAML. - -```bash -docker run --name data-prepper \ - -v /full/path/to/pipelines.yaml:/usr/share/data-prepper/pipelines.yaml \ - opensearchproject/opensearch-data-prepper:latest -``` - -This sample pipeline configuration above demonstrates a simple pipeline with a source (`random`) sending data to a sink (`stdout`). For more examples and details on more advanced pipeline configurations, see [Pipelines]({{site.url}}{{site.baseurl}}/clients/data-prepper/pipelines). 
- -After starting Data Prepper, you should see log output and some UUIDs after a few seconds: - -```yml -2021-09-30T20:19:44,147 [main] INFO com.amazon.dataprepper.pipeline.server.DataPrepperServer - Data Prepper server running at :4900 -2021-09-30T20:19:44,681 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer -2021-09-30T20:19:45,183 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer -2021-09-30T20:19:45,687 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer -2021-09-30T20:19:46,191 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer -2021-09-30T20:19:46,694 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer -2021-09-30T20:19:47,200 [random-source-pool-0] INFO com.amazon.dataprepper.plugins.source.RandomStringSource - Writing to buffer -2021-09-30T20:19:49,181 [simple-test-pipeline-processor-worker-1-thread-1] INFO com.amazon.dataprepper.pipeline.ProcessWorker - simple-test-pipeline Worker: Processing 6 records from buffer -07dc0d37-da2c-447e-a8df-64792095fb72 -5ac9b10a-1d21-4306-851a-6fb12f797010 -99040c79-e97b-4f1d-a70b-409286f2a671 -5319a842-c028-4c17-a613-3ef101bd2bdd -e51e700e-5cab-4f6d-879a-1c3235a77d18 -b4ed2d7e-cf9c-4e9d-967c-b18e8af35c90 -``` diff --git a/_clients/data-prepper/index.md b/_clients/data-prepper/index.md deleted file mode 100644 index 7fb833f428..0000000000 --- a/_clients/data-prepper/index.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -layout: default -title: Data Prepper -nav_order: 120 -has_children: true -has_toc: false ---- - -# Data Prepper - -Data Prepper is a server side data collector capable of filtering, enriching, transforming, normalizing and aggregating data for downstream analytics and visualization. 
- -Data Prepper lets users build custom pipelines to improve the operational view of applications. Two common uses for Data Prepper are trace and log analytics. [Trace analytics]({{site.url}}{{site.baseurl}}/observability-plugin/trace/index/) can help you visualize the flow of events and identify performance problems, and [log analytics]({{site.url}}{{site.baseurl}}/observability-plugin/log-analytics/) can improve searching, analyzing and provide insights into your application. - -To get started building your own custom pipelines with Data Prepper, see the [Get Started]({{site.url}}{{site.baseurl}}/clients/data-prepper/get-started/) guide. diff --git a/_clients/data-prepper/pipelines.md b/_clients/data-prepper/pipelines.md deleted file mode 100644 index ee360e4c93..0000000000 --- a/_clients/data-prepper/pipelines.md +++ /dev/null @@ -1,286 +0,0 @@ ---- -layout: default -title: Pipelines -parent: Data Prepper -nav_order: 2 ---- - -# Pipelines - -![Data Prepper Pipeline]({{site.url}}{{site.baseurl}}/images/data-prepper-pipeline.png) - -To use Data Prepper, you define pipelines in a configuration YAML file. Each pipeline is a combination of a source, a buffer, zero or more preppers, and one or more sinks. For example: - -```yml -simple-sample-pipeline: - workers: 2 # the number of workers - delay: 5000 # in milliseconds, how long workers wait between read attempts - source: - random: - buffer: - bounded_blocking: - buffer_size: 1024 # max number of records the buffer accepts - batch_size: 256 # max number of records the buffer drains after each read - processor: - - string_converter: - upper_case: true - sink: - - stdout: -``` - -- Sources define where your data comes from. In this case, the source is a random UUID generator (`random`). - -- Buffers store data as it passes through the pipeline. 
- - By default, Data Prepper uses its one and only buffer, the `bounded_blocking` buffer, so you can omit this section unless you developed a custom buffer or need to tune the buffer settings. - -- Preppers perform some action on your data: filter, transform, enrich, etc. - - You can have multiple preppers, which run sequentially from top to bottom, not in parallel. The `string_converter` prepper transform the strings by making them uppercase. - -- Sinks define where your data goes. In this case, the sink is stdout. - -## Examples - -This section provides some pipeline examples that you can use to start creating your own pipelines. For more information, see [Data Prepper configuration reference]({{site.url}}{{site.baseurl}}/clients/data-prepper/data-prepper-reference/) guide. - -The Data Prepper repository has several [sample applications](https://github.com/opensearch-project/data-prepper/tree/main/examples) to help you get started. - -### Log ingestion pipeline - -The following example demonstrates how to use HTTP source and Grok prepper plugins to process unstructured log data. - -```yml -log-pipeline: - source: - http: - ssl: false - processor: - - grok: - match: - log: [ "%{COMMONAPACHELOG}" ] - sink: - - opensearch: - hosts: [ "https://opensearch:9200" ] - insecure: true - username: admin - password: admin - index: apache_logs -``` - -This example uses weak security. We strongly recommend securing all plugins which open external ports in production environments. -{: .note} - -### Trace analytics pipeline - -The following example demonstrates how to build a pipeline that supports the [Trace Analytics OpenSearch Dashboards plugin]({{site.url}}{{site.baseurl}}/observability-plugin/trace/ta-dashboards/). This pipeline takes data from the OpenTelemetry Collector and uses two other pipelines as sinks. These two separate pipelines index trace and the service map documents for the dashboard plugin. 
- -#### Classic - -This pipeline definition will be deprecated in 2.0. Users are recommended to use [Event record type](#event-record-type) pipeline definition. - -```yml -entry-pipeline: - delay: "100" - source: - otel_trace_source: - ssl: false - sink: - - pipeline: - name: "raw-pipeline" - - pipeline: - name: "service-map-pipeline" -raw-pipeline: - source: - pipeline: - name: "entry-pipeline" - processor: - - otel_trace_raw_prepper: - sink: - - opensearch: - hosts: ["https://localhost:9200"] - insecure: true - username: admin - password: admin - trace_analytics_raw: true -service-map-pipeline: - delay: "100" - source: - pipeline: - name: "entry-pipeline" - processor: - - service_map_stateful: - sink: - - opensearch: - hosts: ["https://localhost:9200"] - insecure: true - username: admin - password: admin - trace_analytics_service_map: true -``` - -#### Event record type - -Starting from Data Prepper 1.4, Data Prepper supports event record type in trace analytics pipeline source, buffer, and processors. 
- -```yml -entry-pipeline: - delay: "100" - source: - otel_trace_source: - ssl: false - record_type: event - buffer: - bounded_blocking: - buffer_size: 10240 - batch_size: 160 - sink: - - pipeline: - name: "raw-pipeline" - - pipeline: - name: "service-map-pipeline" -raw-pipeline: - source: - pipeline: - name: "entry-pipeline" - buffer: - bounded_blocking: - buffer_size: 10240 - batch_size: 160 - processor: - - otel_trace_raw: - sink: - - opensearch: - hosts: ["https://localhost:9200"] - insecure: true - username: admin - password: admin - trace_analytics_raw: true -service-map-pipeline: - delay: "100" - source: - pipeline: - name: "entry-pipeline" - buffer: - bounded_blocking: - buffer_size: 10240 - batch_size: 160 - processor: - - service_map_stateful: - sink: - - opensearch: - hosts: ["https://localhost:9200"] - insecure: true - username: admin - password: admin - trace_analytics_service_map: true -``` - -Note that it is recommended to scale the `buffer_size` and `batch_size` by the estimated maximum batch size in the client request payload to maintain similar ingestion throughput and latency as in [Classic](#classic). - -### Metrics pipeline - -Data Prepper supports metrics ingestion using OTel. It currently supports the following metric types: - -* Gauge -* Sum -* Summary -* Histogram - -Other types are not supported. Data Prepper drops all other types, including Exponential Histogram and Summary. Additionally, Data Prepper does not support Scope instrumentation. - -To set up a metrics pipeline: - -```yml -metrics-pipeline: - source: - otel_trace_source: - processor: - - otel_metrics_raw_processor: - sink: - - opensearch: - hosts: ["https://localhost:9200"] - username: admin - password: admin -``` - -### S3 log ingestion pipeline - -The following example demonstrates how to use the S3 Source and Grok Processor plugins to process unstructured log data -from [Amazon Simple Storage Service](https://aws.amazon.com/s3/) (Amazon S3). 
This example uses Application Load -Balancer logs. As the Application Load Balancer writes logs to S3, S3 creates notifications in Amazon SQS. Data Prepper -reads those notifications and reads the S3 objects to get the log data and process it. - -``` -log-pipeline: - source: - s3: - notification_type: "sqs" - compression: "gzip" - codec: - newline: - sqs: - queue_url: "https://sqs.us-east-1.amazonaws.com/12345678910/ApplicationLoadBalancer" - aws: - region: "us-east-1" - sts_role_arn: "arn:aws:iam::12345678910:role/Data-Prepper" - - processor: - - grok: - match: - message: ["%{DATA:type} %{TIMESTAMP_ISO8601:time} %{DATA:elb} %{DATA:client} %{DATA:target} %{BASE10NUM:request_processing_time} %{DATA:target_processing_time} %{BASE10NUM:response_processing_time} %{BASE10NUM:elb_status_code} %{DATA:target_status_code} %{BASE10NUM:received_bytes} %{BASE10NUM:sent_bytes} \"%{DATA:request}\" \"%{DATA:user_agent}\" %{DATA:ssl_cipher} %{DATA:ssl_protocol} %{DATA:target_group_arn} \"%{DATA:trace_id}\" \"%{DATA:domain_name}\" \"%{DATA:chosen_cert_arn}\" %{DATA:matched_rule_priority} %{TIMESTAMP_ISO8601:request_creation_time} \"%{DATA:actions_executed}\" \"%{DATA:redirect_url}\" \"%{DATA:error_reason}\" \"%{DATA:target_list}\" \"%{DATA:target_status_code_list}\" \"%{DATA:classification}\" \"%{DATA:classification_reason}"] - - grok: - match: - request: ["(%{NOTSPACE:http_method})? (%{NOTSPACE:http_uri})? (%{NOTSPACE:http_version})?"] - - grok: - match: - http_uri: ["(%{WORD:protocol})?(://)?(%{IPORHOST:domain})?(:)?(%{INT:http_port})?(%{GREEDYDATA:request_uri})?"] - - date: - from_time_received: true - destination: "@timestamp" - - - sink: - - opensearch: - hosts: [ "https://localhost:9200" ] - username: "admin" - password: "admin" - index: alb_logs -``` - -## Migrating from Logstash - -Data Prepper supports Logstash configuration files for a limited set of plugins. Simply use the logstash config to run Data Prepper. 
- -```bash -docker run --name data-prepper \ - -v /full/path/to/logstash.conf:/usr/share/data-prepper/pipelines.conf \ - opensearchproject/data-prepper:latest -``` - -This feature is limited by feature parity of Data Prepper. As of Data Prepper 1.2 release, the following plugins from the Logstash configuration are supported: - -- HTTP Input plugin -- Grok Filter plugin -- Elasticsearch Output plugin -- Amazon Elasticsearch Output plugin - -## Configure the Data Prepper server - -Data Prepper itself provides administrative HTTP endpoints such as `/list` to list pipelines and `/metrics/prometheus` to provide Prometheus-compatible metrics data. The port that has these endpoints has a TLS configuration and is specified by a separate YAML file. By default, these endpoints are secured by Data Prepper docker images. We strongly recommend providing your own configuration file for securing production environments. Here is an example `data-prepper-config.yaml`: - -```yml -ssl: true -keyStoreFilePath: "/usr/share/data-prepper/keystore.jks" -keyStorePassword: "password" -privateKeyPassword: "other_password" -serverPort: 1234 -``` - -To configure the Data Prepper server, run Data Prepper with the additional yaml file. - -```bash -docker run --name data-prepper -v /full/path/to/pipelines.yaml:/usr/share/data-prepper/pipelines.yaml \ - /full/path/to/data-prepper-config.yaml:/usr/share/data-prepper/data-prepper-config.yaml \ - opensearchproject/data-prepper:latest -```` diff --git a/_clients/javascript/helpers.md b/_clients/javascript/helpers.md deleted file mode 100644 index 9efd74d305..0000000000 --- a/_clients/javascript/helpers.md +++ /dev/null @@ -1,202 +0,0 @@ ---- -layout: default -title: Helper methods -parent: JavaScript client -nav_order: 2 ---- - -# Helper methods - -Helper methods simplify the use of complicated API tasks. - -## Bulk helper - -The bulk helper simplifies making complex bulk API requests. 
- -### Usage - -The following code creates a bulk helper instance: - -```javascript -const { Client } = require('@opensearch-project/opensearch') -const documents = require('./docs.json') - -const client = new Client({ ... }) - -const result = await client.helpers.bulk({ - datasource: documents, - onDocument (doc) { - return { - index: { _index: 'example-index' } - } - } -}) - -console.log(result) -``` -{% include copy.html %} - -Bulk helper operations return an object with the following fields: - -```json -{ - total: number, - failed: number, - retry: number, - successful: number, - time: number, - bytes: number, - aborted: boolean -} -``` - -#### Bulk helper configuration options - -When creating a new bulk helper instance, you can use the following configuration options. - -| Option | Data type | Required/Default | Description -| :--- | :--- | :--- | :--- -| `datasource` | An array, async generator or a readable stream of strings or objects | Required | Represents the documents you need to create, delete, index, or update. -| `onDocument` | Function | Required | A function to be invoked with each document in the given `datasource`. It returns the operation to be executed for this document. Optionally, the document can be manipulated for `create` and `index` operations by returning a new document as part of the function's result. -| `concurrency` | Integer | Optional. Default is 5. | The number of requests to be executed in parallel. -| `flushBytes` | Integer | Optional. Default is 5,000,000. | Maximum bulk body size to send in bytes. -| `flushInterval` | Integer | Optional. Default is 30,000. | Time in milliseconds to wait before flushing the body after the last document has been read. -| `onDrop` | Function | Optional. Default is `noop`. | A function to be invoked for every document that can’t be indexed after reaching the maximum number of retries. -| `refreshOnCompletion` | Boolean | Optional. Default is false. 
| Whether or not a refresh should be run on all affected indexes at the end of the bulk operation. -| `retries` | Integer | Optional. Defaults to the client's `maxRetries` value. | The number of times an operation is retried before `onDrop` is called for that document. -| `wait` | Integer | Optional. Default is 5,000. | Time in milliseconds to wait before retrying an operation. - -### Examples - -The following examples illustrate the index, create, update, and delete bulk helper operations. - -#### Index - -The index operation creates a new document if it doesn’t exist and recreates the document if it already exists. - -The following bulk operation indexes documents into `example-index`: - -```javascript -client.helpers.bulk({ - datasource: arrayOfDocuments, - onDocument (doc) { - return { - index: { _index: 'example-index' } - } - } -}) -``` -{% include copy.html %} - -The following bulk operation indexes documents into `example-index` with document overwrite: - -```javascript -client.helpers.bulk({ - datasource: arrayOfDocuments, - onDocument (doc) { - return [ - { - index: { _index: 'example-index' } - }, - { ...doc, createdAt: new Date().toISOString() } - ] - } -}) -``` -{% include copy.html %} - -#### Create - -The create operation creates a new document only if the document does not already exist. 
- -The following bulk operation creates documents in the `example-index`: - -```javascript -client.helpers.bulk({ - datasource: arrayOfDocuments, - onDocument (doc) { - return { - create: { _index: 'example-index', _id: doc.id } - } - } -}) -``` -{% include copy.html %} - -The following bulk operation creates documents in the `example-index` with document overwrite: - -```javascript -client.helpers.bulk({ - datasource: arrayOfDocuments, - onDocument (doc) { - return [ - { - create: { _index: 'example-index', _id: doc.id } - }, - { ...doc, createdAt: new Date().toISOString() } - ] - } -}) -``` -{% include copy.html %} - -#### Update - -The update operation updates the document with the fields being sent. The document must already exist in the index. - -The following bulk operation updates documents in the `arrayOfDocuments`: - -```javascript -client.helpers.bulk({ - datasource: arrayOfDocuments, - onDocument (doc) { - // The update operation always requires a tuple to be returned, with the - // first element being the action and the second being the update options. - return [ - { - update: { _index: 'example-index', _id: doc.id } - }, - { doc_as_upsert: true } - ] - } -}) -``` -{% include copy.html %} - -The following bulk operation updates documents in the `arrayOfDocuments` with document overwrite: - -```javascript -client.helpers.bulk({ - datasource: arrayOfDocuments, - onDocument (doc) { - return [ - { - update: { _index: 'example-index', _id: doc.id } - }, - { - doc: { ...doc, createdAt: new Date().toISOString() }, - doc_as_upsert: true - } - ] - } -}) -``` -{% include copy.html %} - -#### Delete - -The delete operation deletes a document. 
- -The following bulk operation deletes documents from the `example-index`: - -```javascript -client.helpers.bulk({ - datasource: arrayOfDocuments, - onDocument (doc) { - return { - delete: { _index: 'example-index', _id: doc.id } - } - } -}) -``` -{% include copy.html %} \ No newline at end of file diff --git a/_clients/javascript/index.md b/_clients/javascript/index.md deleted file mode 100644 index 435d0d822a..0000000000 --- a/_clients/javascript/index.md +++ /dev/null @@ -1,356 +0,0 @@ ---- -layout: default -title: JavaScript client -nav_order: 40 ---- - -# JavaScript client - -The OpenSearch JavaScript (JS) client provides a safer and easier way to interact with your OpenSearch cluster. Rather than using OpenSearch from the browser and potentially exposing your data to the public, you can build an OpenSearch client that takes care of sending requests to your cluster. For the client's complete API documentation and additional examples, see the [JS client API documentation](https://opensearch-project.github.io/opensearch-js/2.1/index.html). - -The client contains a library of APIs that let you perform different operations on your cluster and return a standard response body. The example here demonstrates some basic operations like creating an index, adding documents, and searching your data. - -## Setup - -To add the client to your project, install it from [npm](https://www.npmjs.com): - -```bash -npm install @opensearch-project/opensearch -``` -{% include copy.html %} - -To install a specific major version of the client, run the following command: - -```bash -npm install @opensearch-project/opensearch@ -``` -{% include copy.html %} - -If you prefer to add the client manually or just want to examine the source code, see [opensearch-js](https://github.com/opensearch-project/opensearch-js) on GitHub. 
- -Then require the client: - -```javascript -const { Client } = require("@opensearch-project/opensearch"); -``` -{% include copy.html %} - -## Connecting to OpenSearch - -To connect to the default OpenSearch host, create a client object with the address `https://localhost:9200` if you are using the Security plugin: - -```javascript -var host = "localhost"; -var protocol = "https"; -var port = 9200; -var auth = "admin:admin"; // For testing only. Don't store credentials in code. -var ca_certs_path = "/full/path/to/root-ca.pem"; - -// Optional client certificates if you don't want to use HTTP basic authentication. -// var client_cert_path = '/full/path/to/client.pem' -// var client_key_path = '/full/path/to/client-key.pem' - -// Create a client with SSL/TLS enabled. -var { Client } = require("@opensearch-project/opensearch"); -var fs = require("fs"); -var client = new Client({ - node: protocol + "://" + auth + "@" + host + ":" + port, - ssl: { - ca: fs.readFileSync(ca_certs_path), - // You can turn off certificate verification (rejectUnauthorized: false) if you're using - // self-signed certificates with a hostname mismatch. - // cert: fs.readFileSync(client_cert_path), - // key: fs.readFileSync(client_key_path) - }, -}); -``` -{% include copy.html %} - -## Creating an index - -To create an OpenSearch index, use the `indices.create()` method. 
You can use the following code to construct a JSON object with custom settings: - -```javascript -var index_name = "books"; - -var settings = { - settings: { - index: { - number_of_shards: 4, - number_of_replicas: 3, - }, - }, -}; - -var response = await client.indices.create({ - index: index_name, - body: settings, -}); -``` -{% include copy.html %} - -## Indexing a document - -You can index a document into OpenSearch using the client's `index` method: - -```javascript -var document = { - title: "The Outsider", - author: "Stephen King", - year: "2018", - genre: "Crime fiction", -}; - -var id = "1"; - -var response = await client.index({ - id: id, - index: index_name, - body: document, - refresh: true, -}); -``` -{% include copy.html %} - -## Searching for documents - -The easiest way to search for documents is to construct a query string. The following code uses a `match` query to search for "The Outsider" in the title field: - -```javascript -var query = { - query: { - match: { - title: { - query: "The Outsider", - }, - }, - }, -}; - -var response = await client.search({ - index: index_name, - body: query, -}); -``` -{% include copy.html %} - -## Deleting a document - -You can delete a document using the client's `delete` method: - -```javascript -var response = await client.delete({ - index: index_name, - id: id, -}); -``` -{% include copy.html %} - -## Deleting an index - -You can delete an index using the `indices.delete()` method: - -```javascript -var response = await client.indices.delete({ - index: index_name, -}); -``` -{% include copy.html %} - -## Sample program - -The following sample program creates a client, adds an index with non-default settings, inserts a document, searches for the document, deletes the document, and then deletes the index: - -```javascript -"use strict"; - -var host = "localhost"; -var protocol = "https"; -var port = 9200; -var auth = "admin:admin"; // For testing only. Don't store credentials in code. 
-var ca_certs_path = "/full/path/to/root-ca.pem"; - -// Optional client certificates if you don't want to use HTTP basic authentication. -// var client_cert_path = '/full/path/to/client.pem' -// var client_key_path = '/full/path/to/client-key.pem' - -// Create a client with SSL/TLS enabled. -var { Client } = require("@opensearch-project/opensearch"); -var fs = require("fs"); -var client = new Client({ - node: protocol + "://" + auth + "@" + host + ":" + port, - ssl: { - ca: fs.readFileSync(ca_certs_path), - // You can turn off certificate verification (rejectUnauthorized: false) if you're using - // self-signed certificates with a hostname mismatch. - // cert: fs.readFileSync(client_cert_path), - // key: fs.readFileSync(client_key_path) - }, -}); - -async function search() { - // Create an index with non-default settings. - var index_name = "books"; - - var settings = { - settings: { - index: { - number_of_shards: 4, - number_of_replicas: 3, - }, - }, - }; - - var response = await client.indices.create({ - index: index_name, - body: settings, - }); - - console.log("Creating index:"); - console.log(response.body); - - // Add a document to the index. - var document = { - title: "The Outsider", - author: "Stephen King", - year: "2018", - genre: "Crime fiction", - }; - - var id = "1"; - - var response = await client.index({ - id: id, - index: index_name, - body: document, - refresh: true, - }); - - console.log("Adding document:"); - console.log(response.body); - - // Search for the document. - var query = { - query: { - match: { - title: { - query: "The Outsider", - }, - }, - }, - }; - - var response = await client.search({ - index: index_name, - body: query, - }); - - console.log("Search results:"); - console.log(response.body.hits); - - // Delete the document. - var response = await client.delete({ - index: index_name, - id: id, - }); - - console.log("Deleting document:"); - console.log(response.body); - - // Delete the index. 
- var response = await client.indices.delete({ - index: index_name, - }); - - console.log("Deleting index:"); - console.log(response.body); -} - -search().catch(console.log); -``` -{% include copy.html %} - -## Authenticating with Amazon OpenSearch Service – AWS Sigv4 - -Use the following code to authenticate with AWS V2 SDK: - -```javascript -const AWS = require('aws-sdk'); // V2 SDK. -const { Client } = require('@opensearch-project/opensearch'); -const { AwsSigv4Signer } = require('@opensearch-project/opensearch/aws'); - -const client = new Client({ - ...AwsSigv4Signer({ - region: 'us-east-1', - // Must return a Promise that resolve to an AWS.Credentials object. - // This function is used to acquire the credentials when the client start and - // when the credentials are expired. - // The Client will refresh the Credentials only when they are expired. - // With AWS SDK V2, Credentials.refreshPromise is used when available to refresh the credentials. - - // Example with AWS SDK V2: - getCredentials: () => - new Promise((resolve, reject) => { - // Any other method to acquire a new Credentials object can be used. - AWS.config.getCredentials((err, credentials) => { - if (err) { - reject(err); - } else { - resolve(credentials); - } - }); - }), - }), - node: "https://search-xxx.region.es.amazonaws.com", // OpenSearch domain URL -}); -``` -{% include copy.html %} - -Use the following code to authenticate with AWS V3 SDK: - -```javascript -const { defaultProvider } = require("@aws-sdk/credential-provider-node"); // V3 SDK. -const { Client } = require('@opensearch-project/opensearch'); -const { AwsSigv4Signer } = require('@opensearch-project/opensearch/aws'); - -const client = new Client({ - ...AwsSigv4Signer({ - region: 'us-east-1', - // Must return a Promise that resolve to an AWS.Credentials object. - // This function is used to acquire the credentials when the client start and - // when the credentials are expired. 
- // The Client will refresh the Credentials only when they are expired. - // With AWS SDK V2, Credentials.refreshPromise is used when available to refresh the credentials. - - // Example with AWS SDK V3: - getCredentials: () => { - // Any other method to acquire a new Credentials object can be used. - const credentialsProvider = defaultProvider(); - return credentialsProvider(); - }, - }), - node: "https://search-xxx.region.es.amazonaws.com", // OpenSearch domain URL -}); -``` -{% include copy.html %} - -## Circuit breaker - -The `memoryCircuitBreaker` option can be used to prevent errors caused by a response payload being too large to fit into the heap memory available to the client. - -The `memoryCircuitBreaker` object contains two fields: - -- `enabled`: A Boolean used to turn the circuit breaker on or off. Defaults to `false`. -- `maxPercentage`: The threshold that determines whether the circuit breaker engages. Valid values are floats in the [0, 1] range that represent percentages in decimal form. Any value that exceeds that range will correct to `1.0`. 
- -The following example instantiates a client with the circuit breaker enabled and its threshold set to 80% of the available heap size limit: - -```javascript -var client = new Client({ - memoryCircuitBreaker: { - enabled: true, - maxPercentage: 0.8, - }, -}); -``` -{% include copy.html %} diff --git a/_config.yml b/_config.yml index a044070645..e2ff76349d 100644 --- a/_config.yml +++ b/_config.yml @@ -29,12 +29,9 @@ color_scheme: opensearch collections: # Define a collection named "tests", its documents reside in the "_tests" directory install-and-configure: - permalink: /:collection/:path/ - output: true - upgrade-to: permalink: /:collection/:path/ output: true - opensearch: + upgrade-to: permalink: /:collection/:path/ output: true im-plugin: @@ -45,13 +42,13 @@ collections: output: true security: permalink: /:collection/:path/ - output: true + output: true search-plugins: permalink: /:collection/:path/ - output: true + output: true ml-commons-plugin: permalink: /:collection/:path/ - output: true + output: true tuning-your-cluster: permalink: /:collection/:path/ output: true @@ -61,12 +58,21 @@ collections: observing-your-data: permalink: /:collection/:path/ output: true - ml-commons-plugin: + query-dsl: + permalink: /:collection/:path/ + output: true + field-types: permalink: /:collection/:path/ output: true clients: permalink: /:collection/:path/ output: true + data-prepper: + permalink: /:collection/:path/ + output: true + tools: + permalink: /:collection/:path/ + output: true api-reference: permalink: /:collection/:path/ output: true @@ -88,9 +94,6 @@ just_the_docs: # nav_exclude: true nav_fold: true # search_exclude: true - opensearch: - name: OpenSearch - nav_fold: true im-plugin: name: Managing Indexes nav_fold: true @@ -115,8 +118,20 @@ just_the_docs: observing-your-data: name: Observability nav_fold: true + query-dsl: + name: Query DSL, Aggregations, and Analyzers + nav_fold: true + field-types: + name: Mappings and field types + nav_fold: true clients: 
- name: Clients and tools + name: Clients + nav_fold: true + data-prepper: + name: Data Prepper + nav_fold: true + tools: + name: Tools nav_fold: true api-reference: name: API reference @@ -124,8 +139,6 @@ just_the_docs: troubleshoot: name: Troubleshooting nav_fold: true - external_links: - name: External links # Enable or disable the site search diff --git a/_opensearch/supported-field-types/alias.md b/_field-types/alias.md similarity index 97% rename from _opensearch/supported-field-types/alias.md rename to _field-types/alias.md index f128ef6570..8cb4e6d22c 100644 --- a/_opensearch/supported-field-types/alias.md +++ b/_field-types/alias.md @@ -4,6 +4,8 @@ title: Alias nav_order: 10 has_children: false parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/alias/ --- # Alias field type diff --git a/_opensearch/supported-field-types/autocomplete.md b/_field-types/autocomplete.md similarity index 91% rename from _opensearch/supported-field-types/autocomplete.md rename to _field-types/autocomplete.md index 4fc7ad3935..006f07866d 100644 --- a/_opensearch/supported-field-types/autocomplete.md +++ b/_field-types/autocomplete.md @@ -5,6 +5,8 @@ nav_order: 50 has_children: true has_toc: false parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/autocomplete/ --- # Autocomplete field types diff --git a/_opensearch/supported-field-types/binary.md b/_field-types/binary.md similarity index 95% rename from _opensearch/supported-field-types/binary.md rename to _field-types/binary.md index 2bc95c31ab..7786785be4 100644 --- a/_opensearch/supported-field-types/binary.md +++ b/_field-types/binary.md @@ -4,6 +4,8 @@ title: Binary parent: Supported field types nav_order: 12 has_children: false +redirect_from: + - /opensearch/supported-field-types/binary/ --- # Binary field type diff --git a/_opensearch/supported-field-types/boolean.md b/_field-types/boolean.md similarity index 98% rename from 
_opensearch/supported-field-types/boolean.md rename to _field-types/boolean.md index 40affbde38..4e5e795f54 100644 --- a/_opensearch/supported-field-types/boolean.md +++ b/_field-types/boolean.md @@ -4,6 +4,8 @@ title: Boolean nav_order: 20 has_children: false parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/boolean/ --- # Boolean field type diff --git a/_opensearch/supported-field-types/completion.md b/_field-types/completion.md similarity index 98% rename from _opensearch/supported-field-types/completion.md rename to _field-types/completion.md index 0b8e4af678..05fcbaa5de 100644 --- a/_opensearch/supported-field-types/completion.md +++ b/_field-types/completion.md @@ -5,6 +5,8 @@ nav_order: 51 has_children: false parent: Autocomplete field types grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/completion/ --- # Completion field type @@ -51,7 +53,7 @@ The following table lists the parameters accepted by completion fields. Parameter | Description :--- | :--- -`input` | A list of possible completions as a string or array of strings. Cannot contain `\u0000` (null), `\u001f` (information separator one), or `u001e` (information separator two). Required. +`input` | A list of possible completions as a string or array of strings. Cannot contain `\u0000` (null), `\u001f` (information separator one), or `\u001e` (information separator two). Required. `weight` | A positive integer or a positive integer string for ranking suggestions. Optional. 
Multiple suggestions can be indexed as follows: diff --git a/_opensearch/supported-field-types/date.md b/_field-types/date.md similarity index 99% rename from _opensearch/supported-field-types/date.md rename to _field-types/date.md index 4a867d065b..075eed360b 100644 --- a/_opensearch/supported-field-types/date.md +++ b/_field-types/date.md @@ -4,6 +4,8 @@ title: Date nav_order: 25 has_children: false parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/date/ --- # Date field type @@ -343,5 +345,4 @@ The response contains both documents: ] } } -``` - +``` \ No newline at end of file diff --git a/_opensearch/supported-field-types/geo-point.md b/_field-types/geo-point.md similarity index 88% rename from _opensearch/supported-field-types/geo-point.md rename to _field-types/geo-point.md index 36e57013a7..ee40a1339d 100644 --- a/_opensearch/supported-field-types/geo-point.md +++ b/_field-types/geo-point.md @@ -5,6 +5,8 @@ nav_order: 56 has_children: false parent: Geographic field types grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/geo-point/ --- # Geopoint field type @@ -86,6 +88,19 @@ PUT testindex1/_doc/5 ``` {% include copy-curl.html %} +- GeoJSON format, where the `coordinates` are in the [`longitude`, `latitude`] format + +```json +PUT testindex1/_doc/6 +{ + "point": { + "type": "Point", + "coordinates": [74.00, 40.71] + } +} +``` +{% include copy-curl.html %} + ## Parameters The following table lists the parameters accepted by geopoint field types. All parameters are optional. 
diff --git a/_opensearch/supported-field-types/geo-shape.md b/_field-types/geo-shape.md similarity index 86% rename from _opensearch/supported-field-types/geo-shape.md rename to _field-types/geo-shape.md index a21156ef5a..cf0506def6 100644 --- a/_opensearch/supported-field-types/geo-shape.md +++ b/_field-types/geo-shape.md @@ -5,6 +5,8 @@ nav_order: 57 has_children: false parent: Geographic field types grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/geo-shape/ --- # Geoshape field type @@ -46,13 +48,13 @@ The following table describes the possible geoshape types and their relationship OpenSearch type | GeoJSON type | WKT type | Description :--- | :--- | :--- | :--- [`point`](#point) | Point | POINT | A geographic point specified by latitude and longitude. OpenSearch uses World Geodetic System (WGS84) coordinates. -[`linestring`](#line-string) | LineString | LINESTRING | A line specified by two or more points. May be a straight line or a path of connected line segments. +[`linestring`](#linestring) | LineString | LINESTRING | A line specified by two or more points. May be a straight line or a path of connected line segments. [`polygon`](#polygon) | Polygon | POLYGON | A polygon specified by a list of vertices in coordinate form. The polygon must be closed, meaning the last point must be the same as the first point. Therefore, to create an n-gon, n+1 vertices are required. The minimum number of vertices is four, which creates a triangle. -[`multipoint`](#multi-point) | MultiPoint | MULTIPOINT | An array of discrete related points that are not connected. -[`multilinestring`](#multiline-string) | MultiLineString | MULTILINESTRING | An array of linestrings. -[`multipolygon`](#multi-polygon) | MultiPolygon | MULTIPOLYGON | An array of polygons. +[`multipoint`](#multipoint) | MultiPoint | MULTIPOINT | An array of discrete related points that are not connected. 
+[`multilinestring`](#multilinestring) | MultiLineString | MULTILINESTRING | An array of linestrings. +[`multipolygon`](#multipolygon) | MultiPolygon | MULTIPOLYGON | An array of polygons. [`geometrycollection`](#geometry-collection) | GeometryCollection | GEOMETRYCOLLECTION | A collection of geoshapes that may be of different types. -[`envelope`](#envelope) | N/A | BBOX | A bounding rectangle specified by top-left and bottom-right vertices. +[`envelope`](#envelope) | N/A | BBOX | A bounding rectangle specified by upper-left and lower-right vertices. ## Point @@ -81,11 +83,11 @@ PUT testindex/_doc/1 ``` {% include copy-curl.html %} -## Line string +## Linestring -A line string is a line specified by two or more points. If the points are collinear, the line string is a straight line. Otherwise, the line string represents a path made of line segments. +A linestring is a line specified by two or more points. If the points are collinear, the linestring is a straight line. Otherwise, the linestring represents a path made of line segments. -Index a line string in GeoJSON format: +Index a linestring in GeoJSON format: ```json PUT testindex/_doc/2 @@ -98,7 +100,7 @@ PUT testindex/_doc/2 ``` {% include copy-curl.html %} -Index a line string in WKT format: +Index a linestring in WKT format: ```json PUT testindex/_doc/2 @@ -219,11 +221,11 @@ PUT testindex/_doc/3 ``` {% include copy-curl.html %} -## Multi point +## Multipoint -A multi point is an array of discrete related points that are not connected. +A multipoint is an array of discrete related points that are not connected. 
-Index a multi point in GeoJSON format: +Index a multipoint in GeoJSON format: ```json PUT testindex/_doc/6 @@ -239,7 +241,7 @@ PUT testindex/_doc/6 ``` {% include copy-curl.html %} -Index a multi point in WKT format: +Index a multipoint in WKT format: ```json PUT testindex/_doc/6 @@ -249,11 +251,11 @@ PUT testindex/_doc/6 ``` {% include copy-curl.html %} -## Multiline string +## Multilinestring -A multiline string is an array of line strings. +A multilinestring is an array of linestrings. -Index a line string in GeoJSON format: +Index a multilinestring in GeoJSON format: ```json PUT testindex/_doc/2 @@ -269,7 +271,7 @@ PUT testindex/_doc/2 ``` {% include copy-curl.html %} -Index a line string in WKT format: +Index a multilinestring in WKT format: ```json PUT testindex/_doc/2 @@ -279,11 +281,11 @@ PUT testindex/_doc/2 ``` {% include copy-curl.html %} -## Multi polygon +## Multipolygon -A multi polygon is an array of polygons. In this example, the first polygon contains a hole, and the second does not. +A multipolygon is an array of polygons. In this example, the first polygon contains a hole, and the second does not. -Index a multi polygon in GeoJSON format: +Index a multipolygon in GeoJSON format: ```json PUT testindex/_doc/4 @@ -314,7 +316,7 @@ PUT testindex/_doc/4 ``` {% include copy-curl.html %} -Index a multi polygon in WKT format: +Index a multipolygon in WKT format: ```json PUT testindex/_doc/4 @@ -362,7 +364,7 @@ PUT testindex/_doc/7 ## Envelope -An envelope is a bounding rectangle specified by top-left and bottom-right vertices. The GeoJSON format is `[[minLon, maxLat], [maxLon, minLat]]`. +An envelope is a bounding rectangle specified by upper-left and lower-right vertices. The GeoJSON format is `[[minLon, maxLat], [maxLon, minLat]]`. 
Index an envelope in GeoJSON format: diff --git a/_opensearch/supported-field-types/geographic.md b/_field-types/geographic.md similarity index 90% rename from _opensearch/supported-field-types/geographic.md rename to _field-types/geographic.md index 9dbe6c291f..34d1a54fd1 100644 --- a/_opensearch/supported-field-types/geographic.md +++ b/_field-types/geographic.md @@ -5,6 +5,8 @@ nav_order: 55 has_children: true has_toc: false parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/geographic/ --- # Geographic field types diff --git a/_opensearch/supported-field-types/index.md b/_field-types/index.md similarity index 99% rename from _opensearch/supported-field-types/index.md rename to _field-types/index.md index f13c314041..cca833a548 100644 --- a/_opensearch/supported-field-types/index.md +++ b/_field-types/index.md @@ -6,6 +6,7 @@ has_children: true has_toc: false redirect_from: - /opensearch/supported-field-types/ + - /opensearch/supported-field-types/index/ --- # Supported field types diff --git a/_opensearch/supported-field-types/ip.md b/_field-types/ip.md similarity index 98% rename from _opensearch/supported-field-types/ip.md rename to _field-types/ip.md index 250e2da6a0..a3e1277b20 100644 --- a/_opensearch/supported-field-types/ip.md +++ b/_field-types/ip.md @@ -4,6 +4,8 @@ title: IP address nav_order: 30 has_children: false parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/ip/ --- # IP address field type diff --git a/_opensearch/supported-field-types/join.md b/_field-types/join.md similarity index 99% rename from _opensearch/supported-field-types/join.md rename to _field-types/join.md index 18a25637db..bd14f25082 100644 --- a/_opensearch/supported-field-types/join.md +++ b/_field-types/join.md @@ -5,6 +5,8 @@ nav_order: 43 has_children: false parent: Object field types grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/join/ --- # Join field type diff 
--git a/_opensearch/supported-field-types/keyword.md b/_field-types/keyword.md similarity index 98% rename from _opensearch/supported-field-types/keyword.md rename to _field-types/keyword.md index cfa522f599..4624ebb8fd 100644 --- a/_opensearch/supported-field-types/keyword.md +++ b/_field-types/keyword.md @@ -5,6 +5,8 @@ nav_order: 46 has_children: false parent: String field types grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/keyword/ --- # Keyword field type diff --git a/_opensearch/mappings.md b/_field-types/mappings.md similarity index 69% rename from _opensearch/mappings.md rename to _field-types/mappings.md index d410fac267..6ad58c07e3 100644 --- a/_opensearch/mappings.md +++ b/_field-types/mappings.md @@ -2,9 +2,11 @@ layout: default title: Mapping nav_order: 13 +redirect_from: + - /opensearch/mappings/ --- -# About Mappings +# Mapping You can define how documents and their fields are stored and indexed by creating a mapping. @@ -45,6 +47,7 @@ numeric detection string | If disabled, OpenSearch may automatically process num If you know exactly what your field data types need to be, you can specify them in your request body when creating your index. ```json +PUT sample-index1 { "mappings": { "properties": { @@ -65,6 +68,22 @@ If you know exactly what your field data types need to be, you can specify them } ``` +To add mappings to an existing index or data stream, you can send a request to the `_mapping` endpoint using the `PUT` or `POST` HTTP method: + +```json +POST sample-index1/_mapping +{ + "properties": { + "year": { "type" : "text" }, + "age": { "type" : "integer" }, + "director":{ "type" : "text" } + } +} +``` + +You cannot change the mapping of an existing field; you can only modify the field's mapping parameters. +{: .note} + --- ## Mapping example usage @@ -102,4 +121,62 @@ PUT _index_ip/_doc/ } ``` -This indexed ip_range does not throw an error because `ignore_malformed` is set to true. 
\ No newline at end of file +This indexed ip_range does not throw an error because `ignore_malformed` is set to true. + +## Get a mapping + +To get all mappings for one or more indexes, use the following request: + +```json +GET <index>/_mapping +``` + +In the above request, `<index>` may be an index name or a comma-separated list of index names. + +To get all mappings for all indexes, use the following request: + +```json +GET _mapping +``` + +To get a mapping for a specific field, provide the index name and the field name: + +```json +GET _mapping/field/<fields> +GET /<index>/_mapping/field/<fields> +``` + +Both `<index>` and `<fields>` can be specified as one value or a comma-separated list. + +For example, the following request retrieves the mapping for the `year` and `age` fields in `sample-index1`: + +```json +GET sample-index1/_mapping/field/year,age +``` + +The response contains the specified fields: + +```json +{ + "sample-index1" : { + "mappings" : { + "year" : { + "full_name" : "year", + "mapping" : { + "year" : { + "type" : "text" + } + } + }, + "age" : { + "full_name" : "age", + "mapping" : { + "age" : { + "type" : "integer" + } + } + } + } + } +} +``` \ No newline at end of file diff --git a/_opensearch/supported-field-types/nested.md b/_field-types/nested.md similarity index 95% rename from _opensearch/supported-field-types/nested.md rename to _field-types/nested.md index e18ae340a3..a075d238a6 100644 --- a/_opensearch/supported-field-types/nested.md +++ b/_field-types/nested.md @@ -5,6 +5,8 @@ nav_order: 42 has_children: false parent: Object field types grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/nested/ --- # Nested field type @@ -28,7 +30,7 @@ PUT testindex1/_doc/100 ``` {% include copy-curl.html %} -When these objects are stored, they are flattened, so their internal represenation has an array of all values for each field: +When these objects are stored, they are flattened, so their internal representation has an array of all values for each field: 
```json { @@ -145,13 +147,13 @@ Nested objects are stored as separate documents, and the parent object has refer ```json PUT testindex1 { - "mappings" : { - "properties": { - "patients": { - "type" : "nested" - } - } + "mappings" : { + "properties": { + "patients": { + "type" : "nested" + } } + } } ``` {% include copy-curl.html %} diff --git a/_opensearch/supported-field-types/numeric.md b/_field-types/numeric.md similarity index 98% rename from _opensearch/supported-field-types/numeric.md rename to _field-types/numeric.md index c12326e172..e76b64ff0f 100644 --- a/_opensearch/supported-field-types/numeric.md +++ b/_field-types/numeric.md @@ -4,6 +4,8 @@ title: Numeric field types parent: Supported field types nav_order: 15 has_children: false +redirect_from: + - /opensearch/supported-field-types/numeric/ --- # Numeric field types diff --git a/_opensearch/supported-field-types/object-fields.md b/_field-types/object-fields.md similarity index 100% rename from _opensearch/supported-field-types/object-fields.md rename to _field-types/object-fields.md diff --git a/_opensearch/supported-field-types/object.md b/_field-types/object.md similarity index 98% rename from _opensearch/supported-field-types/object.md rename to _field-types/object.md index 68cb15cb83..7bb446e154 100644 --- a/_opensearch/supported-field-types/object.md +++ b/_field-types/object.md @@ -5,6 +5,8 @@ nav_order: 41 has_children: false parent: Object field types grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/object/ --- # Object field type diff --git a/_opensearch/supported-field-types/percolator.md b/_field-types/percolator.md similarity index 97% rename from _opensearch/supported-field-types/percolator.md rename to _field-types/percolator.md index 393c9e0aaa..e39f1cad82 100644 --- a/_opensearch/supported-field-types/percolator.md +++ b/_field-types/percolator.md @@ -4,6 +4,8 @@ title: Percolator nav_order: 65 has_children: false parent: Supported field types 
+redirect_from: + - /opensearch/supported-field-types/percolator/ --- # Percolator field type diff --git a/_opensearch/supported-field-types/range.md b/_field-types/range.md similarity index 90% rename from _opensearch/supported-field-types/range.md rename to _field-types/range.md index 31160219ed..3815053fbf 100644 --- a/_opensearch/supported-field-types/range.md +++ b/_field-types/range.md @@ -4,6 +4,8 @@ title: Range field types nav_order: 35 has_children: false parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/range/ --- # Range field types @@ -58,7 +60,7 @@ PUT testindex/_doc/1 ``` {% include copy-curl.html %} -You can use a [term query](#term-query) or a [range query](#range-query) to search for values within range fields. +You can use a [Term query](#term-query) or a [Range query](#range-query) to search for values within range fields. ### Term query @@ -82,14 +84,17 @@ GET testindex/_search ### Range query -A range query on a range field returns documents within that range. Along with the field to be matched, range queries take the following optional parameters. +A range query on a range field returns documents within that range. Along with the field to be matched, you can further specify a date format or relational operators with the following optional parameters: -Parameter | Description -:--- | :--- +Parameter | Description +:--- | :--- format | A [format]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/#formats) for dates in this query. Default is the field's mapped format. relation | Provides a relation between the query's date range and the document's date range. There are three types of relations that you can specify:
1. `intersects` matches documents for which there are dates that belong to both the query's date range and document's date range. This is the default.
2. `contains` matches documents for which the query's date range is a subset of the document's date range.
3. `within` matches documents for which the document's date range is a subset of the query's date range. -To use a date format other than the field's mapped format in a query, specify it in the `format` field. +To use a date format other than the field's mapped format in a query, specify it in the `format` field. + +For a full description of range query usage, including all range query parameters, see [Range query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/term/#range). +{: .tip } Query for all graduation dates in 2019, providing the date range in a "MM/dd/yyyy" format: diff --git a/_opensearch/supported-field-types/rank.md b/_field-types/rank.md similarity index 99% rename from _opensearch/supported-field-types/rank.md rename to _field-types/rank.md index 50aab1bc64..691fbbb7cc 100644 --- a/_opensearch/supported-field-types/rank.md +++ b/_field-types/rank.md @@ -4,6 +4,8 @@ title: Rank field types nav_order: 60 has_children: false parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/rank/ --- # Rank field types diff --git a/_opensearch/supported-field-types/search-as-you-type.md b/_field-types/search-as-you-type.md similarity index 98% rename from _opensearch/supported-field-types/search-as-you-type.md rename to _field-types/search-as-you-type.md index eb4d863ef0..fdef9c0c7a 100644 --- a/_opensearch/supported-field-types/search-as-you-type.md +++ b/_field-types/search-as-you-type.md @@ -5,6 +5,8 @@ nav_order: 53 has_children: false parent: Autocomplete field types grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/search-as-you-type/ --- # Search-as-you-type field type diff --git a/_opensearch/supported-field-types/string.md b/_field-types/string.md similarity index 91% rename from _opensearch/supported-field-types/string.md rename to _field-types/string.md index 7848d2ab4f..304fd434c0 100644 --- a/_opensearch/supported-field-types/string.md +++ b/_field-types/string.md @@ -5,6 +5,8 @@ 
nav_order: 45 has_children: true has_toc: false parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/string/ --- # String field types diff --git a/_opensearch/supported-field-types/text.md b/_field-types/text.md similarity index 99% rename from _opensearch/supported-field-types/text.md rename to _field-types/text.md index 37f3a6c07b..c68f678f94 100644 --- a/_opensearch/supported-field-types/text.md +++ b/_field-types/text.md @@ -5,6 +5,8 @@ nav_order: 47 has_children: false parent: String field types grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/text/ --- # Text field type diff --git a/_opensearch/supported-field-types/token-count.md b/_field-types/token-count.md similarity index 98% rename from _opensearch/supported-field-types/token-count.md rename to _field-types/token-count.md index c1795af3a4..adc5b7257f 100644 --- a/_opensearch/supported-field-types/token-count.md +++ b/_field-types/token-count.md @@ -5,6 +5,8 @@ nav_order: 48 has_children: false parent: String field types grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/token-count/ --- # Token count field type diff --git a/_field-types/xy-point.md b/_field-types/xy-point.md new file mode 100644 index 0000000000..b9edababf5 --- /dev/null +++ b/_field-types/xy-point.md @@ -0,0 +1,105 @@ +--- +layout: default +title: xy point +nav_order: 58 +has_children: false +parent: Cartesian field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/xy-point/ +--- + +# xy point field type + +An xy point field type contains a point in a two-dimensional Cartesian coordinate system, specified by x and y coordinates. It is based on the Lucene [XYPoint](https://lucene.apache.org/core/9_3_0/core/org/apache/lucene/geo/XYPoint.html) field type. 
The xy point field type is similar to the [geopoint]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point/) field type, but does not have the range limitations of geopoint. The coordinates of an xy point are single-precision floating-point values. For information about the range and precision of floating-point values, see [Numeric field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/). + +## Example + +Create a mapping with an xy point field type: + +```json +PUT testindex1 +{ + "mappings": { + "properties": { + "point": { + "type": "xy_point" + } + } + } +} +``` +{% include copy-curl.html %} + +## Formats + +xy points can be indexed in the following formats: + +- An object with x and y coordinates + +```json +PUT testindex1/_doc/1 +{ + "point": { + "x": 0.5, + "y": 4.5 + } +} +``` +{% include copy-curl.html %} + +- A string in the "`x`, `y`" format + +```json +PUT testindex1/_doc/2 +{ + "point": "0.5, 4.5" +} +``` +{% include copy-curl.html %} + +- An array in the [`x`, `y`] format + +```json +PUT testindex1/_doc/3 +{ + "point": [0.5, 4.5] +} +``` +{% include copy-curl.html %} + +- A [well-known text (WKT)](https://docs.opengeospatial.org/is/12-063r5/12-063r5.html) POINT in the "POINT(`x` `y`)" format + +```json +PUT testindex1/_doc/4 +{ + "point": "POINT (0.5 4.5)" +} +``` +{% include copy-curl.html %} + +- GeoJSON format + +```json +PUT testindex1/_doc/5 +{ + "point" : { + "type" : "Point", + "coordinates" : [0.5, 4.5] + } +} +``` +{% include copy-curl.html %} + +In all xy point formats, the coordinates must be specified in the `x, y` order. +{: .note} + +## Parameters + +The following table lists the parameters accepted by xy point field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`ignore_malformed` | A Boolean value that specifies to ignore malformed values and not to throw an exception. Default is `false`. +`ignore_z_value` | Specific to points with three coordinates. 
If `ignore_z_value` is `true`, the third coordinate is not indexed but is still stored in the _source field. If `ignore_z_value` is `false`, an exception is thrown. +[`null_value`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/index#null-value) | A value to be used in place of `null`. The value must be of the same type as the field. If this parameter is not specified, the field is treated as missing when its value is `null`. Default is `null`. \ No newline at end of file diff --git a/_field-types/xy-shape.md b/_field-types/xy-shape.md new file mode 100644 index 0000000000..e6f95b732a --- /dev/null +++ b/_field-types/xy-shape.md @@ -0,0 +1,403 @@ +--- +layout: default +title: xy shape +nav_order: 59 +has_children: false +parent: Cartesian field types +grand_parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/xy-shape/ +--- + +# xy shape field type + +An xy shape field type contains a shape, such as a polygon or a collection of xy points. It is based on the Lucene [XYShape](https://lucene.apache.org/core/9_3_0/core/org/apache/lucene/document/XYShape.html) field type. To index an xy shape, OpenSearch tessellates the shape into a triangular mesh and stores each triangle in a BKD tree (a set of balanced k-dimensional trees). This provides a 10<sup>-7</sup> decimal degree of precision, which represents near-perfect spatial resolution. + +The xy shape field type is similar to the [geoshape]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-shape/) field type, but it represents shapes on the Cartesian plane, which is not based on the Earth-fixed terrestrial reference system. The coordinates of an xy shape are single-precision floating-point values. For information about the range and precision of floating-point values, see [Numeric field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/). 
+ +## Example + +Create a mapping with an xy shape field type: + +```json +PUT testindex +{ + "mappings": { + "properties": { + "location": { + "type": "xy_shape" + } + } + } +} +``` +{% include copy-curl.html %} + +## Formats + +xy shapes can be indexed in the following formats: + +- [GeoJSON](https://geojson.org/) +- [Well-known text (WKT)](https://docs.opengeospatial.org/is/12-063r5/12-063r5.html) + +In both GeoJSON and WKT, the coordinates must be specified in the `x, y` order within coordinate arrays. +{: .note} + +## xy shape types + +The following table describes the possible xy shape types and their relationship to the GeoJSON and WKT types. + +OpenSearch type | GeoJSON type | WKT type | Description +:--- | :--- | :--- | :--- +[`point`](#point) | Point | POINT | A point specified by x and y coordinates. +[`linestring`](#linestring) | LineString | LINESTRING | A line specified by two or more points. May be a straight line or a path of connected line segments. +[`polygon`](#polygon) | Polygon | POLYGON | A polygon specified by a list of vertices in coordinate form. The polygon must be closed, meaning the last point must be the same as the first point. Therefore, to create an n-gon, n+1 vertices are required. The minimum number of vertices is four, which creates a triangle. +[`multipoint`](#multipoint) | MultiPoint | MULTIPOINT | An array of discrete related points that are not connected. +[`multilinestring`](#multilinestring) | MultiLineString | MULTILINESTRING | An array of linestrings. +[`multipolygon`](#multipolygon) | MultiPolygon | MULTIPOLYGON | An array of polygons. +[`geometrycollection`](#geometry-collection) | GeometryCollection | GEOMETRYCOLLECTION | A collection of xy shapes that may be of different types. +[`envelope`](#envelope) | N/A | BBOX | A bounding rectangle specified by upper-left and lower-right vertices. + +## Point + +A point is specified by a single pair of coordinates. 
+ +Index a point in GeoJSON format: + +```json +PUT testindex/_doc/1 +{ + "location" : { + "type" : "point", + "coordinates" : [0.5, 4.5] + } +} +``` +{% include copy-curl.html %} + +Index a point in WKT format: + +```json +PUT testindex/_doc/1 +{ + "location" : "POINT (0.5 4.5)" +} +``` +{% include copy-curl.html %} + +## Linestring + +A linestring is a line specified by two or more points. If the points are collinear, the linestring is a straight line. Otherwise, the linestring represents a path made of line segments. + +Index a linestring in GeoJSON format: + +```json +PUT testindex/_doc/2 +{ + "location" : { + "type" : "linestring", + "coordinates" : [[0.5, 4.5], [-1.5, 2.3]] + } +} +``` +{% include copy-curl.html %} + +Index a linestring in WKT format: + +```json +PUT testindex/_doc/2 +{ + "location" : "LINESTRING (0.5 4.5, -1.5 2.3)" +} +``` +{% include copy-curl.html %} + +## Polygon + +A polygon is specified by a list of vertices in coordinate form. The polygon must be closed, meaning the last point must be the same as the first point. In the following example, a triangle is created using four points. + +GeoJSON requires that you list the vertices of the polygon counterclockwise. WKT does not impose a specific order on vertices. +{: .note} + +Index a polygon (triangle) in GeoJSON format: + +```json +PUT testindex/_doc/3 +{ + "location" : { + "type" : "polygon", + "coordinates" : [ + [[0.5, 4.5], + [2.5, 6.0], + [1.5, 2.0], + [0.5, 4.5]] + ] + } +} +``` +{% include copy-curl.html %} + +Index a polygon (triangle) in WKT format: + +```json +PUT testindex/_doc/3 +{ + "location" : "POLYGON ((0.5 4.5, 2.5 6.0, 1.5 2.0, 0.5 4.5))" +} +``` +{% include copy-curl.html %} + +The polygon may have holes inside. In this case, the `coordinates` field will contain multiple arrays. The first array represents the outer polygon, and each subsequent array represents a hole. Holes are represented as polygons and specified as arrays of coordinates. 
+ +GeoJSON requires that you list the vertices of the polygon counterclockwise and the vertices of the hole clockwise. WKT does not impose a specific order on vertices. +{: .note} + +Index a polygon (triangle) with a triangular hole in GeoJSON format: + +```json +PUT testindex/_doc/4 +{ + "location" : { + "type" : "polygon", + "coordinates" : [ + [[0.5, 4.5], + [2.5, 6.0], + [1.5, 2.0], + [0.5, 4.5]], + + [[1.0, 4.5], + [1.5, 4.5], + [1.5, 4.0], + [1.0, 4.5]] + ] + } +} +``` +{% include copy-curl.html %} + +Index a polygon (triangle) with a triangular hole in WKT format: + +```json +PUT testindex/_doc/4 +{ + "location" : "POLYGON ((0.5 4.5, 2.5 6.0, 1.5 2.0, 0.5 4.5), (1.0 4.5, 1.5 4.5, 1.5 4.0, 1.0 4.5))" +} +``` +{% include copy-curl.html %} + +By default, the vertices of the polygon are traversed in a counterclockwise order. You can define an [`orientation`](#parameters) parameter to specify the vertex traversal order at mapping time: + +```json +PUT testindex +{ + "mappings": { + "properties": { + "location": { + "type": "xy_shape", + "orientation" : "left" + } + } + } +} +``` +{% include copy-curl.html %} + +Subsequently indexed documents can override the `orientation` setting: + +```json +PUT testindex/_doc/3 +{ + "location" : { + "type" : "polygon", + "orientation" : "cw", + "coordinates" : [ + [[0.5, 4.5], + [2.5, 6.0], + [1.5, 2.0], + [0.5, 4.5]] + ] + } +} +``` +{% include copy-curl.html %} + +## Multipoint + +A multipoint is an array of discrete related points that are not connected. + +Index a multipoint in GeoJSON format: + +```json +PUT testindex/_doc/6 +{ + "location" : { + "type" : "multipoint", + "coordinates" : [ + [0.5, 4.5], + [2.5, 6.0] + ] + } +} +``` +{% include copy-curl.html %} + +Index a multipoint in WKT format: + +```json +PUT testindex/_doc/6 +{ + "location" : "MULTIPOINT (0.5 4.5, 2.5 6.0)" +} +``` +{% include copy-curl.html %} + +## Multilinestring + +A multilinestring is an array of linestrings. 
+ +Index a multilinestring in GeoJSON format: + +```json +PUT testindex/_doc/2 +{ + "location" : { + "type" : "multilinestring", + "coordinates" : [ + [[0.5, 4.5], [2.5, 6.0]], + [[1.5, 2.0], [3.5, 3.5]] + ] + } +} +``` +{% include copy-curl.html %} + +Index a multilinestring in WKT format: + +```json +PUT testindex/_doc/2 +{ + "location" : "MULTILINESTRING ((0.5 4.5, 2.5 6.0), (1.5 2.0, 3.5 3.5))" +} +``` +{% include copy-curl.html %} + +## Multipolygon + +A multipolygon is an array of polygons. In this example, the first polygon contains a hole, and the second does not. + +Index a multipolygon in GeoJSON format: + +```json +PUT testindex/_doc/4 +{ + "location" : { + "type" : "multipolygon", + "coordinates" : [ + [ + [[0.5, 4.5], + [2.5, 6.0], + [1.5, 2.0], + [0.5, 4.5]], + + [[1.0, 4.5], + [1.5, 4.5], + [1.5, 4.0], + [1.0, 4.5]] + ], + [ + [[2.0, 0.0], + [1.0, 2.0], + [3.0, 1.0], + [2.0, 0.0]] + ] + ] + } +} +``` +{% include copy-curl.html %} + +Index a multipolygon in WKT format: + +```json +PUT testindex/_doc/4 +{ + "location" : "MULTIPOLYGON (((0.5 4.5, 2.5 6.0, 1.5 2.0, 0.5 4.5), (1.0 4.5, 1.5 4.5, 1.5 4.0, 1.0 4.5)), ((2.0 0.0, 1.0 2.0, 3.0 1.0, 2.0 0.0)))" +} +``` +{% include copy-curl.html %} + +## Geometry collection + +A geometry collection is a collection of xy shapes that may be of different types. + +Index a geometry collection in GeoJSON format: + +```json +PUT testindex/_doc/7 +{ + "location" : { + "type": "geometrycollection", + "geometries": [ + { + "type": "point", + "coordinates": [0.5, 4.5] + }, + { + "type": "linestring", + "coordinates": [[2.5, 6.0], [1.5, 2.0]] + } + ] + } +} +``` +{% include copy-curl.html %} + +Index a geometry collection in WKT format: + +```json +PUT testindex/_doc/7 +{ + "location" : "GEOMETRYCOLLECTION (POINT (0.5 4.5), LINESTRING(2.5 6.0, 1.5 2.0))" +} +``` +{% include copy-curl.html %} + +## Envelope + +An envelope is a bounding rectangle specified by upper-left and lower-right vertices.
The GeoJSON format is `[[minX, maxY], [maxX, minY]]`. + +Index an envelope in GeoJSON format: + +```json +PUT testindex/_doc/2 +{ + "location" : { + "type" : "envelope", + "coordinates" : [[3.0, 2.0], [6.0, 0.0]] + } +} +``` +{% include copy-curl.html %} + +In WKT format, use `BBOX (minX, maxY, maxX, minY)`. + +Index an envelope in WKT BBOX format: + +```json +PUT testindex/_doc/8 +{ + "location" : "BBOX (3.0, 2.0, 6.0, 0.0)" +} +``` +{% include copy-curl.html %} + +## Parameters + +The following table lists the parameters accepted by xy shape field types. All parameters are optional. + +Parameter | Description +:--- | :--- +`coerce` | A Boolean value that specifies whether to automatically close unclosed linear rings. Default is `false`. +`ignore_malformed` | A Boolean value that specifies whether to ignore malformed GeoJSON or WKT xy shapes instead of throwing an exception. Default is `false` (throw an exception when xy shapes are malformed). +`ignore_z_value` | Specific to points with three coordinates. If `ignore_z_value` is `true`, the third coordinate is not indexed but is still stored in the `_source` field. If `ignore_z_value` is `false`, an exception is thrown. Default is `true`. +`orientation` | Specifies the traversal order of the vertices in the xy shape's list of coordinates. `orientation` takes the following values:
1. RIGHT: counterclockwise. Specify RIGHT orientation by using one of the following strings (uppercase or lowercase): `right`, `counterclockwise`, `ccw`.
2. LEFT: clockwise. Specify LEFT orientation by using one of the following strings (uppercase or lowercase): `left`, `clockwise`, `cw`. This value can be overridden by individual documents.
Default is `RIGHT`. \ No newline at end of file diff --git a/_field-types/xy.md b/_field-types/xy.md new file mode 100644 index 0000000000..2ab07be55e --- /dev/null +++ b/_field-types/xy.md @@ -0,0 +1,28 @@ +--- +layout: default +title: Cartesian field types +nav_order: 57 +has_children: true +has_toc: false +parent: Supported field types +redirect_from: + - /opensearch/supported-field-types/xy/ +--- + +# Cartesian field types + +Cartesian field types facilitate indexing and searching of points and shapes in a two-dimensional Cartesian coordinate system. Cartesian field types are similar to [geographic]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geographic/) field types, except they represent points and shapes on the Cartesian plane, which is not based on the Earth-fixed terrestrial reference system. Calculating distances on a plane is more efficient than calculating distances on a sphere, so distance sorting is faster for Cartesian field types. + +Cartesian field types work well for spatial applications like virtual reality, computer-aided design (CAD), and amusement park and sporting venue mapping. + +The coordinates for the Cartesian field types are single-precision floating-point values. For information about the range and precision of floating-point values, see [Numeric field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/). + +The following table lists all Cartesian field types that OpenSearch supports. + +Field Data type | Description +:--- | :--- +[`xy_point`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/xy-point/) | A point in a two-dimensional Cartesian coordinate system, specified by x and y coordinates. +[`xy_shape`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/xy-shape/) | A shape, such as a polygon or a collection of xy points, in a two-dimensional Cartesian coordinate system. 
+ +Currently, OpenSearch supports indexing and searching of Cartesian field types but not aggregations on Cartesian field types. If you'd like to see aggregations implemented, open a [GitHub issue](https://github.com/opensearch-project/geospatial). +{: .note} \ No newline at end of file diff --git a/_opensearch/data-streams.md b/_im-plugin/data-streams.md similarity index 98% rename from _opensearch/data-streams.md rename to _im-plugin/data-streams.md index 80b988cfe3..e2cbc49720 100644 --- a/_opensearch/data-streams.md +++ b/_im-plugin/data-streams.md @@ -140,6 +140,12 @@ GET _data_stream/logs-nginx/_stats } ``` +To see information about all data streams, use the following request: + +```json +GET _data_stream +``` + ### Step 3: Ingest data into the data stream To ingest data into a data stream, you can use the regular indexing APIs. Make sure every document that you index has a timestamp field. If you try to ingest a document that doesn't have a timestamp field, you get an error. diff --git a/_opensearch/index-alias.md b/_im-plugin/index-alias.md similarity index 71% rename from _opensearch/index-alias.md rename to _im-plugin/index-alias.md index 2cfdffe07e..85a3c79f8e 100644 --- a/_opensearch/index-alias.md +++ b/_im-plugin/index-alias.md @@ -1,7 +1,9 @@ --- layout: default title: Index aliases -nav_order: 12 +nav_order: 11 +redirect_from: + - /opensearch/index-alias/ --- # Index aliases @@ -57,7 +59,25 @@ You should see the following response: If this request fails, make sure the index that you're adding to the alias already exists. -To check if `alias1` refers to `index-1`, run the following command: +You can also create an alias using one of the following requests: + +```json +PUT /_aliases/ +PUT /_aliases/ +POST /_alias/ +POST /_alias/ +``` + +The `` in the above requests can be an index name, a comma-separated list of index names, or a wildcard expression. Use `_all` to refer to all indexes. 
+ +To check if `alias1` refers to `index-1`, run one of the following commands: + +```json +GET /_alias/alias1 +GET /index-1/_alias/alias1 +``` + +To get the mappings and settings information of the indexes that the alias references, run the following command: ```json GET alias1 @@ -145,10 +165,18 @@ Conversely, to find which alias points to a specific index, run the following co GET /index-2/_alias/* ``` -To check if an alias exists, run the following command: +To get all index names and their aliases, run the following command: + +```json +GET /_alias +``` + +To check if an alias exists, run one of the following commands: ```json HEAD /alias1/_alias/ +HEAD /_alias/alias1/ +HEAD index-1/_alias/alias1/ ``` ## Add aliases at index creation @@ -200,3 +228,23 @@ Option | Valid values | Description | Required `filter` | Object | Add a filter to the alias. | No `routing` | String | Limit search to an associated shard value. You can specify `search_routing` and `index_routing` independently. | No `is_write_index` | String | Specify the index that accepts any write operations to the alias. If this value is not specified, then no write operations are allowed. | No + + +## Delete aliases + +To delete one or more aliases from an index, use the following request: + +```json +DELETE <index>/_alias/<alias> +DELETE <index>/_aliases/<alias> +``` + +Both `<index>` and `<alias>` in the above request support comma-separated lists and wildcard expressions. Use `_all` in place of `<alias>` to delete all aliases for the indexes listed in `<index>`. + +For example, if `alias1` refers to `index-1` and `index-2`, you can run the following command to remove `alias1` from `index-1`: + +```json +DELETE index-1/_alias/alias1 +``` + +After you run the request above, `alias1` no longer refers to `index-1`, but still refers to `index-2`.
\ No newline at end of file diff --git a/_im-plugin/index-rollups/index.md b/_im-plugin/index-rollups/index.md index 57eed843b5..e2cac72911 100644 --- a/_im-plugin/index-rollups/index.md +++ b/_im-plugin/index-rollups/index.md @@ -3,8 +3,8 @@ layout: default title: Index rollups nav_order: 35 has_children: true -redirect_from: /im-plugin/index-rollups/ -has_toc: false +redirect_from: + - /im-plugin/index-rollups/ --- # Index rollups @@ -347,110 +347,486 @@ POST example_rollup/_search ```json { - "took": 476, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 + "took" : 14, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 }, - "hits": { - "total": { - "value": 281, - "relation": "eq" + "hits" : { + "total" : { + "value" : 281, + "relation" : "eq" }, - "max_score": null, - "hits": [] + "max_score" : null, + "hits" : [ ] }, - "aggregations": { - "daily_numbers": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [ + "aggregations" : { + "daily_numbers" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { - "key": "Friday", - "doc_count": 53, - "total_revenue": { - "value": 4858.84375 + "key" : "Friday", + "doc_count" : 59, + "total_revenue" : { + "value" : 4858.84375 }, - "per_city": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [ + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { - "key": "Los Angeles", - "doc_count": 53, - "average quantity": { - "value": 2.305084745762712 + "key" : "Los Angeles", + "doc_count" : 59, + "average quantity" : { + "value" : 2.305084745762712 } } ] } }, { - "key": "Saturday", - "doc_count": 43, - "total_revenue": { - "value": 3547.203125 + "key" : "Saturday", + "doc_count" : 46, + "total_revenue" : { + "value" : 3547.203125 }, - "per_city": { - "doc_count_error_upper_bound": 0, - 
"sum_other_doc_count": 0, - "buckets": [ + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { - "key": "Los Angeles", - "doc_count": 43, - "average quantity": { - "value": 2.260869565217391 + "key" : "Los Angeles", + "doc_count" : 46, + "average quantity" : { + "value" : 2.260869565217391 } } ] } }, { - "key": "Tuesday", - "doc_count": 42, - "total_revenue": { - "value": 3983.28125 + "key" : "Tuesday", + "doc_count" : 45, + "total_revenue" : { + "value" : 3983.28125 }, - "per_city": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [ + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { - "key": "Los Angeles", - "doc_count": 42, - "average quantity": { - "value": 2.2888888888888888 + "key" : "Los Angeles", + "doc_count" : 45, + "average quantity" : { + "value" : 2.2888888888888888 } } ] } }, { - "key": "Sunday", - "doc_count": 40, - "total_revenue": { - "value": 3308.1640625 + "key" : "Sunday", + "doc_count" : 44, + "total_revenue" : { + "value" : 3308.1640625 }, - "per_city": { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, - "buckets": [ + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { - "key": "Los Angeles", - "doc_count": 40, - "average quantity": { - "value": 2.090909090909091 + "key" : "Los Angeles", + "doc_count" : 44, + "average quantity" : { + "value" : 2.090909090909091 + } + } + ] + } + }, + { + "key" : "Thursday", + "doc_count" : 40, + "total_revenue" : { + "value" : 2876.125 + }, + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Los Angeles", + "doc_count" : 40, + "average quantity" : { + "value" : 2.3 + } + } + ] + } + }, + { + "key" : "Monday", + "doc_count" : 38, + "total_revenue" : { + "value" : 2673.453125 + }, + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + 
"buckets" : [ + { + "key" : "Los Angeles", + "doc_count" : 38, + "average quantity" : { + "value" : 2.1578947368421053 + } + } + ] + } + }, + { + "key" : "Wednesday", + "doc_count" : 38, + "total_revenue" : { + "value" : 3202.453125 + }, + "per_city" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Los Angeles", + "doc_count" : 38, + "average quantity" : { + "value" : 2.236842105263158 } } ] } } - ... ] } } } ``` + +## The doc_count field + +The `doc_count` field in bucket aggregations contains the number of documents collected in each bucket. When calculating the bucket's `doc_count`, the number of documents is incremented by the number of the pre-aggregated documents in each summary document. The `doc_count` returned from rollup searches represents the total number of matching documents from the source index. The document count for each bucket is the same whether you search the source index or the rollup target index. + +## Query string queries + +To take advantage of shorter and more easily written strings in Query DSL, you can use [query strings]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text/query-string/) to simplify search queries in rollup indexes. 
To use query strings, add the following fields to your rollup search request: + +```json +"query": { + "query_string": { + "query": "field_name:field_value" + } + } +``` + +The following example uses a query string with a `*` wildcard operator to search inside a rollup index called `my_server_logs_rollup`: + +```json +GET my_server_logs_rollup/_search +{ + "size": 0, + "query": { + "query_string": { + "query": "email* OR inventory", + "default_field": "service_name" + } + }, + + "aggs": { + "service_name": { + "terms": { + "field": "service_name" + }, + "aggs": { + "region": { + "terms": { + "field": "region" + }, + "aggs": { + "average quantity": { + "avg": { + "field": "cpu_usage" + } + } + } + } + } + } + } +} +``` + +For more information about query string query parameters, see [Query string query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text/query-string/#parameters). + +## Dynamic target index + + + +In ISM rollup, the `target_index` field may contain a template that is compiled at the time of each rollup indexing. For example, if you specify the `target_index` field as `{% raw %}rollup_ndx-{{ctx.source_index}}{% endraw %}`, the source index `log-000001` will roll up into a target index `rollup_ndx-log-000001`. This allows you to roll up data into multiple time-based indexes, with one rollup job created for each source index. + +The `source_index` parameter in {% raw %}`{{ctx.source_index}}`{% endraw %} cannot contain wildcards. +{: .note} + +## Searching multiple rollup indexes + +When data is rolled up into multiple target indexes, you can run one search across all of the rollup indexes. To search multiple target indexes that have the same rollup, specify the index names as a comma-separated list or a wildcard pattern. For example, with `target_index` as `{% raw %}rollup_ndx-{{ctx.source_index}}{% endraw %}` and source indexes that start with `log`, specify the `rollup_ndx-log*` pattern. 
Or, to search for rolled up log-000001 and log-000002 indexes, specify the `rollup_ndx-log-000001,rollup_ndx-log-000002` list. + +You cannot search a mix of rollup and non-rollup indexes with the same query. +{: .note} + +## Example + +The following example demonstrates the `doc_count` field, dynamic index names, and searching multiple rollup indexes with the same rollup. + +**Step 1:** Add an index template for ISM to manage the rolling over of the indexes aliased by `log`: + +```json +PUT _index_template/ism_rollover +{ + "index_patterns": ["log*"], + "template": { + "settings": { + "plugins.index_state_management.rollover_alias": "log" + } + } +} +``` + +**Step 2:** Set up an ISM rollover policy to roll over any index whose name starts with `log*` after one document is uploaded to it, and then roll up the individual backing index. The target index name is dynamically generated from the source index name by prepending the string `rollup_ndx-` to the source index name. + +```json +PUT _plugins/_ism/policies/rollover_policy +{ + "policy": { + "description": "Example rollover policy.", + "default_state": "rollover", + "states": [ + { + "name": "rollover", + "actions": [ + { + "rollover": { + "min_doc_count": 1 + } + } + ], + "transitions": [ + { + "state_name": "rp" + } + ] + }, + { + "name": "rp", + "actions": [ + { + "rollup": { + "ism_rollup": { + "target_index": {% raw %}"rollup_ndx-{{ctx.source_index}}"{% endraw %}, + "description": "Example rollup job", + "page_size": 200, + "dimensions": [ + { + "date_histogram": { + "source_field": "ts", + "fixed_interval": "60m", + "timezone": "America/Los_Angeles" + } + }, + { + "terms": { + "source_field": "message.keyword" + } + } + ], + "metrics": [ + { + "source_field": "msg_size", + "metrics": [ + { + "sum": {} + } + ] + } + ] + } + } + } + ], + "transitions": [] + } + ], + "ism_template": { + "index_patterns": ["log*"], + "priority": 100 + } + } +} +``` + +**Step 3:** Create an index named `log-000001` and set up an 
alias `log` for it. + +```json +PUT log-000001 +{ + "aliases": { + "log": { + "is_write_index": true + } + } +} +``` + +**Step 4:** Index four documents into the index created above. Two of the documents have the message "Success", and two have the message "Error". + +```json +POST log/_doc?refresh=true +{ + "ts" : "2022-08-26T09:28:48-04:00", + "message": "Success", + "msg_size": 10 +} +``` + +```json +POST log/_doc?refresh=true +{ + "ts" : "2022-08-26T10:06:25-04:00", + "message": "Error", + "msg_size": 20 +} +``` + +```json +POST log/_doc?refresh=true +{ + "ts" : "2022-08-26T10:23:54-04:00", + "message": "Error", + "msg_size": 30 +} +``` + +```json +POST log/_doc?refresh=true +{ + "ts" : "2022-08-26T10:53:41-04:00", + "message": "Success", + "msg_size": 40 +} +``` + +Once you index the first document, the rollover action is executed. This action creates the index `log-000002` with `rollover_policy` attached to it. Then the rollup action is executed, which creates the rollup index `rollup_ndx-log-000001`. + +To monitor the status of rollover and rollup index creation, you can use the ISM explain API: `GET _plugins/_ism/explain` +{: .tip} + +**Step 5:** Search the rollup index. 
+ +```json +GET rollup_ndx-log-*/_search +{ + "size": 0, + "query": { + "match_all": {} + }, + "aggregations": { + "message_numbers": { + "terms": { + "field": "message.keyword" + }, + "aggs": { + "per_message": { + "terms": { + "field": "message.keyword" + }, + "aggregations": { + "sum_message": { + "sum": { + "field": "msg_size" + } + } + } + } + } + } + } +} +``` + +The response contains two buckets, "Error" and "Success", and the document count for each bucket is 2: + +```json +{ + "took" : 30, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 4, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "message_numbers" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Success", + "doc_count" : 2, + "per_message" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Success", + "doc_count" : 2, + "sum_message" : { + "value" : 50.0 + } + } + ] + } + }, + { + "key" : "Error", + "doc_count" : 2, + "per_message" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "Error", + "doc_count" : 2, + "sum_message" : { + "value" : 50.0 + } + } + ] + } + } + ] + } + } +} +``` \ No newline at end of file diff --git a/_im-plugin/index-rollups/rollup-api.md b/_im-plugin/index-rollups/rollup-api.md index 8099d792f5..17ad0580ab 100644 --- a/_im-plugin/index-rollups/rollup-api.md +++ b/_im-plugin/index-rollups/rollup-api.md @@ -91,7 +91,7 @@ You can specify the following options. Options | Description | Type | Required :--- | :--- |:--- |:--- | `source_index` | The name of the detector. | String | Yes -`target_index` | Specify the target index that the rolled up data is ingested into. You could either create a new target index or use an existing index. 
The target index cannot be a combination of raw and rolled up data. | String | Yes +`target_index` | Specify the target index that the rolled up data is ingested into. You can either create a new target index or use an existing index. The target index cannot be a combination of raw and rolled up data. This field supports dynamically generated index names like {% raw %}`rollup_{{ctx.source_index}}`{% endraw %}, where `source_index` cannot contain wildcards. | String | Yes `schedule` | Schedule of the index rollup job which can be an interval or a cron expression. | Object | Yes `schedule.interval` | Specify the frequency of execution of the rollup job. | Object | No `schedule.interval.start_time` | Start time of the interval. | Timestamp | Yes diff --git a/_opensearch/index-templates.md b/_im-plugin/index-templates.md similarity index 99% rename from _opensearch/index-templates.md rename to _im-plugin/index-templates.md index b979b3e341..015c990822 100644 --- a/_opensearch/index-templates.md +++ b/_im-plugin/index-templates.md @@ -1,7 +1,9 @@ --- layout: default title: Index templates -nav_order: 15 +nav_order: 6 +redirect_from: + - /opensearch/index-templates/ --- # Index templates diff --git a/_im-plugin/index.md b/_im-plugin/index.md index 809d4775e8..f7cd370e9c 100644 --- a/_im-plugin/index.md +++ b/_im-plugin/index.md @@ -1,14 +1,285 @@ --- layout: default -title: About Index Management +title: Managing indexes nav_order: 1 has_children: false redirect_from: - /im-plugin/ + - /opensearch/index-data/ --- -# About Index Management +# Managing indexes OpenSearch Dashboards {: .label .label-yellow :} -The Index Management (IM) plugin lets you automate recurring index management activities and reduce storage costs. +You index data using the OpenSearch REST API. Two APIs exist: the index API and the `_bulk` API. 
+ +For situations in which new data arrives incrementally (for example, customer orders from a small business), you might use the index API to add documents individually as they arrive. For situations in which the flow of data is less frequent (for example, weekly updates to a marketing website), you might prefer to generate a file and send it to the `_bulk` API. For large numbers of documents, lumping requests together and using the `_bulk` API offers superior performance. If your documents are enormous, however, you might need to index them individually. + + +## Introduction to indexing + +Before you can search data, you must *index* it. Indexing is the method by which search engines organize data for fast retrieval. The resulting structure is called, fittingly, an index. + +In OpenSearch, the basic unit of data is a JSON *document*. Within an index, OpenSearch identifies each document using a unique ID. + +A request to the index API looks like this: + +```json +PUT <index>/_doc/<id> +{ "A JSON": "document" } +``` + +A request to the `_bulk` API looks a little different, because you specify the index and ID in the bulk data: + +```json +POST _bulk +{ "index": { "_index": "<index>", "_id": "<id>" } } +{ "A JSON": "document" } +``` + +Bulk data must conform to a specific format, which requires a newline character (`\n`) at the end of every line, including the last line. This is the basic format: + +``` +Action and metadata\n +Optional document\n +Action and metadata\n +Optional document\n +``` + +The document is optional, because `delete` actions don't require a document. The other actions (`index`, `create`, and `update`) all require a document. If you specifically want the action to fail if the document already exists, use the `create` action instead of the `index` action.
+{: .note } + +To index bulk data using the `curl` command, navigate to the folder where you have your file saved and run the following command: + +```json +curl -H "Content-Type: application/x-ndjson" -XPOST https://localhost:9200/data/_bulk -u 'admin:admin' --insecure --data-binary "@data.json" +``` + +If any one of the actions in the `_bulk` API fails, OpenSearch continues to execute the other actions. Examine the `items` array in the response to figure out what went wrong. The entries in the `items` array are in the same order as the actions specified in the request. + +OpenSearch automatically creates an index when you add a document to an index that doesn't already exist. It also automatically generates an ID if you don't specify an ID in the request. This simple example automatically creates the `movies` index, indexes the document, and assigns it a unique ID: + +```json +POST movies/_doc +{ "title": "Spirited Away" } +``` + +Automatic ID generation has a clear downside: because the indexing request didn't specify a document ID, you can't easily update the document at a later time. Also, if you run this request 10 times, OpenSearch indexes this document as 10 different documents with unique IDs. To specify an ID of 1, use the following request (note the use of PUT instead of POST): + +```json +PUT movies/_doc/1 +{ "title": "Spirited Away" } +``` + +Because you must specify an ID, if you run this command 10 times, you still have just one document indexed with the `_version` field incremented to 10. + +Indexes default to one primary shard and one replica. If you want to specify non-default settings, create the index before adding documents: + +```json +PUT more-movies +{ "settings": { "number_of_shards": 6, "number_of_replicas": 2 } } +``` + +## Naming restrictions for indexes + +OpenSearch indexes have the following naming restrictions: + +- All letters must be lowercase. +- Index names can't begin with underscores (`_`) or hyphens (`-`).
+- Index names can't contain spaces, commas, or the following characters: + + `:`, `"`, `*`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, or `<` + +## Read data + +After you index a document, you can retrieve it by sending a GET request to the same endpoint that you used for indexing: + +```json +GET movies/_doc/1 + +{ + "_index" : "movies", + "_type" : "_doc", + "_id" : "1", + "_version" : 1, + "_seq_no" : 0, + "_primary_term" : 1, + "found" : true, + "_source" : { + "title" : "Spirited Away" + } +} +``` + +You can see the document in the `_source` object. If the document is not found, the `found` key is `false` and the `_source` object is not part of the response. + +To retrieve multiple documents with a single command, use the `_mget` operation. +The format for retrieving multiple documents is similar to the `_bulk` operation, where you must specify the index and ID in the request body: + +```json +GET _mget +{ + "docs": [ + { + "_index": "<index>", + "_id": "<id>" + }, + { + "_index": "<index>", + "_id": "<id>" + } + ] +} +``` + +To only return specific fields in a document: + +```json +GET _mget +{ + "docs": [ + { + "_index": "<index>", + "_id": "<id>", + "_source": "field1" + }, + { + "_index": "<index>", + "_id": "<id>", + "_source": "field2" + } + ] +} +``` + +To check if a document exists: + +```json +HEAD movies/_doc/<doc-id> +``` + +If the document exists, you get back a `200 OK` response, and if it doesn't, you get back a `404 - Not Found` error.
+ +## Update data + +To update existing fields or to add new fields, send a POST request to the `_update` operation with your changes in a `doc` object: + +```json +POST movies/_update/1 +{ + "doc": { + "title": "Castle in the Sky", + "genre": ["Animation", "Fantasy"] + } +} +``` + +Note the updated `title` field and new `genre` field: + +```json +GET movies/_doc/1 + +{ + "_index" : "movies", + "_type" : "_doc", + "_id" : "1", + "_version" : 2, + "_seq_no" : 1, + "_primary_term" : 1, + "found" : true, + "_source" : { + "title" : "Castle in the Sky", + "genre" : [ + "Animation", + "Fantasy" + ] + } +} +``` + +The document also has an incremented `_version` field. Use this field to keep track of how many times a document is updated. + +POST requests make partial updates to documents. To altogether replace a document, use a PUT request: + +```json +PUT movies/_doc/1 +{ + "title": "Spirited Away" +} +``` + +The document with ID of 1 will contain only the `title` field, because the entire document will be replaced with the document indexed in this PUT request. + +Use the `upsert` object to conditionally update documents based on whether they already exist. Here, if the document exists, its `title` field changes to `Castle in the Sky`. If it doesn't, OpenSearch indexes the document in the `upsert` object. + +```json +POST movies/_update/2 +{ + "doc": { + "title": "Castle in the Sky" + }, + "upsert": { + "title": "Only Yesterday", + "genre": ["Animation", "Fantasy"], + "date": 1993 + } +} +``` + +### Example response + +```json +{ + "_index" : "movies", + "_type" : "_doc", + "_id" : "2", + "_version" : 2, + "result" : "updated", + "_shards" : { + "total" : 2, + "successful" : 1, + "failed" : 0 + }, + "_seq_no" : 3, + "_primary_term" : 1 +} +``` + +Each update operation for a document has a unique combination of the `_seq_no` and `_primary_term` values. + +OpenSearch first writes your updates to the primary shard and then sends this change to all the replica shards. 
An uncommon issue can occur if multiple users of your OpenSearch-based application make updates to existing documents in the same index. In this situation, another user can read and update a document from a replica before it receives your update from the primary shard. Your update operation then ends up updating an older version of the document. In the best case, you and the other user make the same changes, and the document remains accurate. In the worst case, the document now contains out-of-date information. + +To prevent this situation, pass the `_seq_no` and `_primary_term` values as the `if_seq_no` and `if_primary_term` query parameters of the request: + +```json +POST movies/_update/2?if_seq_no=3&if_primary_term=1 +{ + "doc": { + "title": "Castle in the Sky", + "genre": ["Animation", "Fantasy"] + } +} +``` + +If the document is updated after you retrieved it, the `_seq_no` and `_primary_term` values are different and your update operation fails with a `409 — Conflict` error. + +When using the `_bulk` API, specify the `_seq_no` and `_primary_term` values within the action metadata. + +## Delete data + +To delete a document from an index, use a DELETE request: + +```json +DELETE movies/_doc/1 +``` + +The DELETE operation increments the `_version` field. If you add the document back to the same ID, the `_version` field increments again. This behavior occurs because OpenSearch deletes the document `_source`, but retains its metadata. + + +## Next steps + +- The Index Management (IM) plugin lets you automate recurring index management activities and reduce storage costs. For more information, see [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index). + +- For instructions on how to reindex data, see [Reindex data]({{site.url}}{{site.baseurl}}/im-plugin/reindex-data/).
+ diff --git a/_im-plugin/ism/api.md b/_im-plugin/ism/api.md index 6e0eaf2882..441f737e6f 100644 --- a/_im-plugin/ism/api.md +++ b/_im-plugin/ism/api.md @@ -546,3 +546,150 @@ DELETE _plugins/_ism/policies/policy_1 "_primary_term": 1 } ``` + +## Error prevention validation +Introduced 2.4 +{: .label .label-purple } + +ISM allows you to run an action automatically. However, running an action can fail for a variety of reasons. You can use error prevention validation to test an action in order to rule out failures. + +To enable error prevention validation, set the `plugins.index_state_management.validation_action.enabled` setting to `true`: + +```bash +PUT _cluster/settings +{ + "persistent":{ + "plugins.index_state_management.validation_action.enabled": true + } +} +``` + +#### Example response + +```json +{ + "acknowledged" : true, + "persistent" : { + "plugins" : { + "index_state_management" : { + "validation_action" : { + "enabled" : "true" + } + } + } + }, + "transient" : { } +} +``` + +To check an error prevention validation status and message, pass `validate_action=true` to the `_plugins/_ism/explain` endpoint: + +```bash +GET _plugins/_ism/explain/test-000001?validate_action=true +``` + +#### Example response + +The response contains an additional `validate` object with a validation message and status: + +```json +{ + "test-000001" : { + "index.plugins.index_state_management.policy_id" : "test_rollover", + "index.opendistro.index_state_management.policy_id" : "test_rollover", + "index" : "test-000001", + "index_uuid" : "CgKsxFmQSIa8dWqpbSJmyA", + "policy_id" : "test_rollover", + "policy_seq_no" : -2, + "policy_primary_term" : 0, + "rolled_over" : false, + "index_creation_date" : 1667410460649, + "state" : { + "name" : "rollover", + "start_time" : 1667410766045 + }, + "action" : { + "name" : "rollover", + "start_time" : 1667411127803, + "index" : 0, + "failed" : false, + "consumed_retries" : 0, + "last_retry_time" : 0 + }, + "step" : { + "name" : 
"attempt_rollover", + "start_time" : 1667411127803, + "step_status" : "starting" + }, + "retry_info" : { + "failed" : true, + "consumed_retries" : 0 + }, + "info" : { + "message" : "Previous action was not able to update IndexMetaData." + }, + "enabled" : false, + "validate" : { + "validation_message" : "Missing rollover_alias index setting [index=test-000001]", + "validation_status" : "re_validating" + } + }, + "total_managed_indices" : 1 +} +``` + +If you pass `validate_action=false` or do not pass a `validate_action` value to the `_plugins/_ism/explain` endpoint, the response will not contain an error prevention validation status and message: + +```bash +GET _plugins/_ism/explain/test-000001?validate_action=false +``` + +Or: + +```bash +GET _plugins/_ism/explain/test-000001 +``` + +#### Example response + +```json +{ + "test-000001" : { + "index.plugins.index_state_management.policy_id" : "test_rollover", + "index.opendistro.index_state_management.policy_id" : "test_rollover", + "index" : "test-000001", + "index_uuid" : "CgKsxFmQSIa8dWqpbSJmyA", + "policy_id" : "test_rollover", + "policy_seq_no" : -2, + "policy_primary_term" : 0, + "rolled_over" : false, + "index_creation_date" : 1667410460649, + "state" : { + "name" : "rollover", + "start_time" : 1667410766045 + }, + "action" : { + "name" : "rollover", + "start_time" : 1667411127803, + "index" : 0, + "failed" : false, + "consumed_retries" : 0, + "last_retry_time" : 0 + }, + "step" : { + "name" : "attempt_rollover", + "start_time" : 1667411127803, + "step_status" : "starting" + }, + "retry_info" : { + "failed" : true, + "consumed_retries" : 0 + }, + "info" : { + "message" : "Previous action was not able to update IndexMetaData." 
+ }, + "enabled" : false + }, + "total_managed_indices" : 1 +} +``` \ No newline at end of file diff --git a/_im-plugin/ism/error-prevention/api.md b/_im-plugin/ism/error-prevention/api.md new file mode 100644 index 0000000000..a273d25cfb --- /dev/null +++ b/_im-plugin/ism/error-prevention/api.md @@ -0,0 +1,156 @@ +--- +layout: default +title: ISM Error Prevention API +parent: ISM Error Prevention +grand_parent: Index State Management +nav_order: 10 +--- + +# ISM Error Prevention API + +The ISM Error Prevention API allows you to enable Index State Management (ISM) error prevention and check the validation status and message. + +## Enable error prevention validation + +You can enable error prevention validation by setting the `plugins.index_state_management.validation_action.enabled` setting to `true`. + +#### Example request + +```bash +PUT _cluster/settings +{ + "persistent":{ + "plugins.index_state_management.validation_action.enabled": true + } +} +``` + +#### Example response + +```json +{ + "acknowledged" : true, + "persistent" : { + "plugins" : { + "index_state_management" : { + "validation_action" : { + "enabled" : "true" + } + } + } + }, + "transient" : { } +} +``` + +## Check validation status and message via the Explain API + +Pass the `validate_action=true` query parameter in the Explain API URI to see the validation status and message. 
+ +#### Example request + +```bash +GET _plugins/_ism/explain/test-000001?validate_action=true +``` + +#### Example response + +```json +{ + "test-000001" : { + "index.plugins.index_state_management.policy_id" : "test_rollover", + "index.opendistro.index_state_management.policy_id" : "test_rollover", + "index" : "test-000001", + "index_uuid" : "CgKsxFmQSIa8dWqpbSJmyA", + "policy_id" : "test_rollover", + "policy_seq_no" : -2, + "policy_primary_term" : 0, + "rolled_over" : false, + "index_creation_date" : 1667410460649, + "state" : { + "name" : "rollover", + "start_time" : 1667410766045 + }, + "action" : { + "name" : "rollover", + "start_time" : 1667411127803, + "index" : 0, + "failed" : false, + "consumed_retries" : 0, + "last_retry_time" : 0 + }, + "step" : { + "name" : "attempt_rollover", + "start_time" : 1667411127803, + "step_status" : "starting" + }, + "retry_info" : { + "failed" : true, + "consumed_retries" : 0 + }, + "info" : { + "message" : "Previous action was not able to update IndexMetaData." + }, + "enabled" : false, + "validate" : { + "validation_message" : "Missing rollover_alias index setting [index=test-000001]", + "validation_status" : "re_validating" + } + }, + "total_managed_indices" : 1 +} +``` + +If you pass `validate_action=false` or do not pass a `validate_action` value, the response does not contain the validation status and message. The response returns the validation status and message only if you pass `validate_action=true`. 
+ +#### Example request + +```bash +GET _plugins/_ism/explain/test-000001?validate_action=false + --- OR --- +GET _plugins/_ism/explain/test-000001 +``` + +#### Example response + +```json +{ + "test-000001" : { + "index.plugins.index_state_management.policy_id" : "test_rollover", + "index.opendistro.index_state_management.policy_id" : "test_rollover", + "index" : "test-000001", + "index_uuid" : "CgKsxFmQSIa8dWqpbSJmyA", + "policy_id" : "test_rollover", + "policy_seq_no" : -2, + "policy_primary_term" : 0, + "rolled_over" : false, + "index_creation_date" : 1667410460649, + "state" : { + "name" : "rollover", + "start_time" : 1667410766045 + }, + "action" : { + "name" : "rollover", + "start_time" : 1667411127803, + "index" : 0, + "failed" : false, + "consumed_retries" : 0, + "last_retry_time" : 0 + }, + "step" : { + "name" : "attempt_rollover", + "start_time" : 1667411127803, + "step_status" : "starting" + }, + "retry_info" : { + "failed" : true, + "consumed_retries" : 0 + }, + "info" : { + "message" : "Previous action was not able to update IndexMetaData." + }, + "enabled" : false + }, + "total_managed_indices" : 1 +} +``` diff --git a/_im-plugin/ism/error-prevention/index.md b/_im-plugin/ism/error-prevention/index.md new file mode 100644 index 0000000000..08654fcb5d --- /dev/null +++ b/_im-plugin/ism/error-prevention/index.md @@ -0,0 +1,70 @@ +--- +layout: default +title: ISM Error Prevention +nav_order: 90 +has_children: true +has_toc: false +redirect_from: + - /im-plugin/ism/error-prevention/ + - /im-plugin/ism/error-prevention/index/ +--- + +# ISM error prevention + +Error prevention validates Index State Management (ISM) actions before they are performed in order to prevent actions from failing. It also outputs additional information from the action validation results in the response of the [Index Explain API]({{site.url}}{{site.baseurl}}/im-plugin/ism/api/#explain-index). Validation rules and troubleshooting of each action are listed in the following sections. 
+ +--- + +#### Table of contents +1. TOC +{:toc} + + +--- + +## rollover + +ISM does not perform a `rollover` action for an index under any of these conditions: + +- [The index is not the write index]({{site.url}}{{site.baseurl}}/im-plugin/ism/error-prevention/resolutions/#the-index-is-not-the-write-index). +- [The index does not have an alias]({{site.url}}{{site.baseurl}}/im-plugin/ism/error-prevention/resolutions/#the-index-does-not-have-an-alias). +- [The rollover policy does not contain a rollover_alias index setting]({{site.url}}{{site.baseurl}}/im-plugin/ism/error-prevention/resolutions/#the-rollover-policy-misses-rollover_alias-index-setting). +- [Skipping of a rollover action has occurred]({{site.url}}{{site.baseurl}}/im-plugin/ism/error-prevention/resolutions/#skipping-rollover-action-is-true). +- [The index has already been rolled over using the alias successfully]({{site.url}}{{site.baseurl}}/im-plugin/ism/error-prevention/resolutions/#this-index-has-already-been-rolled-over-successfully). + +## delete + +ISM does not perform a `delete` action for an index under any of these conditions: + +- The index does not exist. +- The index name is invalid. +- The index is the write index for a data stream. + +## force_merge + +ISM does not perform a `force_merge` action for an index if its dataset is too large and exceeds the threshold. + +## replica_count + +ISM does not perform a `replica_count` action for an index under any of these conditions: + +- The amount of data exceeds the threshold. +- The number of shards exceeds the maximum. + +## open + +ISM does not perform an `open` action for an index under any of these conditions: + +- The index is blocked. +- The number of shards exceeds the maximum. + +## read_only + +ISM does not perform a `read_only` action for an index under any of these conditions: + +- The index is blocked. +- The amount of data exceeds the threshold. 
+ +## read_write + +ISM does not perform a `read_write` action for an index if the index is blocked. diff --git a/_im-plugin/ism/error-prevention/resolutions.md b/_im-plugin/ism/error-prevention/resolutions.md new file mode 100644 index 0000000000..5502691fb3 --- /dev/null +++ b/_im-plugin/ism/error-prevention/resolutions.md @@ -0,0 +1,229 @@ +--- +layout: default +title: ISM Error Prevention resolutions +parent: ISM Error Prevention +grand_parent: Index State Management +nav_order: 5 +--- + +# ISM error prevention resolutions + +Resolutions of errors for each validation rule action are listed in the following sections. + +--- + +#### Table of contents +1. TOC +{:toc} + + +--- + +## The index is not the write index + +To confirm that the index is a write index, run the following request: + +```bash +GET /_alias?pretty +``` + +If the response does not contain `"is_write_index"` : true, the index is not a write index. The following example confirms that the index is a write index: + +```json +{ + "" : { + "aliases" : { + "" : { + "is_write_index" : true + } + } + } +} +``` + +To set the index as a write index, run the following request: + +```bash +PUT +{ + "aliases": { + "" : { + "is_write_index" : true + } + } +} +``` + +## The index does not have an alias + +If the index does not have an alias, you can add one by running the following request: + +```bash +POST _aliases +{ + "actions": [ + { + "add": { + "index": "", + "alias": "" + } + } + ] +} +``` + +## Skipping rollover action is true + +In the event that skipping a rollover action occurs, run the following request: + +```bash + GET /_settings?pretty +``` + +If you receive the response in the first example, you can reset it by running the request in the second example: + +```json +{ + "index": { + "opendistro.index_state_management.rollover_skip": true + } +} +``` + +```bash +PUT /_settings +{ + "index": { + "index_state_management.rollover_skip": false + } +} +``` + +## This index has already been rolled over 
successfully + +Remove the [rollover policy from the index]({{site.url}}{{site.baseurl}}/im-plugin/ism/api/#remove-policy-from-index) to prevent this error from reoccurring. + +## The rollover policy misses rollover_alias index setting + +Add a `rollover_alias` index setting to the rollover policy to resolve this issue. Run the following request: + +```bash +PUT _index_template/ism_rollover +{ + "index_patterns": [""], + "template": { + "settings": { + "plugins.index_state_management.rollover_alias": "" + } + } +} +``` + +## Data too large and exceeding the threshold + +Check the [JVM information]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-info/) and increase the heap memory. + +## Maximum shards exceeded + +The shard limit per node, or per index, causes this issue to occur. Check whether there is a `total_shards_per_node` limit by running the following request: + +```bash +GET /_cluster/settings +``` + +If the response contains `total_shards_per_node`, increase its value temporarily by running the following request: + +```bash +PUT _cluster/settings +{ + "transient":{ + "cluster.routing.allocation.total_shards_per_node":100 + } +} +``` + +To check whether there is a shard limit for an index, run the following request: + +```bash +GET /_settings/index.routing- +``` + +If the response contains the setting in the first example, increase its value or set it to `-1` for unlimited shards, as shown in the second example: + +```json +"index" : { + "routing" : { + "allocation" : { + "total_shards_per_node" : "10" + } + } + } +``` + +```bash +PUT /_settings +{"index.routing.allocation.total_shards_per_node":-1} +``` + +## The index is a write index for some data stream + +If you still want to delete the index, check your [data stream]({{site.url}}{{site.baseurl}}/opensearch/data-streams/) settings and change the write index. 
+ +## The index is blocked + +Generally, the index is blocked because disk usage has exceeded the flood-stage watermark and the index has a `read-only-allow-delete` block. To resolve this issue, you can: + +1. Remove the `-index.blocks.read_only_allow_delete-` parameter. +1. Temporarily increase the disk watermarks. +1. Temporarily disable the disk allocation threshold. + +To prevent the issue from reoccurring, it is better to reduce the usage of the disk by increasing disk space, adding new nodes, or removing data or indexes that are no longer needed. + +Remove `-index.blocks.read_only_allow_delete-` by running the following request: + +```bash +PUT /_settings +{ + "index.blocks.read_only_allow_delete": null +} +``` + +Increase the low disk watermarks by running the following request: + +```bash +PUT _cluster/settings +{ + "transient": { + "cluster": { + "routing": { + "allocation": { + "disk": { + "watermark": { + "low": "25.0gb" + } + } + } + } + } + } +} +``` + +Disable the disk allocation threshold by running the following request: + +```bash +PUT _cluster/settings +{ + "transient": { + "cluster": { + "routing": { + "allocation": { + "disk": { + "threshold_enabled" : false + } + } + } + } + } +} +``` \ No newline at end of file diff --git a/_im-plugin/ism/index.md b/_im-plugin/ism/index.md index 2849c82f5a..9d16c20c56 100644 --- a/_im-plugin/ism/index.md +++ b/_im-plugin/ism/index.md @@ -1,7 +1,7 @@ --- layout: default title: Index State Management -nav_order: 3 +nav_order: 16 has_children: true redirect_from: - /im-plugin/ism/ @@ -39,9 +39,9 @@ You can use the visual editor or JSON editor to create policies. Compared to the 2. Choose **Create policy**. 3. Choose **Visual editor**. 4. In the **Policy info** section, enter a policy ID and an optional description. -5. In the **Error notification** section, set up an optional error notification that gets sent whenever a policy execution fails. 
If you're using auto rollovers in your policy, we recommend setting up error notifications, which notify you of unexpectedly large indexes if rollovers fail. +5. In the **Error notification** section, set up an optional error notification that gets sent whenever a policy execution fails. For more information, see [Error notifications]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies#error-notifications). If you're using auto rollovers in your policy, we recommend setting up error notifications, which notify you of unexpectedly large indexes if rollovers fail. 6. In **ISM templates**, enter any ISM template patterns to automatically apply this policy to future indexes. For example, if you specify a template of `sample-index*`, the ISM plugin automatically applies this policy to any indexes whose names start with `sample-index`. Your pattern cannot contain any of the following characters: `:`, `"`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, and `<`. -7. In **States**, add any states you want to include in the policy. Each state has actions the plugin executes when the index enters a certain state, and transitions, which have conditions that, when met, transition the index into a destination state. The first state you create in a policy is automatically set as the initial state. Each policy must have at least one state, but actions and transitions are optional. +7. In **States**, add any states you want to include in the policy. Each state has [actions]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/#actions) the plugin executes when the index enters a certain state, and [transitions]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies/#transitions), which have conditions that, when met, transition the index into a destination state. The first state you create in a policy is automatically set as the initial state. Each policy must have at least one state, but actions and transitions are optional. 8. Choose **Create**. 
@@ -76,6 +76,8 @@ PUT _plugins/_ism/policies/policy_id If you have more than one template that matches an index pattern, ISM uses the priority value to determine which template to apply. +For an example ISM template policy, see [Sample policy with ISM template for auto rollover]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies#sample-policy-with-ism-template-for-auto-rollover). + Older versions of the plugin include the `policy_id` in an index template, so when an index is created that matches the index template pattern, the index will have the policy attached to it: ```json @@ -115,7 +117,7 @@ ISM does not run jobs if the cluster state is red. 1. Choose **Managed indexes**. 2. To change your policy, see [Change Policy]({{site.url}}{{site.baseurl}}/im-plugin/ism/managedindexes#change-policy). 3. To attach a rollover alias to your index, select your policy and choose **Add rollover alias**. -Make sure that the alias that you enter already exists. +Make sure that the alias that you enter already exists. For more information about the rollover operation, see [rollover]({{site.url}}{{site.baseurl}}/im-plugin/ism/policies#rollover). 4. To remove a policy, choose your policy, and then choose **Remove policy**. 5. To retry a policy, choose your policy, and then choose **Retry policy**. 
diff --git a/_im-plugin/ism/managedindexes.md b/_im-plugin/ism/managedindexes.md index bb5d4bda26..7b098a1b4a 100644 --- a/_im-plugin/ism/managedindexes.md +++ b/_im-plugin/ism/managedindexes.md @@ -1,7 +1,7 @@ --- layout: default title: Managed Indices -nav_order: 5 +nav_order: 3 parent: Index State Management has_children: false --- diff --git a/_im-plugin/ism/policies.md b/_im-plugin/ism/policies.md index 0d16dc8b93..baa4378fd5 100644 --- a/_im-plugin/ism/policies.md +++ b/_im-plugin/ism/policies.md @@ -789,6 +789,128 @@ If you want to skip rollovers for an index, set `index.plugins.index_state_manag GET _plugins/_ism/explain/log-000001?pretty ``` +## Example policy with ISM templates for the alias action + +The following example policy is for an alias action use case. + +In the following example, the first job will trigger the rollover action, and a new index will be created. Next, another document is added to the two indexes. The new job will then cause the second index to point to the log alias, and the older index will be removed due to the alias action. + +First, create an ISM policy: + +```json +PUT /_plugins/_ism/policies/rollover_policy?pretty +{ + "policy": { + "description": "Example rollover policy.", + "default_state": "rollover", + "states": [ + { + "name": "rollover", + "actions": [ + { + "rollover": { + "min_doc_count": 1 + } + } + ], + "transitions": [{ + "state_name": "alias", + "conditions": { + "min_doc_count": "2" + } + }] + }, + { + "name": "alias", + "actions": [ + { + "alias": { + "actions": [ + { + "remove": { + "alias": "log" + } + } + ] + } + } + ] + } + ], + "ism_template": { + "index_patterns": ["log*"], + "priority": 100 + } + } +} +``` + +Next, create an index template on which to enable the policy: + +```json +PUT /_index_template/ism_rollover? 
+{ + "index_patterns": ["log*"], + "template": { + "settings": { + "plugins.index_state_management.rollover_alias": "log" + } + } +} +``` +{% include copy-curl.html %} + +Next, change the cluster settings to trigger jobs every minute: + +```json +PUT /_cluster/settings?pretty=true +{ + "persistent" : { + "plugins.index_state_management.job_interval" : 1 + } +} +``` +{% include copy-curl.html %} + +Next, create a new index: + +```json +PUT /log-000001 +{ + "aliases": { + "log": { + "is_write_index": true + } + } +} +``` +{% include copy-curl.html %} + +Finally, add a document to the index to trigger the job: + +```json +POST /log-000001/_doc +{ + "message": "dummy" +} +``` +{% include copy-curl.html %} + +You can verify these steps using the Alias and Index API: + +```json +GET /_cat/indices?pretty +``` +{% include copy-curl.html %} + +```json +GET /_cat/aliases?pretty +``` +{% include copy-curl.html %} + +Note: The `index` and `remove_index` parameters are not allowed with alias action policies. Only the `add` and `remove` alias action parameters are allowed. +{: .warning } + ## Example policy The following example policy implements a `hot`, `warm`, and `delete` workflow. You can use this policy as a template to prioritize resources to your indexes based on their levels of activity. 
diff --git a/_opensearch/reindex-data.md b/_im-plugin/reindex-data.md similarity index 99% rename from _opensearch/reindex-data.md rename to _im-plugin/reindex-data.md index 166eece64c..fcb127a649 100644 --- a/_opensearch/reindex-data.md +++ b/_im-plugin/reindex-data.md @@ -1,7 +1,9 @@ --- layout: default title: Reindex data -nav_order: 16 +nav_order: 15 +redirect_from: + - /opensearch/reindex-data/ --- # Reindex data diff --git a/_im-plugin/security.md b/_im-plugin/security.md index fffe74be15..0fb7dbeb35 100644 --- a/_im-plugin/security.md +++ b/_im-plugin/security.md @@ -9,7 +9,7 @@ has_children: false Using the security plugin with index management lets you limit non-admin users to certain actions. For example, you might want to set up your security such that a group of users can only read ISM policies, while others can create, delete, or change policies. -All index management data are protected as system indices, and only a super admin or an admin with a Transport Layer Security (TLS) certificate can access system indices. For more information, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indices). +All index management data are protected as system indices, and only a super admin or an admin with a Transport Layer Security (TLS) certificate can access system indices. For more information, see [System indices]({{site.url}}{{site.baseurl}}/security/configuration/system-indices). ## Basic permissions diff --git a/_opensearch/index-data.md b/_opensearch/index-data.md deleted file mode 100644 index 39dee1325b..0000000000 --- a/_opensearch/index-data.md +++ /dev/null @@ -1,271 +0,0 @@ ---- -layout: default -title: Index data -nav_order: 10 ---- - -# Index data - -You index data using the OpenSearch REST API. Two APIs exist: the index API and the `_bulk` API. 
- -For situations in which new data arrives incrementally (for example, customer orders from a small business), you might use the index API to add documents individually as they arrive. For situations in which the flow of data is less frequent (for example, weekly updates to a marketing website), you might prefer to generate a file and send it to the `_bulk` API. For large numbers of documents, lumping requests together and using the `_bulk` API offers superior performance. If your documents are enormous, however, you might need to index them individually. - - -## Introduction to indexing - -Before you can search data, you must *index* it. Indexing is the method by which search engines organize data for fast retrieval. The resulting structure is called, fittingly, an index. - -In OpenSearch, the basic unit of data is a JSON *document*. Within an index, OpenSearch identifies each document using a unique ID. - -A request to the index API looks like this: - -```json -PUT /_doc/ -{ "A JSON": "document" } -``` - -A request to the `_bulk` API looks a little different, because you specify the index and ID in the bulk data: - -```json -POST _bulk -{ "index": { "_index": "", "_id": "" } } -{ "A JSON": "document" } -``` - -Bulk data must conform to a specific format, which requires a newline character (`\n`) at the end of every line, including the last line. This is the basic format: - -``` -Action and metadata\n -Optional document\n -Action and metadata\n -Optional document\n -``` - -The document is optional, because `delete` actions don't require a document. The other actions (`index`, `create`, and `update`) all require a document. If you specifically want the action to fail if the document already exists, use the `create` action instead of the `index` action. 
-{: .note } - -To index bulk data using the `curl` command, navigate to the folder where you have your file saved and run the following command: - -```json -curl -H "Content-Type: application/x-ndjson" -POST https://localhost:9200/data/_bulk -u 'admin:admin' --insecure --data-binary "@data.json" -``` - -If any one of the actions in the `_bulk` API fail, OpenSearch continues to execute the other actions. Examine the `items` array in the response to figure out what went wrong. The entries in the `items` array are in the same order as the actions specified in the request. - -OpenSearch automatically creates an index when you add a document to an index that doesn't already exist. It also automatically generates an ID if you don't specify an ID in the request. This simple example automatically creates the movies index, indexes the document, and assigns it a unique ID: - -```json -POST movies/_doc -{ "title": "Spirited Away" } -``` - -Automatic ID generation has a clear downside: because the indexing request didn't specify a document ID, you can't easily update the document at a later time. Also, if you run this request 10 times, OpenSearch indexes this document as 10 different documents with unique IDs. To specify an ID of 1, use the following request (note the use of PUT instead of POST): - -```json -PUT movies/_doc/1 -{ "title": "Spirited Away" } -``` - -Because you must specify an ID, if you run this command 10 times, you still have just one document indexed with the `_version` field incremented to 10. - -Indexes default to one primary shard and one replica. If you want to specify non-default settings, create the index before adding documents: - -```json -PUT more-movies -{ "settings": { "number_of_shards": 6, "number_of_replicas": 2 } } -``` - -## Naming restrictions for indexes - -OpenSearch indexes have the following naming restrictions: - -- All letters must be lowercase. -- Index names can't begin with underscores (`_`) or hyphens (`-`). 
-- Index names can't contain spaces, commas, or the following characters: - - `:`, `"`, `*`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, or `<` - -## Read data - -After you index a document, you can retrieve it by sending a GET request to the same endpoint that you used for indexing: - -```json -GET movies/_doc/1 - -{ - "_index" : "movies", - "_type" : "_doc", - "_id" : "1", - "_version" : 1, - "_seq_no" : 0, - "_primary_term" : 1, - "found" : true, - "_source" : { - "title" : "Spirited Away" - } -} -``` - -You can see the document in the `_source` object. If the document is not found, the `found` key is `false` and the `_source` object is not part of the response. - -To retrieve multiple documents with a single command, use the `_mget` operation. -The format for retrieving multiple documents is similar to the `_bulk` operation, where you must specify the index and ID in the request body: - -```json -GET _mget -{ - "docs": [ - { - "_index": "", - "_id": "" - }, - { - "_index": "", - "_id": "" - } - ] -} -``` - -To only return specific fields in a document: - -```json -GET _mget -{ - "docs": [ - { - "_index": "", - "_id": "", - "_source": "field1" - }, - { - "_index": "", - "_id": "", - "_source": "field2" - } - ] -} -``` - -To check if a document exists: - -```json -HEAD movies/_doc/ -``` - -If the document exists, you get back a `200 OK` response, and if it doesn't, you get back a `404 - Not Found` error. 
- -## Update data - -To update existing fields or to add new fields, send a POST request to the `_update` operation with your changes in a `doc` object: - -```json -POST movies/_update/1 -{ - "doc": { - "title": "Castle in the Sky", - "genre": ["Animation", "Fantasy"] - } -} -``` - -Note the updated `title` field and new `genre` field: - -```json -GET movies/_doc/1 - -{ - "_index" : "movies", - "_type" : "_doc", - "_id" : "1", - "_version" : 2, - "_seq_no" : 1, - "_primary_term" : 1, - "found" : true, - "_source" : { - "title" : "Castle in the Sky", - "genre" : [ - "Animation", - "Fantasy" - ] - } -} -``` - -The document also has an incremented `_version` field. Use this field to keep track of how many times a document is updated. - -POST requests make partial updates to documents. To altogether replace a document, use a PUT request: - -```json -PUT movies/_doc/1 -{ - "title": "Spirited Away" -} -``` - -The document with ID of 1 will contain only the `title` field, because the entire document will be replaced with the document indexed in this PUT request. - -Use the `upsert` object to conditionally update documents based on whether they already exist. Here, if the document exists, its `title` field changes to `Castle in the Sky`. If it doesn't, OpenSearch indexes the document in the `upsert` object. - -```json -POST movies/_update/2 -{ - "doc": { - "title": "Castle in the Sky" - }, - "upsert": { - "title": "Only Yesterday", - "genre": ["Animation", "Fantasy"], - "date": 1993 - } -} -``` - -#### Example response - -```json -{ - "_index" : "movies", - "_type" : "_doc", - "_id" : "2", - "_version" : 2, - "result" : "updated", - "_shards" : { - "total" : 2, - "successful" : 1, - "failed" : 0 - }, - "_seq_no" : 3, - "_primary_term" : 1 -} -``` - -Each update operation for a document has a unique combination of the `_seq_no` and `_primary_term` values. - -OpenSearch first writes your updates to the primary shard and then sends this change to all the replica shards. 
An uncommon issue can occur if multiple users of your OpenSearch-based application make updates to existing documents in the same index. In this situation, another user can read and update a document from a replica before it receives your update from the primary shard. Your update operation then ends up updating an older version of the document. In the best case, you and the other user make the same changes, and the document remains accurate. In the worst case, the document now contains out-of-date information. - -To prevent this situation, use the `_seq_no` and `_primary_term` values in the request header: - -```json -POST movies/_update/2?if_seq_no=3&if_primary_term=1 -{ - "doc": { - "title": "Castle in the Sky", - "genre": ["Animation", "Fantasy"] - } -} -``` - -If the document is updated after we retrieved it, the `_seq_no` and `_primary_term` values are different and our update operation fails with a `409 — Conflict` error. - -When using the `_bulk` API, specify the `_seq_no` and `_primary_term` values within the action metadata. - -## Delete data - -To delete a document from an index, use a DELETE request: - -```json -DELETE movies/_doc/1 -``` - -The DELETE operation increments the `_version` field. If you add the document back to the same ID, the `_version` field increments again. This behavior occurs because OpenSearch deletes the document `_source`, but retains its metadata. diff --git a/_opensearch/logs.md b/_opensearch/logs.md deleted file mode 100644 index 57851372c3..0000000000 --- a/_opensearch/logs.md +++ /dev/null @@ -1,174 +0,0 @@ ---- -layout: default -title: Logs -nav_order: 60 ---- - -# Logs - -The OpenSearch logs include valuable information for monitoring cluster operations and troubleshooting issues. The location of the logs differs based on the installation type: - -- On Docker, OpenSearch writes most logs to the console and stores the remainder in `opensearch/logs/`. The tarball installation also uses `opensearch/logs/`. 
-- On most Linux installations, OpenSearch writes logs to `/var/log/opensearch/`. - -Logs are available as `.log` (plain text) and `.json` files. Permissions for the OpenSearch logs are `-rw-r--r--` by default, meaning that any user account on the node can read them. You can change this behavior _for each log type_ in `log4j2.properties` using the `filePermissions` option. For example, you might add `appender.rolling.filePermissions = rw-r-----` to change permissions for the JSON server log. For details, see the [Log4j 2 documentation](https://logging.apache.org/log4j/2.x/manual/appenders.html#RollingFileAppender). - - -## Application logs - -For its application logs, OpenSearch uses [Apache Log4j 2](https://logging.apache.org/log4j/2.x/) and its built-in log levels (from least to most severe) of TRACE, DEBUG, INFO, WARN, ERROR, and FATAL. The default OpenSearch log level is INFO. - -Rather than changing the default log level (`logger.level`), you change the log level for individual OpenSearch modules: - -```json -PUT /_cluster/settings -{ - "persistent" : { - "logger.org.opensearch.index.reindex" : "DEBUG" - } -} -``` - -The easiest way to identify modules is not from the logs, which abbreviate the path (for example, `o.o.i.r`), but from the [OpenSearch source code](https://github.com/opensearch-project/opensearch/tree/master/server/src/main/java/org/opensearch). 
-{: .tip } - -After this sample change, OpenSearch emits much more detailed logs during reindex operations: - -``` -[2019-10-18T16:52:51,184][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: starting -[2019-10-18T16:52:51,186][DEBUG][o.o.i.r.TransportReindexAction] [node1] executing initial scroll against [some-index] -[2019-10-18T16:52:51,291][DEBUG][o.o.i.r.TransportReindexAction] [node1] scroll returned [3] documents with a scroll id of [DXF1Z==] -[2019-10-18T16:52:51,292][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: got scroll response with [3] hits -[2019-10-18T16:52:51,294][DEBUG][o.o.i.r.WorkerBulkByScrollTaskState] [node1] [1626]: preparing bulk request for [0s] -[2019-10-18T16:52:51,297][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: preparing bulk request -[2019-10-18T16:52:51,299][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: sending [3] entry, [222b] bulk request -[2019-10-18T16:52:51,310][INFO ][o.e.c.m.MetaDataMappingService] [node1] [some-new-index/R-j3adc6QTmEAEb-eAie9g] create_mapping [_doc] -[2019-10-18T16:52:51,383][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: got scroll response with [0] hits -[2019-10-18T16:52:51,384][DEBUG][o.o.i.r.WorkerBulkByScrollTaskState] [node1] [1626]: preparing bulk request for [0s] -[2019-10-18T16:52:51,385][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: preparing bulk request -[2019-10-18T16:52:51,386][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: finishing without any catastrophic failures -[2019-10-18T16:52:51,395][DEBUG][o.o.i.r.TransportReindexAction] [node1] Freed [1] contexts -``` - -The DEBUG and TRACE levels are extremely verbose. If you enable either one to troubleshoot a problem, disable it after you finish. - -There are other ways to change log levels: - -1. 
Add lines to `opensearch.yml`: - - ```yml - logger.org.opensearch.index.reindex: debug - ``` - - Modifying `opensearch.yml` makes the most sense if you want to reuse your logging configuration across multiple clusters or debug startup issues with a single node. - -2. Modify `log4j2.properties`: - - ``` - # Define a new logger with unique ID of reindex - logger.reindex.name = org.opensearch.index.reindex - # Set the log level for that ID - logger.reindex.level = debug - ``` - - This approach is extremely flexible, but requires familiarity with the [Log4j 2 property file syntax](https://logging.apache.org/log4j/2.x/manual/configuration.html#Properties). In general, the other options offer a simpler configuration experience. - - If you examine the default `log4j2.properties` file in the configuration directory, you can see a few OpenSearch-specific variables: - - ``` - appender.console.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] [%node_name]%marker %m%n - appender.rolling_old.fileName = ${sys:os.logs.base_path}${sys:file.separator}${sys:os.logs.cluster_name}.log - ``` - - - `${sys:os.logs.base_path}` is the directory for logs (for example, `/var/log/opensearch/`). - - `${sys:os.logs.cluster_name}` is the name of the cluster. - - `[%node_name]` is the name of the node. - - -## Slow logs - -OpenSearch has two *slow logs*, logs that help you identify performance issues: the search slow log and the indexing slow log. - -These logs rely on thresholds to define what qualifies as a "slow" search or indexing operation. For example, you might decide that a query is slow if it takes more than 15 seconds to complete. Unlike application logs, which you configure for modules, you configure slow logs for indices. 
By default, both logs are disabled (all thresholds are set to `-1`): - -```json -GET /_settings?include_defaults=true - -{ - "indexing": { - "slowlog": { - "reformat": "true", - "threshold": { - "index": { - "warn": "-1", - "trace": "-1", - "debug": "-1", - "info": "-1" - } - }, - "source": "1000", - "level": "TRACE" - } - }, - "search": { - "slowlog": { - "level": "TRACE", - "threshold": { - "fetch": { - "warn": "-1", - "trace": "-1", - "debug": "-1", - "info": "-1" - }, - "query": { - "warn": "-1", - "trace": "-1", - "debug": "-1", - "info": "-1" - } - } - } - } -} -``` - -To enable these logs, increase one or more thresholds: - -```json -PUT /_settings -{ - "indexing": { - "slowlog": { - "threshold": { - "index": { - "warn": "15s", - "trace": "750ms", - "debug": "3s", - "info": "10s" - } - }, - "source": "500", - "level": "INFO" - } - } -} -``` - -In this example, OpenSearch logs indexing operations that take 15 seconds or longer at the WARN level and operations that take between 10 and 14.*x* seconds at the INFO level. If you set a threshold to 0 seconds, OpenSearch logs all operations, which can be useful for testing whether slow logs are indeed enabled. - -- `reformat` specifies whether to log the document `_source` field as a single line (`true`) or let it span multiple lines (`false`). -- `source` is the number of characters of the document `_source` field to log. -- `level` is the minimum log level to include. - -A line from `opensearch_index_indexing_slowlog.log` might look like this: - -``` -node1 | [2019-10-24T19:48:51,012][WARN][i.i.s.index] [node1] [some-index/i86iF5kyTyy-PS8zrdDeAA] took[3.4ms], took_millis[3], type[_doc], id[1], routing[], source[{"title":"Your Name", "Director":"Makoto Shinkai"}] -``` - -Slow logs can consume considerable disk space if you set thresholds or levels too low. Consider enabling them temporarily for troubleshooting or performance tuning. To disable slow logs, return all thresholds to `-1`. 
- - -## Deprecation logs - -Deprecation logs record when clients make deprecated API calls to your cluster. These logs can help you identify and fix issues prior to upgrading to a new major version. By default, OpenSearch logs deprecated API calls at the WARN level, which works well for almost all use cases. If desired, configure `logger.deprecation.level` using `_cluster/settings`, `opensearch.yml`, or `log4j2.properties`. diff --git a/_opensearch/ux.md b/_opensearch/ux.md deleted file mode 100644 index 65cc50a759..0000000000 --- a/_opensearch/ux.md +++ /dev/null @@ -1,1069 +0,0 @@ ---- -layout: default -title: Search experience -nav_order: 55 ---- - -# Search experience - -Expectations from search engines have evolved over the years. Just returning relevant results quickly is no longer enough for most users. OpenSearch includes many features that enhance the user’s search experience as follows: - -Feature | Description -:--- | :--- -Autocomplete queries | Suggest phrases as the user types. -Paginate results | Rather than a single, long list, break search results into pages. -Scroll search | Return a large number of results in batches. -Sort results | Allow sorting results by different criteria. -Highlight query matches | Highlight the search term in the results. - ---- - -## Autocomplete queries - -Autocomplete shows suggestions to users while they type. - -For example, if a user types "pop," OpenSearch provides suggestions like "popcorn" or "popsicles." These suggestions preempt your user's intention and lead them to a possible search term more quickly. - -OpenSearch lets you design autocomplete that updates with each keystroke, provides a few relevant suggestions, and tolerates typos. - -Implement autocomplete using one of three methods: - -- Prefix matching -- Edge N-gram matching -- Completion suggesters - -These methods are described in the following sections. 
- -### Prefix matching - -Prefix matching finds documents that matches the last term in the query string. - -For example, assume that the user types “qui” into a search UI. To autocomplete this phrase, use the `match_phrase_prefix` query to search all `text_entry` fields that begin with the prefix "qui." -To make the word order and relative positions flexible, specify a `slop` value. To learn about the `slop` option, see the [Multi-match query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text#multi-match). - -#### Sample Request - -```json -GET shakespeare/_search -{ - "query": { - "match_phrase_prefix": { - "text_entry": { - "query": "qui", - "slop": 3 - } - } - } -} -``` - -Prefix matching doesn’t require any special mappings. It works with your data as-is. -However, it’s a fairly resource-intensive operation. A prefix of `a` could match hundreds of thousands of terms and not be useful to your user. - -To limit the impact of prefix expansion, set `max_expansions` to a reasonable number. To learn about the `max_expansions` option, see [Advanced filter options]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text#advanced-filter-options). - -#### Sample Request - -```json -GET shakespeare/_search -{ - "query": { - "match_phrase_prefix": { - "text_entry": { - "query": "qui", - "slop": 3, - "max_expansions": 10 - } - } - } -} -``` - -The ease of implementing query-time autocomplete comes at the cost of performance. -When implementing this feature on a large scale, we recommend an index-time solution. With an index-time solution, you might experience slower indexing, but it’s a price you pay only once and not for every query. The edge N-gram and completion suggester methods are index time. - -### Edge N-gram matching - -During indexing, edge N-grams chop up a word into a sequence of N characters to support a faster lookup of partial search terms. - -If you N-gram the word "quick," the results depend on the value of N. 
- -N | Type | N-gram -:--- | :--- | :--- -1 | Unigram | [ `q`, `u`, `i`, `c`, `k` ] -2 | Bigram | [ `qu`, `ui`, `ic`, `ck` ] -3 | Trigram | [ `qui`, `uic`, `ick` ] -4 | Four-gram | [ `quic`, `uick` ] -5 | Five-gram | [ `quick` ] - -Autocomplete needs only the beginning N-grams of a search phrase, so OpenSearch uses a special type of N-gram called edge N-gram. - -Edge N-gramming the word "quick" results in the following: - -- `q` -- `qu` -- `qui` -- `quic` -- `quick` - -This follows the same sequence the user types. - -To configure a field to use edge N-grams, create an autocomplete analyzer with an `edge_ngram` filter: - -#### Sample Request - -```json -PUT shakespeare -{ - "mappings": { - "properties": { - "text_entry": { - "type": "text", - "analyzer": "autocomplete" - } - } - }, - "settings": { - "analysis": { - "filter": { - "edge_ngram_filter": { - "type": "edge_ngram", - "min_gram": 1, - "max_gram": 20 - } - }, - "analyzer": { - "autocomplete": { - "type": "custom", - "tokenizer": "standard", - "filter": [ - "lowercase", - "edge_ngram_filter" - ] - } - } - } - } -} -``` - -This example creates the index and instantiates the edge N-gram filter and analyzer. - -The `edge_ngram_filter` produces edge N-grams with a minimum N-gram length of 1 (a single letter) and a maximum length of 20. So it offers suggestions for words of up to 20 letters. - -The `autocomplete` analyzer tokenizes a string into individual terms, lowercases the terms, and then produces edge N-grams for each term using the `edge_ngram_filter`. - -Use the `analyze` operation to test this analyzer: - -```json -POST shakespeare/_analyze -{ - "analyzer": "autocomplete", - "text": "quick" -} -``` - -It returns edge N-grams as tokens: - -* `q` -* `qu` -* `qui` -* `quic` -* `quick` - -Use the `standard` analyzer at search time. Otherwise, the search query splits into edge N-grams and you get results for everything that matches `q`, `u`, and `i`. 
-This is one of the few occasions where you use a different analyzer on the index and query side. - -#### Sample Request - -```json -GET shakespeare/_search -{ - "query": { - "match": { - "text_entry": { - "query": "qui", - "analyzer": "standard" - } - } - } -} -``` - -#### Sample Response - -```json -{ - "took": 5, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 533, - "relation": "eq" - }, - "max_score": 9.712725, - "hits": [ - { - "_index": "shakespeare", - "_id": "22006", - "_score": 9.712725, - "_source": { - "type": "line", - "line_id": 22007, - "play_name": "Antony and Cleopatra", - "speech_number": 12, - "line_number": "5.2.44", - "speaker": "CLEOPATRA", - "text_entry": "Quick, quick, good hands." - } - }, - { - "_index": "shakespeare", - "_id": "54665", - "_score": 9.712725, - "_source": { - "type": "line", - "line_id": 54666, - "play_name": "Loves Labours Lost", - "speech_number": 21, - "line_number": "5.1.52", - "speaker": "HOLOFERNES", - "text_entry": "Quis, quis, thou consonant?" - } - } - ] - } -} -``` - -Alternatively, specify the `search_analyzer` in the mapping itself: - -```json -"mappings": { - "properties": { - "text_entry": { - "type": "text", - "analyzer": "autocomplete", - "search_analyzer": "standard" - } - } -} -``` - -### Completion suggester - -The completion suggester accepts a list of suggestions and builds them into a finite-state transducer (FST), an optimized data structure that’s essentially a graph. This data structure lives in memory and is optimized for fast prefix lookups. To learn more about FSTs, see [Wikipedia](https://en.wikipedia.org/wiki/Finite-state_transducer). - -As the user types, the completion suggester moves through the FST graph one character at a time along a matching path. After it runs out of user input, it examines the remaining endings to produce a list of suggestions. 
- -The completion suggester makes your autocomplete solution as efficient as possible and lets you have explicit control over its suggestions. - -Use a dedicated field type called `completion`, which stores the FST-like data structures in the index: - -```json -PUT shakespeare -{ - "mappings": { - "properties": { - "text_entry": { - "type": "completion" - } - } - } -} -``` - -To get back suggestions, use the `search` endpoint with the `suggest` parameter: - -```json -GET shakespeare/_search -{ - "suggest": { - "autocomplete": { - "prefix": "To be", - "completion": { - "field": "text_entry" - } - } - } -} -``` - -The phrase "to be" is prefix matched with the FST of the `text_entry` field. - -#### Sample Response - -```json -{ - "took": 9, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 0, - "relation": "eq" - }, - "max_score": null, - "hits": [] - }, - "suggest": { - "text_entry": [ - { - "text": "To be", - "offset": 0, - "length": 5, - "options": [ - { - "text": "To be a comrade with the wolf and owl,--", - "_index": "shakespeare", - "_id": "50652", - "_score": 1, - "_source": { - "type": "line", - "line_id": 50653, - "play_name": "King Lear", - "speech_number": 68, - "line_number": "2.4.230", - "speaker": "KING LEAR", - "text_entry": "To be a comrade with the wolf and owl,--" - } - }, - { - "text": "To be a make-peace shall become my age:", - "_index": "shakespeare", - "_id": "78566", - "_score": 1, - "_source": { - "type": "line", - "line_id": 78567, - "play_name": "Richard II", - "speech_number": 20, - "line_number": "1.1.160", - "speaker": "JOHN OF GAUNT", - "text_entry": "To be a make-peace shall become my age:" - } - } - ] - } - ] - } -} -``` - -To specify the number of suggestions that you want to return, use the `size` parameter: - -```json -GET shakespeare/_search -{ - "suggest": { - "autocomplete": { - "prefix": "To m", - "completion": { - "field": "text_entry", - 
"size": 3 - } - } - } -} -``` - -#### Sample Response - -```json -{ - "took": 3, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 0, - "relation": "eq" - }, - "max_score": null, - "hits": [] - }, - "suggest": { - "text_entry": [ - { - "text": "To m", - "offset": 0, - "length": 5, - "options": [ - { - "text": "To make a bastard and a slave of me!", - "_index": "shakespeare", - "_id": "5369", - "_score": 4, - "_source": { - "type": "line", - "line_id": 5370, - "play_name": "Henry VI Part 1", - "speech_number": 2, - "line_number": "4.5.15", - "speaker": "JOHN TALBOT", - "text_entry": "To make a bastard and a slave of me!" - } - }, - { - "text": "To make a bloody supper in the Tower.", - "_index": "shakespeare", - "_id": "12504", - "_score": 4, - "_source": { - "type": "line", - "line_id": 12505, - "play_name": "Henry VI Part 3", - "speech_number": 40, - "line_number": "5.5.85", - "speaker": "CLARENCE", - "text_entry": "To make a bloody supper in the Tower." - } - } - ] - } - ] - } -} -``` - -The `suggest` parameter finds suggestions using only prefix matching. -For example, you don't get back "To be, or not to be," which you might want as a suggestion. -To work around this issue, manually add curated suggestions and add weights to prioritize your suggestions. 
- -Index a document with an input suggestion and assign a weight: - -```json -PUT shakespeare/_doc/1 -{ - "text": "To m", - "text_entry": { - "input": [ - "To be, or not to be: that is the question:" - ], - "weight": 10 - } -} -``` - -Perform the same search as before: - -```json -GET shakespeare/_search -{ - "suggest": { - "autocomplete": { - "prefix": "To m", - "completion": { - "field": "text_entry", - "size": 3 - } - } - } -} -``` - -You see the indexed document as the first result: - -```json -{ - "took": 1021, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 0, - "relation": "eq" - }, - "max_score": null, - "hits": [] - }, - "suggest": { - "autocomplete": [ - { - "text": "To m", - "offset": 0, - "length": 5, - "options": [ - { - "text": "To be, or not to be: that is the question:", - "_index": "shakespeare", - "_id": "1", - "_score": 30, - "_source": { - "text": "To me", - "text_entry": { - "input": [ - "To be, or not to be: that is the question:" - ], - "weight": 10 - } - } - }, - { - "text": "To make a bastard and a slave of me!", - "_index": "shakespeare", - "_id": "5369", - "_score": 4, - "_source": { - "type": "line", - "line_id": 5370, - "play_name": "Henry VI Part 1", - "speech_number": 2, - "line_number": "4.5.15", - "speaker": "JOHN TALBOT", - "text_entry": "To make a bastard and a slave of me!" - } - } - ] - } - ] - } -} -``` - -Use the `term` suggester to suggest corrected spellings for individual words. -The `term` suggester uses an edit distance to compute suggestions. Edit distance is the number of characters that need to be changed for a term to match. 
- -In this example, the user misspells a search term: - -```json -GET shakespeare/_search -{ - "suggest": { - "spell-check": { - "text": "lief", - "term": { - "field": "text_entry" - } - } - } -} -``` - -The `term` suggester returns a list of corrections: - -```json -{ - "took": 48, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 0, - "relation": "eq" - }, - "max_score": null, - "hits": [] - }, - "suggest": { - "spell-check": [ - { - "text": "lief", - "offset": 0, - "length": 4, - "options": [ - { - "text": "lifes", - "score": 0.8, - "freq": 21 - }, - { - "text": "life", - "score": 0.75, - "freq": 805 - }, - { - "text": "lives", - "score": 0.6, - "freq": 187 - }, - { - "text": "liege", - "score": 0.6, - "freq": 138 - }, - { - "text": "lived", - "score": 0.6, - "freq": 80 - } - ] - } - ] - } -} -``` - -The higher the score, the better the suggestion is. The frequency represents the number of times the term appears in the documents of that index. - -To implement a "Did you mean `suggestion`?" feature, use a `phrase` suggester. -The `phrase` suggester is similar to the `term` suggester, except that it uses N-gram language models to suggest whole phrases instead of individual words. - -Create a custom analyzer called `trigram` that uses a `shingle` filter. 
This filter is similar to the `edge_ngram` filter, but it applies to words instead of letters: - -```json -PUT shakespeare -{ - "settings": { - "index": { - "analysis": { - "analyzer": { - "trigram": { - "type": "custom", - "tokenizer": "standard", - "filter": [ - "lowercase", - "shingle" - ] - } - }, - "filter": { - "shingle": { - "type": "shingle", - "min_shingle_size": 2, - "max_shingle_size": 3 - } - } - } - } - }, - "mappings": { - "properties": { - "text_entry": { - "type": "text", - "fields": { - "trigram": { - "type": "text", - "analyzer": "trigram" - } - } - } - } - } -} -``` - -This example includes as incorrect phrase: - -```json -POST shakespeare/_search -{ - "suggest": { - "text": "That the qution", - "simple_phrase": { - "phrase": { - "field": "text_entry.trigram" - } - } - } -} -``` - -You get back the corrected phrase: - -```json -{ - "took": 3, - "timed_out": false, - "_shards": { - "total": 1, - "successful": 1, - "skipped": 0, - "failed": 0 - }, - "hits": { - "total": { - "value": 0, - "relation": "eq" - }, - "max_score": null, - "hits": [] - }, - "suggest": { - "simple_phrase": [ - { - "text": "That the qution", - "offset": 0, - "length": 18, - "options": [ - { - "text": "that is the question", - "score": 0.0015543294 - } - ] - } - ] - } -} -``` - - -## Paginate results - -The `from` and `size` parameters return results to your users one page at a time. - -The `from` parameter is the document number that you want to start showing the results from. The `size` parameter is the number of results that you want to show. Together, they let you return a subset of the search results. - -For example, if the value of `size` is 10 and the value of `from` is 0, you see the first 10 results. If you change the value of `from` to 10, you see the next 10 results (because the results are zero-indexed). So, if you want to see results starting from result 11, `from` must be 10. 
- -```json -GET shakespeare/_search -{ - "from": 0, - "size": 10, - "query": { - "match": { - "play_name": "Hamlet" - } - } -} -``` - -To calculate the `from` parameter relative to the page number: - -```json -from = size * (page_number - 1) -``` - -Each time the user chooses the next page of the results, your application needs to make the same search query with an incremented `from` value. - -You can also specify the `from` and `size` parameters in the search URI: - -```json -GET shakespeare/_search?from=0&size=10 -``` - -If you only specify the `size` parameter, the `from` parameter defaults to 0. - -Querying for pages deep in your results can have a significant performance impact, so OpenSearch limits this approach to 10,000 results. - -The `from` and `size` parameters are stateless, so the results are based on the latest available data. -This can cause inconsistent pagination. -For example, assume a user stays on the first page of the results for a minute and then navigates to the second page; in that time, a new document is indexed in the background which is relevant enough to show up on the first page. In this scenario, the last result of the first page is pushed to the second page, so the user ends up seeing a result on the second page that they already saw on the first page. - -Use the `scroll` operation for consistent pagination. The `scroll` operation keeps a search context open for a certain period of time. Any data changes do not affect the results during this time. - -## Scroll search - -The `from` and `size` parameters allow you to paginate your search results, but with a limit of 10,000 results at a time. - -If you need to request massive volumes of data from, for example, a machine learning job, use the `scroll` operation instead. The `scroll` operation allows you to request an unlimited number of results. 
- -To use the scroll operation, add a `scroll` parameter to the request header with a search context to tell OpenSearch how long you need to keep scrolling. This search context needs to be long enough to process a single batch of results. - -To set the number of results that you want returned for each batch, use the `size` parameter: - -```json -GET shakespeare/_search?scroll=10m -{ - "size": 10000 -} -``` - -OpenSearch caches the results and returns a scroll ID to access them in batches: - -```json -"_scroll_id" : "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAUWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ==" -``` - -Pass this scroll ID to the `scroll` operation to get back the next batch of results: - -```json -GET _search/scroll -{ - "scroll": "10m", - "scroll_id": "DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAUWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ==" -} -``` - -Using this scroll ID, you get results in batches of 10,000 as long as the search context is still open. Typically, the scroll ID does not change between requests, but it *can* change, so make sure to always use the latest scroll ID. If you don't send the next scroll request within the set search context, the `scroll` operation does not return any results. - -If you expect billions of results, use a sliced scroll. Slicing allows you to perform multiple scroll operations for the same request, but in parallel. -Set the ID and the maximum number of slices for the scroll: - -```json -GET shakespeare/_search?scroll=10m -{ - "slice": { - "id": 0, - "max": 10 - }, - "query": { - "match_all": {} - } -} -``` - -With a single scroll ID, you get back 10 results. -You can have up to 10 IDs. 
-Perform the same command with ID equal to 1: - -```json -GET shakespeare/_search?scroll=10m -{ - "slice": { - "id": 1, - "max": 10 - }, - "query": { - "match_all": {} - } -} -``` - -Close the search context when you’re done scrolling, because it continues to consume computing resources until the timeout: - -```json -DELETE _search/scroll/DXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAcWdmpUZDhnRFBUcWFtV21nMmFwUGJEQQ== -``` - -#### Sample Response - -```json -{ - "succeeded": true, - "num_freed": 1 -} -``` - -To close all open scroll contexts: - -```json -DELETE _search/scroll/_all -``` - -The `scroll` operation corresponds to a specific timestamp. It doesn't consider documents added after that timestamp as potential results. - -Because open search contexts consume a lot of memory, we suggest you don't use the `scroll` operation for frequent user queries that don't need the search context open. Instead, use the `sort` parameter with the `search_after` parameter to scroll responses for user queries. - -## Sort results - -Sorting allows your users to sort the results in a way that’s most meaningful to them. - -By default, full-text queries sort results by the relevance score. -You can choose to sort the results by any field value in either ascending or descending order. - -For example, to sort results by descending order of a `line_id` value: - -```json -GET shakespeare/_search -{ - "query": { - "term": { - "play_name": { - "value": "Henry IV" - } - } - }, - "sort": [ - { - "line_id": { - "order": "desc" - } - } - ] -} -``` - -The sort parameter is an array, so you can specify multiple field values in the order of their priority. - -If you have two fields with the same value for `line_id`, OpenSearch uses `speech_number`, which is the second option for sorting. 
- -```json -GET shakespeare/_search -{ - "query": { - "term": { - "play_name": { - "value": "Henry IV" - } - } - }, - "sort": [ - { - "line_id": { - "order": "desc" - } - }, - { - "speech_number": { - "order": "desc" - } - } - ] -} -``` - -You can continue to sort by any number of field values to get the results in just the right order. It doesn’t have to be a numerical value---you can also sort by date or timestamp fields: - -```json -"sort": [ - { - "date": { - "order": "desc" - } - } - ] -``` - -For numeric fields that contain an array of numbers, you can sort by `avg`, `sum`, and `median` modes: - -```json -"sort": [ - { - "price": { - "order": "asc", - "mode": "avg" - } - } -] -``` - -To sort by the minimum or maximum values, use the `min` or `max` modes. These modes work for both numeric and string data types. - -A text field that’s analyzed cannot be used to sort documents, because the inverted index only contains the individual tokenized terms and not the entire string. So, you cannot sort by the `play_name`, for example. - -One workaround is map a raw version of the text field as a keyword type, so it won’t be analyzed and you have a copy of the full original version for sorting purposes. - -```json -GET shakespeare/_search -{ - "query": { - "term": { - "play_name": { - "value": "Henry IV" - } - } - }, - "sort": [ - { - "play_name.keyword": { - "order": "desc" - } - } - ] -} -``` - -You get back results sorted by the `play_name` field in alphabetic order. - -Use `sort` with `search_after` parameter for more efficient scrolling. -You get back results after the values you specify in the `search_after` array. - -Make sure you have the same number of values in the `search_after` array as in the `sort` array, also ordered in the same way. -In this case, you get back results after `line_id = 3202` and `speech_number = 8`. 
- -```json -GET shakespeare/_search -{ - "query": { - "term": { - "play_name": { - "value": "Henry IV" - } - } - }, - "sort": [ - { - "line_id": { - "order": "desc" - } - }, - { - "speech_number": { - "order": "desc" - } - } - ], - "search_after": [ - "3202", - "8" - ] -} -``` - -## Highlight query matches - -Highlighting emphasizes the search term(s) in the results. - -To highlight the search terms, add a `highlight` parameter outside of the query block: - -```json -GET shakespeare/_search -{ - "query": { - "match": { - "text_entry": "life" - } - }, - "highlight": { - "fields": { - "text_entry": {} - } - } -} -``` - -For each document in the results, you get back a `highlight` object that shows your search term wrapped in an `em` tag: - -```json -"highlight": { - "text_entry": [ - "my life, except my life." - ] -} -``` - -Design your application code to parse the results from the `highlight` object and perform some action on the search terms, such as changing their color, bolding, italicizing, and so on. - -To change the default `em` tags, use the `pretag` and `posttag` parameters: - -```json -GET shakespeare/_search?format=yaml -{ - "query": { - "match": { - "play_name": "Henry IV" - } - }, - "highlight": { - "pre_tags": [ - "" - ], - "post_tags": [ - "" - ], - "fields": { - "play_name": {} - } - } -} -``` - -The `highlight` parameter highlights the original terms even when using synonyms or stemming for the search itself. 
diff --git a/_opensearch/aggregations.md b/_query-dsl/aggregations/aggregations.md similarity index 98% rename from _opensearch/aggregations.md rename to _query-dsl/aggregations/aggregations.md index 38305997b0..63e99e47dd 100644 --- a/_opensearch/aggregations.md +++ b/_query-dsl/aggregations/aggregations.md @@ -1,8 +1,11 @@ --- layout: default title: Aggregations -nav_order: 14 has_children: true +nav_order: 5 +permalink: /aggregations/ +redirect_from: + - /opensearch/aggregations/ --- # Aggregations diff --git a/_opensearch/bucket-agg.md b/_query-dsl/aggregations/bucket-agg.md similarity index 86% rename from _opensearch/bucket-agg.md rename to _query-dsl/aggregations/bucket-agg.md index 3833b4b484..6154893f2c 100644 --- a/_opensearch/bucket-agg.md +++ b/_query-dsl/aggregations/bucket-agg.md @@ -1,9 +1,11 @@ --- layout: default -title: Bucket Aggregations +title: Bucket aggregations parent: Aggregations -nav_order: 2 -has_children: false +permalink: /aggregations/bucket-agg/ +nav_order: 3 +redirect_from: + - /opensearch/bucket-agg/ --- # Bucket aggregations @@ -12,7 +14,7 @@ Bucket aggregations categorize sets of documents as buckets. The type of bucket You can use bucket aggregations to implement faceted navigation (usually placed as a sidebar on a search result landing page) to help your users narrow down the results. -## terms +## Terms The `terms` aggregation dynamically creates a bucket for each unique term of a field. @@ -74,6 +76,207 @@ The `terms` aggregation requests each shard for its top 3 unique terms. The coor This is especially true if `size` is set to a low number. Because the default size is 10, an error is unlikely to happen. If you don’t need high accuracy and want to increase the performance, you can reduce the size. 
+### Account for pre-aggregated data + +While the `doc_count` field provides a representation of the number of individual documents aggregated in a bucket, `doc_count` by itself does not have a way to correctly increment documents that store pre-aggregated data. To account for pre-aggregated data and accurately calculate the number of documents in a bucket, you can use the `_doc_count` field to add the number of documents in a single summary field. When a document includes the `_doc_count` field, all bucket aggregations recognize its value and increase the bucket `doc_count` cumulatively. Keep these considerations in mind when using the `_doc_count` field: + +* The field does not support nested arrays; only positive integers can be used. +* If a document does not contain the `_doc_count` field, aggregation uses the document to increase the count by 1. + +OpenSearch features that rely on an accurate document count illustrate the importance of using the `_doc_count` field. To see how this field can be used to support other search tools, refer to [Index rollups](https://opensearch.org/docs/latest/im-plugin/index-rollups/index/), an OpenSearch feature for the Index Management (IM) plugin that stores documents with pre-aggregated data in rollup indexes. 
+{: .tip} + +### Example usage + +```json +PUT /my_index/_doc/1 +{ + "response_code": 404, + "date":"2022-08-05", + "_doc_count": 20 +} + +PUT /my_index/_doc/2 +{ + "response_code": 404, + "date":"2022-08-06", + "_doc_count": 10 +} + +PUT /my_index/_doc/3 +{ + "response_code": 200, + "date":"2022-08-06", + "_doc_count": 300 +} + +GET /my_index/_search +{ + "size": 0, + "aggs": { + "response_codes": { + "terms": { + "field" : "response_code" + } + } + } +} +``` + +#### Example response + +```json +{ + "took" : 20, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "response_codes" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : 200, + "doc_count" : 300 + }, + { + "key" : 404, + "doc_count" : 30 + } + ] + } + } +} +``` + +## Multi-terms + +Similar to the `terms` bucket aggregation, you can also search for multiple terms using the `multi_terms` aggregation. Multi-terms aggregations are useful when you need to sort by document count, or when you need to sort by a metric aggregation on a composite key and get the top `n` results. For example, you could search for a specific number of documents (e.g., 1000) and the number of servers per location that show CPU usage greater than 90%. The top number of results would be returned for this multi-term query. + +The `multi_terms` aggregation does consume more memory than a `terms` aggregation, so its performance might be slower. +{: .tip } + +### Multi-terms aggregation parameters + +Parameter | Description +:--- | :--- +multi_terms | Indicates a multi-terms aggregation that gathers buckets of documents together based on criteria specified by multiple terms. +size | Specifies the number of buckets to return. Default is 10. +order | Indicates the order to sort the buckets. 
By default, buckets are ordered according to document count per bucket. If the buckets contain the same document count, then `order` can be explicitly set to the term value instead of document count (for example, set `order` to "max-cpu"). +doc_count | Specifies the number of documents to be returned in each bucket. By default, the top 10 terms are returned. + +#### Sample Request + +```json +GET sample-index100/_search +{ + "size": 0, + "aggs": { + "hot": { + "multi_terms": { + "terms": [{ + "field": "region" + },{ + "field": "host" + }], + "order": {"max-cpu": "desc"} + }, + "aggs": { + "max-cpu": { "max": { "field": "cpu" } } + } + } + } +} +``` + +#### Sample Response + +```json +{ + "took": 118, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 8, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "hot": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": [ + "dub", + "h1" + ], + "key_as_string": "dub|h1", + "doc_count": 2, + "max-cpu": { + "value": 90.0 + } + }, + { + "key": [ + "dub", + "h2" + ], + "key_as_string": "dub|h2", + "doc_count": 2, + "max-cpu": { + "value": 70.0 + } + }, + { + "key": [ + "iad", + "h2" + ], + "key_as_string": "iad|h2", + "doc_count": 2, + "max-cpu": { + "value": 50.0 + } + }, + { + "key": [ + "iad", + "h1" + ], + "key_as_string": "iad|h1", + "doc_count": 2, + "max-cpu": { + "value": 15.0 + } + } + ] + } + } +} +``` + ## sampler, diversified_sampler If you're aggregating over millions of documents, you can use a `sampler` aggregation to reduce its scope to a small sample of documents for a faster response. The `sampler` aggregation selects the samples by top-scoring documents. @@ -552,7 +755,6 @@ The `range` aggregation lets you define the range for each bucket. For example, you can find the number of bytes between 1000 and 2000, 2000 and 3000, and 3000 and 4000.
Within the `range` parameter, you can define ranges as objects of an array. - ```json GET opensearch_dashboards_sample_data_logs/_search { @@ -709,6 +911,7 @@ GET opensearch_dashboards_sample_data_logs/_search } } ``` + If you add a document with malformed fields to an index that has `ip_range` set to `false` in its mappings, OpenSearch rejects the entire document. You can set `ignore_malformed` to `true` to specify that OpenSearch should ignore malformed fields. The default is `false`. ```json @@ -722,6 +925,7 @@ If you add a document with malformed fields to an index that has `ip_range` set } } ``` + ## filter, filters A `filter` aggregation is a query clause, exactly like a search query — `match` or `term` or `range`. You can use the `filter` aggregation to narrow down the entire set of documents to a specific set before creating buckets. diff --git a/_query-dsl/aggregations/geohexgrid-agg.md b/_query-dsl/aggregations/geohexgrid-agg.md new file mode 100644 index 0000000000..a5831649cb --- /dev/null +++ b/_query-dsl/aggregations/geohexgrid-agg.md @@ -0,0 +1,377 @@ +--- +layout: default +title: GeoHex grid aggregations +parent: Aggregations +permalink: /aggregations/geohexgrid/ +nav_order: 4 +--- + +# GeoHex grid aggregations + +The Hexagonal Hierarchical Geospatial Indexing System (H3) partitions the Earth's areas into identifiable hexagon-shaped cells. + +The H3 grid system works well for proximity applications because it overcomes the limitations of Geohash's non-uniform partitions. Geohash encodes latitude and longitude pairs, leading to significantly smaller partitions near the poles and a degree of longitude near the equator. However, the H3 grid system's distortions are low and limited to 5 partitions of 122. These five partitions are placed in low-use areas (for example, in the middle of the ocean), leaving the essential areas error free. Thus, grouping documents based on the H3 grid system provides a better aggregation than the Geohash grid. 
+ +The GeoHex grid aggregation groups [geopoints]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point/) into grid cells for geographical analysis. Each grid cell corresponds to an [H3 cell](https://h3geo.org/docs/core-library/h3Indexing/#h3-cell-indexp) and is identified using the [H3Index representation](https://h3geo.org/docs/core-library/h3Indexing/#h3index-representation). + +## Precision + +The `precision` parameter controls the level of granularity that determines the grid cell size. The lower the precision, the larger the grid cells. + +The following example illustrates low-precision and high-precision aggregation requests. + +To start, create an index and map the `location` field as a `geo_point`: + +```json +PUT national_parks +{ + "mappings": { + "properties": { + "location": { + "type": "geo_point" + } + } + } +} +``` + +Index the following documents into the sample index: + +```json +PUT national_parks/_doc/1 +{ + "name": "Yellowstone National Park", + "location": "44.42, -110.59" +} + +PUT national_parks/_doc/2 +{ + "name": "Yosemite National Park", + "location": "37.87, -119.53" +} + +PUT national_parks/_doc/3 +{ + "name": "Death Valley National Park", + "location": "36.53, -116.93" +} +``` + +You can index geopoints in several formats. For a list of all supported formats, see the [geopoint documentation]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats). +{: .note} + +## Low-precision requests + +Run a low-precision request that buckets all three documents together: + +```json +GET national_parks/_search +{ + "aggregations": { + "grouped": { + "geohex_grid": { + "field": "location", + "precision": 1 + } + } + } +} +``` + +You can use either the `GET` or `POST` HTTP method for GeoHex grid aggregation queries. 
+{: .note} + +The response groups documents 2 and 3 together because they are close enough to be bucketed in one grid cell: + +```json +{ + "took" : 4, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "national_parks", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "name" : "Yellowstone National Park", + "location" : "44.42, -110.59" + } + }, + { + "_index" : "national_parks", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "name" : "Yosemite National Park", + "location" : "37.87, -119.53" + } + }, + { + "_index" : "national_parks", + "_id" : "3", + "_score" : 1.0, + "_source" : { + "name" : "Death Valley National Park", + "location" : "36.53, -116.93" + } + } + ] + }, + "aggregations" : { + "grouped" : { + "buckets" : [ + { + "key" : "8129bffffffffff", + "doc_count" : 2 + }, + { + "key" : "8128bffffffffff", + "doc_count" : 1 + } + ] + } + } +} +``` + +## High-precision requests + +Now run a high-precision request: + +```json +GET national_parks/_search +{ + "aggregations": { + "grouped": { + "geohex_grid": { + "field": "location", + "precision": 6 + } + } + } +} +``` + +All three documents are bucketed separately because of higher granularity: + +```json +{ + "took" : 5, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 1.0, + "hits" : [ + { + "_index" : "national_parks", + "_id" : "1", + "_score" : 1.0, + "_source" : { + "name" : "Yellowstone National Park", + "location" : "44.42, -110.59" + } + }, + { + "_index" : "national_parks", + "_id" : "2", + "_score" : 1.0, + "_source" : { + "name" : "Yosemite National Park", + "location" : "37.87, -119.53" + } + }, + { + "_index" : "national_parks", + "_id" : "3", + "_score" : 1.0, + "_source" : 
{ + "name" : "Death Valley National Park", + "location" : "36.53, -116.93" + } + } + ] + }, + "aggregations" : { + "grouped" : { + "buckets" : [ + { + "key" : "8629ab6dfffffff", + "doc_count" : 1 + }, + { + "key" : "8629857a7ffffff", + "doc_count" : 1 + }, + { + "key" : "862896017ffffff", + "doc_count" : 1 + } + ] + } + } +} +``` + +## Filtering requests + +High-precision requests are resource intensive, so we recommend using a filter like `geo_bounding_box` to limit the geographical area. For example, the following query applies a filter to limit the search area: + +```json +GET national_parks/_search +{ + "size" : 0, + "aggregations": { + "filtered": { + "filter": { + "geo_bounding_box": { + "location": { + "top_left": "38, -120", + "bottom_right": "36, -116" + } + } + }, + "aggregations": { + "grouped": { + "geohex_grid": { + "field": "location", + "precision": 6 + } + } + } + } + } +} +``` + +The response contains the two documents that are within the `geo_bounding_box` bounds: + +```json +{ + "took" : 4, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "filtered" : { + "doc_count" : 2, + "grouped" : { + "buckets" : [ + { + "key" : "8629ab6dfffffff", + "doc_count" : 1 + }, + { + "key" : "8629857a7ffffff", + "doc_count" : 1 + } + ] + } + } + } +} +``` + +You can also restrict the geographical area by providing the coordinates of the bounding envelope in the `bounds` parameter. Both `bounds` and `geo_bounding_box` coordinates can be specified in any of the [geopoint formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats). 
The following query uses the well-known text (WKT) "POINT(`longitude` `latitude`)" format for the `bounds` parameter: + +```json +GET national_parks/_search +{ + "size": 0, + "aggregations": { + "grouped": { + "geohex_grid": { + "field": "location", + "precision": 6, + "bounds": { + "top_left": "POINT (-120 38)", + "bottom_right": "POINT (-116 36)" + } + } + } + } +} +``` + +The response contains only the two results that are within the specified bounds: + +```json +{ + "took" : 3, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "grouped" : { + "buckets" : [ + { + "key" : "8629ab6dfffffff", + "doc_count" : 1 + }, + { + "key" : "8629857a7ffffff", + "doc_count" : 1 + } + ] + } + } +} +``` + +The `bounds` parameter can be used with or without the `geo_bounding_box` filter; these two parameters are independent and can have any spatial relationship to each other. + +## Supported parameters + +GeoHex grid aggregation requests support the following parameters. + +Parameter | Data type | Description +:--- | :--- | :--- +field | String | The field that contains the geopoints. This field must be mapped as a `geo_point` field. If the field contains an array, all array values are aggregated. Required. +precision | Integer | The zoom level used to determine grid cells for bucketing results. Valid values are in the [0, 15] range. Optional. Default is 5. +bounds | Object | The bounding box for filtering geopoints. The bounding box is defined by the top left and bottom right vertices. The vertices are specified as geopoints in one of the following formats:
- An object with a latitude and longitude
- An array in the [`longitude`, `latitude`] format
- A string in the "`latitude`,`longitude`" format
- A Geohash
- WKT
See the [geopoint formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point#formats) for formatting examples. Optional. +size | Integer | The maximum number of buckets to return. When there are more buckets than `size`, OpenSearch returns buckets with more documents. Optional. Default is 10,000. +shard_size | Integer | The maximum number of buckets to return from each shard. Optional. Default is max (10, `size` · number of shards), which provides a more accurate count of more highly prioritized buckets. \ No newline at end of file diff --git a/_opensearch/metric-agg.md b/_query-dsl/aggregations/metric-agg.md similarity index 99% rename from _opensearch/metric-agg.md rename to _query-dsl/aggregations/metric-agg.md index f8917a1873..bc46777658 100644 --- a/_opensearch/metric-agg.md +++ b/_query-dsl/aggregations/metric-agg.md @@ -1,9 +1,11 @@ --- layout: default -title: Metric Aggregations +title: Metric aggregations parent: Aggregations -nav_order: 1 -has_children: false +nav_order: 2 +permalink: /aggregations/metric-agg/ +redirect_from: + - /opensearch/metric-agg/ --- # Metric aggregations diff --git a/_opensearch/pipeline-agg.md b/_query-dsl/aggregations/pipeline-agg.md similarity index 99% rename from _opensearch/pipeline-agg.md rename to _query-dsl/aggregations/pipeline-agg.md index e939c96aa4..017ee7e2e8 100644 --- a/_opensearch/pipeline-agg.md +++ b/_query-dsl/aggregations/pipeline-agg.md @@ -1,8 +1,9 @@ --- layout: default -title: Pipeline Aggregations +title: Pipeline aggregations parent: Aggregations -nav_order: 4 +nav_order: 5 +permalink: /aggregations/pipeline-agg/ has_children: false --- diff --git a/_query-dsl/analyzers/language-analyzers.md b/_query-dsl/analyzers/language-analyzers.md new file mode 100644 index 0000000000..2883349590 --- /dev/null +++ b/_query-dsl/analyzers/language-analyzers.md @@ -0,0 +1,43 @@ +--- +layout: default +title: Language analyzers +nav_order: 45 +parent: Text analyzers +--- + +# Language analyzer + 
+OpenSearch supports the following language values with the `analyzer` option: +arabic, armenian, basque, bengali, brazilian, bulgarian, catalan, czech, danish, dutch, english, estonian, finnish, french, galician, german, greek, hindi, hungarian, indonesian, irish, italian, latvian, lithuanian, norwegian, persian, portuguese, romanian, russian, sorani, spanish, swedish, turkish, and thai. + +To use the analyzer when you map an index, specify the value within your query. For example, to map your index with the French language analyzer, specify the `french` value for the analyzer field: + +```json + "analyzer": "french" + ``` + +#### Sample Request + +The following query maps an index with the language analyzer set to `french`: + +```json +PUT my-index-000001 + +{ + "mappings": { + "properties": { + "text": { + "type": "text", + "fields": { + "french": { + "type": "text", + "analyzer": "french" + } + } + } + } + } +} +``` + + \ No newline at end of file diff --git a/_im-plugin/refresh-analyzer/index.md b/_query-dsl/analyzers/refresh-analyzer.md similarity index 87% rename from _im-plugin/refresh-analyzer/index.md rename to _query-dsl/analyzers/refresh-analyzer.md index 641d34840f..01690b4654 100644 --- a/_im-plugin/refresh-analyzer/index.md +++ b/_query-dsl/analyzers/refresh-analyzer.md @@ -2,9 +2,11 @@ layout: default title: Refresh search analyzer nav_order: 50 -has_children: false -redirect_from: /im-plugin/refresh-analyzer/ +parent: Text analyzers has_toc: false +redirect_from: + - /im-plugin/refresh-analyzer/ + - /im-plugin/refresh-analyzer/index/ --- # Refresh search analyzer diff --git a/_opensearch/query-dsl/text-analyzers.md b/_query-dsl/analyzers/text-analyzers.md similarity index 73% rename from _opensearch/query-dsl/text-analyzers.md rename to _query-dsl/analyzers/text-analyzers.md index b618ee318a..0003740cc5 100644 --- a/_opensearch/query-dsl/text-analyzers.md +++ b/_query-dsl/analyzers/text-analyzers.md @@ -1,8 +1,11 @@ --- layout: default title: Text 
analyzers -parent: Query DSL -nav_order: 75 +nav_order: 190 +has_children: true +permalink: /analyzers/text-analyzers/ +redirect_from: + - /opensearch/query-dsl/text-analyzers/ --- @@ -52,7 +55,7 @@ Each analyzer consists of one tokenizer and zero or more token filters. Differen Option | Valid values | Description :--- | :--- | :--- -`analyzer` | `standard, simple, whitespace, stop, keyword, pattern, language, fingerprint` | The analyzer you want to use for the query. Different analyzers have different character filters, tokenizers, and token filters. The `stop` analyzer, for example, removes stop words (for example, "an," "but," "this") from the query string. For a full list of acceptable language values, see [Language analyzer](#language-analyzer) on this page. +`analyzer` | `standard, simple, whitespace, stop, keyword, pattern, language, fingerprint` | The analyzer you want to use for the query. Different analyzers have different character filters, tokenizers, and token filters. The `stop` analyzer, for example, removes stop words (for example, "an," "but," "this") from the query string. For a full list of acceptable language values, see [Language analyzer]({{site.url}}{{site.baseurl}}/query-dsl/analyzers/language-analyzers/). `quote_analyzer` | String | This option lets you choose to use the standard analyzer without any options, such as `language` or other analyzers. Usage is `"quote_analyzer": "standard"`. - -## Language analyzer - -OpenSearch supports the following language values with the `analyzer` option: -arabic, armenian, basque, bengali, brazilian, bulgarian, catalan, czech, danish, dutch, english, estonian, finnish, french, galician, german, greek, hindi, hungarian, indonesian, irish, italian, latvian, lithuanian, norwegian, persian, portuguese, romanian, russian, sorani, spanish, swedish, turkish, and thai. - -To use the analyzer when you map an index, specify the value within your query.
For example, to map your index with the French language analyzer, specify the `french` value for the analyzer field: - -```json - "analyzer": "french" - ``` - -#### Sample Request - -The following query maps an index with the language analyzer set to `french`: - -```json -PUT my-index-000001 - -{ - "mappings": { - "properties": { - "text": { - "type": "text", - "fields": { - "french": { - "type": "text", - "analyzer": "french" - } - } - } - } - } -} -``` - - \ No newline at end of file diff --git a/_opensearch/query-dsl/compound/bool.md b/_query-dsl/query-dsl/compound/bool.md similarity index 73% rename from _opensearch/query-dsl/compound/bool.md rename to _query-dsl/query-dsl/compound/bool.md index 78669ea09d..897e1838b7 100644 --- a/_opensearch/query-dsl/compound/bool.md +++ b/_query-dsl/query-dsl/compound/bool.md @@ -4,24 +4,30 @@ title: Boolean queries parent: Compound queries grand_parent: Query DSL nav_order: 10 +permalink: /query-dsl/compound/bool/ +redirect_from: + - /opensearch/query-dsl/compound/bool/ --- # Boolean queries -The `bool` query lets you combine multiple search queries with boolean logic. You can use boolean logic between queries to either narrow or broaden your search results. +You can perform a Boolean query with the `bool` query type. A Boolean query compounds query clauses so you can combine multiple search queries with Boolean logic. To narrow or broaden your search results, use the `bool` query clause rules. -The `bool` query is a go-to query because it allows you to construct an advanced query by chaining together several simple ones. +As a compound query type, `bool` allows you to construct an advanced query by combining several simple queries. -Use the following clauses (subqueries) within the `bool` query: +Use the following rules to define how to combine multiple sub-query clauses within a `bool` query: -Clause | Behavior +Clause rule | Behavior :--- | :--- -`must` | The results must match the queries in this clause. 
If you have multiple queries, every single one must match. Acts as an `and` operator. -`must_not` | This is the anti-must clause. All matches are excluded from the results. Acts as a `not` operator. -`should` | The results should, but don't have to, match the queries. Each matching `should` clause increases the relevancy score. As an option, you can require one or more queries to match the value of the `minimum_number_should_match` parameter (default is 1). -`filter` | Filters reduce your dataset before applying the queries. A query within a filter clause is a yes-no option, where if a document matches the query it's included in the results. Otherwise, it's not. Filter queries do not affect the relevancy score that the results are sorted by. The results of a filter query are generally cached so they tend to run faster. Use the filter query to filter the results based on exact matches, ranges, dates, numbers, and so on. +`must` | Logical `and` operator. The results must match the queries in this clause. If you have multiple queries, all of them must match. +`must_not` | Logical `not` operator. All matches are excluded from the results. +`should` | Logical `or` operator. The results must match at least one of the queries, but, optionally, they can match more than one query. Each matching `should` clause increases the relevancy score. You can set the minimum number of queries that must match using the `minimum_should_match` parameter. +`minimum_should_match` | Optional parameter for use with a `should` query clause. Specifies the minimum number of queries that the document must match for it to be returned in the results. The default value is 1 when the query contains no `must` or `filter` clause; otherwise, it is 0. +`filter` | Logical `and` operator that is applied first to reduce your dataset before applying the queries. A query within a filter clause is a yes or no option. If a document matches the query, it is returned in the results; otherwise, it is not.
The results of a filter query are generally cached to allow for a faster return. Use the filter query to filter the results based on exact matches, ranges, dates, numbers, and so on. -The structure of a `bool` query is as follows: +### Boolean query structure + +The structure of a Boolean query contains the `bool` query type followed by clause rules, as follows: ```json GET _search @@ -201,7 +207,7 @@ OpenSearch returns a `matched_queries` array that lists the queries that matched If you remove the queries not in this list, you will still see the exact same result. By examining which `should` clause matched, you can better understand the relevancy score of the results. -You can also construct complex boolean expressions by nesting `bool` queries. +You can also construct complex Boolean expressions by nesting `bool` queries. For example, to find a `text_entry` field that matches (`love` OR `hate`) AND (`life` OR `grace`) in the play `Romeo and Juliet`: ```json diff --git a/_opensearch/query-dsl/compound/index.md b/_query-dsl/query-dsl/compound/index.md similarity index 93% rename from _opensearch/query-dsl/compound/index.md rename to _query-dsl/query-dsl/compound/index.md index 239af81d46..ca74c884e5 100644 --- a/_opensearch/query-dsl/compound/index.md +++ b/_query-dsl/query-dsl/compound/index.md @@ -4,6 +4,9 @@ title: Compound queries parent: Query DSL has_children: true nav_order: 40 +permalink: /query-dsl/compound/ +redirect_from: + - /opensearch/query-dsl/compound/index/ --- # Compound queries diff --git a/_opensearch/query-dsl/full-text/index.md b/_query-dsl/query-dsl/full-text/index.md similarity index 63% rename from _opensearch/query-dsl/full-text/index.md rename to _query-dsl/query-dsl/full-text/index.md index 9960414d57..1b396742f2 100644 --- a/_opensearch/query-dsl/full-text/index.md +++ b/_query-dsl/query-dsl/full-text/index.md @@ -4,27 +4,54 @@ title: Full-text queries parent: Query DSL has_children: true nav_order: 30 +permalink: /query-dsl/full-text/ 
+redirect_from: + - /opensearch/query-dsl/full-text/ + - /opensearch/query-dsl/full-text/index/ --- # Full-text queries -This page lists all full-text query types and common options. Given the sheer number of options and subtle behaviors, the best method of ensuring useful search results is to test different queries against representative indices and verify the output. +This page lists all full-text query types and common options. There are many optional fields that you can use to create subtle search behaviors, so we recommend that you test out some basic query types against representative indexes and verify the output before you perform more advanced or complex searches with multiple options. +OpenSearch uses the Apache Lucene search library, which provides highly efficient data structures and algorithms for ingesting, indexing, searching, and aggregating data. +To learn more about search query classes, see [Lucene query JavaDocs](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/Query.html). + +The full-text query types shown in this section use the standard analyzer, which analyzes text automatically when the query is submitted. + +You can also analyze fields when you index them. To learn more about how to convert unstructured text into structured text that is optimized for search, see [Optimizing text for searches with text analyzers]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/text-analyzers). +{: .note } + + --- #### Table of contents + 1. TOC {:toc} - --- -## Match +Common terms queries and the optional query field `cutoff_frequency` are now deprecated. +{: .note } + +## Query types + +OpenSearch Query DSL provides multiple query types that you can use in your searches. + +### Match +Use the `match` query for full-text search of a specific document field. The `match` query analyzes the provided search string and returns documents that match any of the string's terms. + +You can use Boolean query operators to combine searches. 
+ + -The most basic form of the query provides only a field (`title`) and a term (`wind`): +The following example shows a basic `match` search for the `title` field set to the value `wind`: ```json GET _search @@ -51,7 +78,7 @@ curl --insecure -XGET -u 'admin:admin' https://://_search \ }' ``` -The query accepts the following options. For descriptions of each, see [Options](#options). +The query accepts the following options. For descriptions of each, see [Advanced filter options](#advanced-filter-options). ```json GET _search @@ -67,7 +94,6 @@ GET _search "analyzer": "standard", "zero_terms_query": "none", "lenient": false, - "cutoff_frequency": 0.01, "prefix_length": 0, "max_expansions": 50, "boost": 1 @@ -77,10 +103,9 @@ GET _search } ``` +### Multi-match -## Multi match - -Similar to [match](#match), but searches multiple fields. +You can use the `multi_match` query type to search multiple fields. Multi-match operation functions similarly to the [match](#match) operation. The `^` lets you "boost" certain fields. Boosts are multipliers that weigh matches in one field more heavily than matches in other fields. In the following example, a match for "wind" in the title field influences `_score` four times as much as a match in the plot field. The result is that films like *The Wind Rises* and *Gone with the Wind* are near the top of the search results, and films like *Twister* and *Sharknado*, which presumably have "wind" in their plot summaries, are near the bottom. @@ -96,7 +121,7 @@ GET _search } ``` -The query accepts the following options. For descriptions of each, see [Options](#options). +The query accepts the following options. For descriptions of each, see [Advanced filter options](#advanced-filter-options). 
```json GET _search @@ -117,17 +142,15 @@ GET _search "prefix_length": 0, "max_expansions": 50, "auto_generate_synonyms_phrase_query": true, - "cutoff_frequency": 0.01, "zero_terms_query": "none" } } } ``` +### Match Boolean prefix -## Match boolean prefix - -Similar to [match](#match), but creates a [prefix query](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/PrefixQuery.html) out of the last term in the query string. +The `match_bool_prefix` query analyzes the provided search string and creates a `bool` query from the string's terms. It uses every term except the last term as a whole word for matching. The last term is used as a prefix. The `match_bool_prefix` query returns documents that contain either the whole-word terms or terms that start with the prefix term, in any order. ```json GET _search @@ -140,7 +163,7 @@ GET _search } ``` -The query accepts the following options. For descriptions of each, see [Options](#options). +The query accepts the following options. For descriptions of each, see [Advanced filter options](#advanced-filter-options). ```json GET _search @@ -162,8 +185,11 @@ GET _search } ``` +For more reference information about prefix queries, see the [Lucene documentation](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/PrefixQuery.html). + +### Match phrase -## Match phrase +Use the `match_phrase` query to match documents that contain an exact phrase in a specified order. You can add flexibility to phrase matching by providing the `slop` parameter. Creates a [phrase query](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/PhraseQuery.html) that matches a sequence of terms. @@ -178,7 +204,7 @@ GET _search } ``` -The query accepts the following options. For descriptions of each, see [Options](#options). +The query accepts the following options. For descriptions of each, see [Advanced filter options](#advanced-filter-options). 
```json GET _search @@ -196,8 +222,9 @@ GET _search } ``` +### Match phrase prefix -## Match phrase prefix +Use the `match_phrase_prefix` query to specify a phrase to match in order. The documents that contain the phrase you specify will be returned. The last partial term in the phrase is interpreted as a prefix, so any documents that contain phrases that begin with the phrase and prefix of the last term will be returned. Similar to [match phrase](#match-phrase), but creates a [prefix query](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/PrefixQuery.html) out of the last term in the query string. @@ -212,7 +239,7 @@ GET _search } ``` -The query accepts the following options. For descriptions of each, see [Options](#options). +The query accepts the following options. For descriptions of each, see [Advanced filter options](#advanced-filter-options). ```json GET _search @@ -229,8 +256,7 @@ GET _search } } ``` - - + +### Query string The query string query splits text based on operators and analyzes each individually. @@ -290,7 +315,7 @@ GET _search } ``` -The query accepts the following options. For descriptions of each, see [Options](#options). +The query accepts the following options. For descriptions of each, see [Advanced filter options](#advanced-filter-options). ```json GET _search @@ -323,10 +348,9 @@ GET _search } ``` +### Simple query string -## Simple query string - -The simple query string query is like the query string query, but it lets advanced users specify many arguments directly in the query string. The query discards any invalid portions of the query string. +Use the `simple_query_string` type to specify directly in the query string multiple arguments delineated by regular expressions. Searches with this type will discard any invalid portions of the string. ```json GET _search @@ -347,10 +371,10 @@ Special character | Behavior `*` | Acts as a wildcard. `""` | Wraps several terms into a phrase. `()` | Wraps a clause for precedence. 
-`~n` | When used after a term (e.g. `wnid~3`), sets `fuzziness`. When used after a phrase, sets `slop`. See [Options](#options). +`~n` | When used after a term (for example, `wnid~3`), sets `fuzziness`. When used after a phrase, sets `slop`. See [Advanced filter options](#advanced-filter-options). `-` | Negates the term. -The query accepts the following options. For descriptions of each, see [Options](#options). +The query accepts the following options. For descriptions of each, see [Advanced filter options](#advanced-filter-options). ```json GET _search @@ -375,10 +399,9 @@ GET _search } ``` +### Match all -## Match all - -Matches all documents. Can be useful for testing. +The `match_all` query type will return all documents. This type can be useful in testing large document sets if you need to return the entire set. ```json GET _search @@ -389,7 +412,7 @@ GET _search } ``` - + +## Advanced filter options +You can filter your query results by using some of the optional query fields, such as wildcards, fuzzy query fields, and synonyms. You can also use analyzers as optional query fields. To learn more, see [How to use text analyzers]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/text-analyzers/#how-to-use-text-analyzers). -## Options +### Wildcard options Option | Valid values | Description :--- | :--- | :--- -`allow_leading_wildcard` | Boolean | Whether `*` and `?` are allowed as the first character of a search term. The default is true. +`allow_leading_wildcard` | Boolean | Whether `*` and `?` are allowed as the first character of a search term. The default is `true`. `analyze_wildcard` | Boolean | Whether OpenSearch should attempt to analyze wildcard terms. Some analyzers do a poor job at this task, so the default is false. -`analyzer` | `standard, simple, whitespace, stop, keyword, pattern, , fingerprint` | The analyzer you want to use for the query. Different analyzers have different character filters, tokenizers, and token filters.
The `stop` analyzer, for example, removes stop words (e.g. "an," "but," "this") from the query string. -`auto_generate_synonyms_phrase_query` | Boolean | A value of true (default) automatically generates [phrase queries](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/PhraseQuery.html) for multi-term synonyms. For example, if you have the synonym `"ba, batting average"` and search for "ba," OpenSearch searches for `ba OR "batting average"` (if this option is true) or `ba OR (batting AND average)` (if this option is false). + +### Fuzzy query options + +Option | Valid values | Description +:--- | :--- | :--- +`fuzziness` | `AUTO`, `0`, or a positive integer | The number of character edits (insert, delete, substitute) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. +`fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to true (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n"). If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases. +`fuzzy_max_expansions` | Positive integer | Fuzzy queries "expand to" a number of matching terms that are within the distance specified in `fuzziness`. Then OpenSearch tries to match those terms against its indexes. + +### Synonyms in a multiple terms search + +You can also use synonyms with the `terms` query type to search for multiple terms. Use the `auto_generate_synonyms_phrase_query` Boolean field. 
By default it is set to `true`. It automatically generates phrase queries for multiple term synonyms. For example, if you have the synonym `"ba, batting average"` and search for "ba," OpenSearch searches for `ba OR "batting average"` when the option is `true` or `ba OR (batting AND average)` when the option is `false`. + +To learn more about the multiple terms query type, see [Terms]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/term/#terms). For more reference information about phrase queries, see the [Lucene documentation](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/PhraseQuery.html). + +### Other advanced options + +You can also use the following optional query fields to filter your query results. + +Option | Valid values | Description +:--- | :--- | :--- `boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. The default is 1.0. -`cutoff_frequency` | Between `0.0` and `1.0` or a positive integer | This value lets you define high and low frequency terms based on number of occurrences in the index. Numbers between 0 and 1 are treated as a percentage. For example, 0.10 is 10%. This value means that if a word occurs within the search field in more than 10% of the documents on the shard, OpenSearch considers the word "high frequency" and deemphasizes it when calculating search score.

Because this setting is *per shard*, testing its impact on search results can be challenging unless a cluster has many documents. `enable_position_increments` | Boolean | When true, result queries are aware of position increments. This setting is useful when the removal of stop words leaves an unwanted "gap" between terms. The default is true. `fields` | String array | The list of fields to search (e.g. `"fields": ["title^4", "description"]`). If unspecified, defaults to the `index.query.default_field` setting, which defaults to `["*"]`. -`flags` | String | A `|`-delimited string of [flags](#simple-query-string) to enable (e.g. `AND|OR|NOT`). The default is `ALL`. -`fuzziness` | `AUTO`, `0`, or a positive integer | The number of character edits (insert, delete, substitute) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases. -`fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to true (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n").

If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases. +`flags` | String | A `|`-delimited string of [flags](#simple-query-string) to enable (e.g., `AND|OR|NOT`). The default is `ALL`. You can explicitly set the value for `default_field`. For example, to return all titles, set it to `"default_field": "title"`. `lenient` | Boolean | Setting `lenient` to true lets you ignore data type mismatches between the query and the document field. For example, a query string of "8.2" could match a field of type `float`. The default is false. -`low_freq_operator` | `and, or` | The operator for low-frequency terms. The default is `or`. See [Common terms](#common-terms) queries and `operator` in this table. +`low_freq_operator` | `and, or` | The operator for low-frequency terms. The default is `or`. See also `operator` in this table. `max_determinized_states` | Positive integer | The maximum number of "[states](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/util/automaton/Operations.html#DEFAULT_MAX_DETERMINIZED_STATES)" (a measure of complexity) that Lucene can create for query strings that contain regular expressions (e.g. `"query": "/wind.+?/"`). Larger numbers allow for queries that use more memory. The default is 10,000. -`max_expansions` | Positive integer | Fuzzy queries "expand to" a number of matching terms that are within the distance specified in `fuzziness`. Then OpenSearch tries to match those terms against its indices. `max_expansions` specifies the maximum number of terms that the fuzzy query expands to. The default is 50. -`minimum_should_match` | Positive or negative integer, positive or negative percentage, combination | If the query string contains multiple search terms and you used the `or` operator, the number of terms that need to match for the document to be considered a match. 
For example, if `minimum_should_match` is 2, "wind often rising" does not match "The Wind Rises." If `minimum_should_match` is 1, it matches. This option also has `low_freq` and `high_freq` properties for [Common terms](#common-terms) queries. +`max_expansions` | Positive integer | `max_expansions` specifies the maximum number of terms to which the query can expand. The default is 50. +`minimum_should_match` | Positive or negative integer, positive or negative percentage, combination | If the query string contains multiple search terms and you used the `or` operator, the number of terms that need to match for the document to be considered a match. For example, if `minimum_should_match` is 2, "wind often rising" does not match "The Wind Rises." If `minimum_should_match` is 1, it matches. `operator` | `or, and` | If the query string contains multiple search terms, whether all terms need to match (`and`) or only one term needs to match (`or`) for a document to be considered a match. `phrase_slop` | `0` (default) or a positive integer | See `slop`. `prefix_length` | `0` (default) or a positive integer | The number of leading characters that are not considered in fuzziness. @@ -431,6 +473,9 @@ Option | Valid values | Description `rewrite` | `constant_score, scoring_boolean, constant_score_boolean, top_terms_N, top_terms_boost_N, top_terms_blended_freqs_N` | Determines how OpenSearch rewrites and scores multi-term queries. The default is `constant_score`. `slop` | `0` (default) or a positive integer | Controls the degree to which words in a query can be misordered and still be considered a match. From the [Lucene documentation](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/search/PhraseQuery.html#getSlop--): "The number of other words permitted between words in query phrase. For example, to switch the order of two words requires two moves (the first move places the words atop one another), so to permit re-orderings of phrases, the slop must be at least two. 
A value of zero requires an exact match." `tie_breaker` | `0.0` (default) to `1.0` | Changes the way OpenSearch scores searches. For example, a `type` of `best_fields` typically uses the highest score from any one field. If you specify a `tie_breaker` value between 0.0 and 1.0, the score changes to highest score + `tie_breaker` * score for all other matching fields. If you specify a value of 1.0, OpenSearch adds together the scores for all matching fields (effectively defeating the purpose of `best_fields`). -`time_zone` | UTC offset | The time zone to use (e.g. `-08:00`) if the query string contains a date range (e.g. `"query": "wind rises release_date[2012-01-01 TO 2014-01-01]"`). The default is `UTC`. +`time_zone` | UTC offset hours | Specifies the number of hours to offset the desired time zone from `UTC`. You need to indicate the time zone offset number if the query string contains a date range. For example, set `"time_zone": "-08:00"` for a query with a date range such as `"query": "wind rises release_date[2012-01-01 TO 2014-01-01]"`. The default time zone is `UTC`. `type` | `best_fields, most_fields, cross_fields, phrase, phrase_prefix` | Determines how OpenSearch executes the query and scores the results. The default is `best_fields`. `zero_terms_query` | `none, all` | If the analyzer removes all terms from a query string, whether to match no documents (default) or all documents. For example, the `stop` analyzer removes all terms from the string "an but this."
+ + diff --git a/_opensearch/query-dsl/full-text/query-string.md b/_query-dsl/query-dsl/full-text/query-string.md similarity index 98% rename from _opensearch/query-dsl/full-text/query-string.md rename to _query-dsl/query-dsl/full-text/query-string.md index 3688a2d239..258caa1416 100644 --- a/_opensearch/query-dsl/full-text/query-string.md +++ b/_query-dsl/query-dsl/full-text/query-string.md @@ -4,6 +4,9 @@ title: Query string queries parent: Full-text queries grand_parent: Query DSL nav_order: 25 +permalink: /query-dsl/full-text/query-string/ +redirect_from: + - /opensearch/query-dsl/full-text/query-string/ --- # Query string queries diff --git a/_opensearch/query-dsl/geo-and-xy/geo-bounding-box.md b/_query-dsl/query-dsl/geo-and-xy/geo-bounding-box.md similarity index 98% rename from _opensearch/query-dsl/geo-and-xy/geo-bounding-box.md rename to _query-dsl/query-dsl/geo-and-xy/geo-bounding-box.md index 7177334827..0dc63f3452 100644 --- a/_opensearch/query-dsl/geo-and-xy/geo-bounding-box.md +++ b/_query-dsl/query-dsl/geo-and-xy/geo-bounding-box.md @@ -4,6 +4,9 @@ title: Geo-bounding box queries parent: Geographic and xy queries grand_parent: Query DSL nav_order: 10 +permalink: /query-dsl/geo-and-xy/geo-bounding-box/ +redirect_from: + - /opensearch/query-dsl/geo-and-xy/geo-bounding-box/ --- # Geo-bounding box queries diff --git a/_opensearch/query-dsl/geo-and-xy/index.md b/_query-dsl/query-dsl/geo-and-xy/index.md similarity index 96% rename from _opensearch/query-dsl/geo-and-xy/index.md rename to _query-dsl/query-dsl/geo-and-xy/index.md index ba9f2b590e..7c2dadb4cb 100644 --- a/_opensearch/query-dsl/geo-and-xy/index.md +++ b/_query-dsl/query-dsl/geo-and-xy/index.md @@ -4,6 +4,9 @@ title: Geographic and xy queries parent: Query DSL has_children: true nav_order: 50 +permalink: /query-dsl/geo-and-xy/ +redirect_from: + - /opensearch/query-dsl/geo-and-xy/index/ --- # Geographic and xy queries diff --git a/_query-dsl/query-dsl/geo-and-xy/xy.md 
b/_query-dsl/query-dsl/geo-and-xy/xy.md new file mode 100644 index 0000000000..6b29063bf6 --- /dev/null +++ b/_query-dsl/query-dsl/geo-and-xy/xy.md @@ -0,0 +1,438 @@ +--- +layout: default +title: xy queries +parent: Geographic and xy queries +grand_parent: Query DSL +nav_order: 50 +permalink: /query-dsl/geo-and-xy/xy/ +redirect_from: + - /opensearch/query-dsl/geo-and-xy/xy/ +--- + +# xy queries + +To search for documents that contain [xy point]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/xy-point) and [xy shape]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/xy-shape) fields, use an xy query. + +## Spatial relations + +When you provide an xy shape to the xy query, the xy fields are matched using the following spatial relations to the provided shape. + +Relation | Description | Supporting xy Field Type +:--- | :--- | :--- +`INTERSECTS` | (Default) Matches documents whose xy point or xy shape intersects the shape provided in the query. | `xy_point`, `xy_shape` +`DISJOINT` | Matches documents whose xy shape does not intersect with the shape provided in the query. | `xy_shape` +`WITHIN` | Matches documents whose xy shape is completely within the shape provided in the query. | `xy_shape` +`CONTAINS` | Matches documents whose xy shape completely contains the shape provided in the query. | `xy_shape` + +The following examples illustrate searching for documents that contain xy shapes. To learn how to search for documents that contain xy points, see the [Querying xy points](#querying-xy-points) section. + +## Defining the shape in an xy query + +You can define the shape in an xy query either by providing a new shape definition at query time or by referencing the name of a shape pre-indexed in another index. + +### Using a new shape definition + +To provide a new shape to an xy query, define it in the `xy_shape` field. + +The following example illustrates searching for documents with xy shapes that match an xy shape defined at query time. 
+ +First, create an index and map the `geometry` field as an `xy_shape`: + +```json +PUT testindex +{ + "mappings": { + "properties": { + "geometry": { + "type": "xy_shape" + } + } + } +} +``` + +Index a document with a point and a document with a polygon: + +```json +PUT testindex/_doc/1 +{ + "geometry": { + "type": "point", + "coordinates": [0.5, 3.0] + } +} + +PUT testindex/_doc/2 +{ + "geometry" : { + "type" : "polygon", + "coordinates" : [ + [[2.5, 6.0], + [0.5, 4.5], + [1.5, 2.0], + [3.5, 3.5], + [2.5, 6.0]] + ] + } +} +``` + +Define an [`envelope`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/xy-shape#envelope)—a bounding rectangle in the `[[minX, maxY], [maxX, minY]]` format. Search for documents with xy points or shapes that are within that envelope: + +```json +GET testindex/_search +{ + "query": { + "xy_shape": { + "geometry": { + "shape": { + "type": "envelope", + "coordinates": [ [ 0.0, 6.0], [ 4.0, 2.0] ] + }, + "relation": "WITHIN" + } + } + } +} +``` + +The following image depicts the example. Both the point and the polygon are within the bounding envelope.
+ +xy shape query + + +The response contains both documents: + +```json +{ + "took" : 363, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 0.0, + "hits" : [ + { + "_index" : "testindex", + "_id" : "1", + "_score" : 0.0, + "_source" : { + "geometry" : { + "type" : "point", + "coordinates" : [ + 0.5, + 3.0 + ] + } + } + }, + { + "_index" : "testindex", + "_id" : "2", + "_score" : 0.0, + "_source" : { + "geometry" : { + "type" : "polygon", + "coordinates" : [ + [ + [ + 2.5, + 6.0 + ], + [ + 0.5, + 4.5 + ], + [ + 1.5, + 2.0 + ], + [ + 3.5, + 3.5 + ], + [ + 2.5, + 6.0 + ] + ] + ] + } + } + } + ] + } +} +``` + +### Using a pre-indexed shape definition + +When constructing an xy query, you can also reference the name of a shape pre-indexed in another index. Using this method, you can define an xy shape at index time and refer to it by name, providing the following parameters in the `indexed_shape` object. + +Parameter | Description +:--- | :--- +index | The name of the index that contains the pre-indexed shape. +id | The document ID of the document that contains the pre-indexed shape. +path | The field name of the field that contains the pre-indexed shape as a path. + +The following example illustrates referencing the name of a shape pre-indexed in another index. In this example, the index `pre-indexed-shapes` contains the shape that defines the boundaries, and the index `testindex` contains the shapes whose locations are checked against those boundaries. 
+ +First, create an index `pre-indexed-shapes` and map the `geometry` field for this index as an `xy_shape`: + +```json +PUT pre-indexed-shapes +{ + "mappings": { + "properties": { + "geometry": { + "type": "xy_shape" + } + } + } +} +``` + +Index an envelope that specifies the boundaries and name it `rectangle`: + +```json +PUT pre-indexed-shapes/_doc/rectangle +{ + "geometry": { + "type": "envelope", + "coordinates" : [ [ 0.0, 6.0], [ 4.0, 2.0] ] + } +} +``` + +Index a document with a point and a document with a polygon into the index `testindex`: + +```json +PUT testindex/_doc/1 +{ + "geometry": { + "type": "point", + "coordinates": [0.5, 3.0] + } +} + +PUT testindex/_doc/2 +{ + "geometry" : { + "type" : "polygon", + "coordinates" : [ + [[2.5, 6.0], + [0.5, 4.5], + [1.5, 2.0], + [3.5, 3.5], + [2.5, 6.0]] + ] + } +} +``` + +Search for documents with shapes that intersect `rectangle` in the index `testindex` using a filter: + +```json +GET testindex/_search +{ + "query": { + "bool": { + "filter": { + "xy_shape": { + "geometry": { + "indexed_shape": { + "index": "pre-indexed-shapes", + "id": "rectangle", + "path": "geometry" + } + } + } + } + } + } +} +``` + +The preceding query uses the default spatial relation `INTERSECTS` and returns both the point and the polygon: + +```json +{ + "took" : 26, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 0.0, + "hits" : [ + { + "_index" : "testindex", + "_id" : "1", + "_score" : 0.0, + "_source" : { + "geometry" : { + "type" : "point", + "coordinates" : [ + 0.5, + 3.0 + ] + } + } + }, + { + "_index" : "testindex", + "_id" : "2", + "_score" : 0.0, + "_source" : { + "geometry" : { + "type" : "polygon", + "coordinates" : [ + [ + [ + 2.5, + 6.0 + ], + [ + 0.5, + 4.5 + ], + [ + 1.5, + 2.0 + ], + [ + 3.5, + 3.5 + ], + [ + 2.5, + 6.0 + ] + ] + ] + } + } + } + ] + } +} +``` + +## Querying xy 
points + +You can also use an xy query to search for documents that contain xy points. + +Create a mapping with `point` as `xy_point`: + +```json +PUT testindex1 +{ + "mappings": { + "properties": { + "point": { + "type": "xy_point" + } + } + } +} +``` + +Index three points: + +```json +PUT testindex1/_doc/1 +{ + "point": "1.0, 1.0" +} + +PUT testindex1/_doc/2 +{ + "point": "2.0, 0.0" +} + +PUT testindex1/_doc/3 +{ + "point": "-2.0, 2.0" +} +``` + +Search for points that lie within the circle with the center at (0, 0) and a radius of 2: + +```json +GET testindex1/_search +{ + "query": { + "xy_shape": { + "point": { + "shape": { + "type": "circle", + "coordinates": [0.0, 0.0], + "radius": 2 + } + } + } + } +} +``` + +xy point only supports the default `INTERSECTS` spatial relation, so you don't need to provide the `relation` parameter. +{: .note} + +The following image depicts the example. Points 1 and 2 are within the circle, and point 3 is outside the circle. + +xy point query + +The response returns documents 1 and 2: + +```json +{ + "took" : 575, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 2, + "relation" : "eq" + }, + "max_score" : 0.0, + "hits" : [ + { + "_index" : "testindex1", + "_id" : "1", + "_score" : 0.0, + "_source" : { + "point" : "1.0, 1.0" + } + }, + { + "_index" : "testindex1", + "_id" : "2", + "_score" : 0.0, + "_source" : { + "point" : "2.0, 0.0" + } + } + ] + } +} +``` \ No newline at end of file diff --git a/_opensearch/query-dsl/index.md b/_query-dsl/query-dsl/index.md similarity index 98% rename from _opensearch/query-dsl/index.md rename to _query-dsl/query-dsl/index.md index 6f7c277b24..520e2bd737 100644 --- a/_opensearch/query-dsl/index.md +++ b/_query-dsl/query-dsl/index.md @@ -1,10 +1,12 @@ --- layout: default title: Query DSL -nav_order: 27 +nav_order: 2 has_children: true +permalink: /query-dsl/ redirect_from: - 
/opensearch/query-dsl/ + - /opensearch/query-dsl/index/ - /docs/opensearch/query-dsl/ --- diff --git a/_opensearch/query-dsl/query-filter-context.md b/_query-dsl/query-dsl/query-filter-context.md similarity index 98% rename from _opensearch/query-dsl/query-filter-context.md rename to _query-dsl/query-dsl/query-filter-context.md index 53f716c234..05996bfd8c 100644 --- a/_opensearch/query-dsl/query-filter-context.md +++ b/_query-dsl/query-dsl/query-filter-context.md @@ -2,6 +2,7 @@ layout: default title: Query and filter context parent: Query DSL +permalink: /query-dsl/query-filter-context/ nav_order: 5 --- diff --git a/_opensearch/query-dsl/span-query.md b/_query-dsl/query-dsl/span-query.md similarity index 94% rename from _opensearch/query-dsl/span-query.md rename to _query-dsl/query-dsl/span-query.md index 6ed2842991..912505843b 100644 --- a/_opensearch/query-dsl/span-query.md +++ b/_query-dsl/query-dsl/span-query.md @@ -3,6 +3,9 @@ layout: default title: Span queries parent: Query DSL nav_order: 60 +permalink: /query-dsl/span-query/ +redirect_from: + - /opensearch/query-dsl/span-query/ --- # Span queries diff --git a/_opensearch/query-dsl/term-vs-full-text.md b/_query-dsl/query-dsl/term-vs-full-text.md similarity index 99% rename from _opensearch/query-dsl/term-vs-full-text.md rename to _query-dsl/query-dsl/term-vs-full-text.md index c35fa77bd0..68a912b541 100644 --- a/_opensearch/query-dsl/term-vs-full-text.md +++ b/_query-dsl/query-dsl/term-vs-full-text.md @@ -2,6 +2,7 @@ layout: default title: Term-level and full-text queries compared parent: Query DSL +permalink: /query-dsl/term-vs-full-text/ nav_order: 10 --- diff --git a/_opensearch/query-dsl/term.md b/_query-dsl/query-dsl/term.md similarity index 95% rename from _opensearch/query-dsl/term.md rename to _query-dsl/query-dsl/term.md index ffe33cd3cd..38a43f9709 100644 --- a/_opensearch/query-dsl/term.md +++ b/_query-dsl/query-dsl/term.md @@ -3,6 +3,9 @@ layout: default title: Term-level queries parent: Query 
DSL nav_order: 20 +permalink: /query-dsl/term/ +redirect_from: + - /opensearch/query-dsl/term/ --- # Term-level queries @@ -226,7 +229,7 @@ GET shakespeare/_search ## Range -Use the `range` query to search for a range of values in a field. +You can search for a range of values in a field with the `range` query. To search for documents where the `line_id` value is >= 10 and <= 20: @@ -252,6 +255,9 @@ Parameter | Behavior `lte` | Less than or equal to. `lt` | Less than. +In addition to the range query parameters, you can provide date formats or relation operators such as "contains" or "within." To see the supported field types for range queries, see [Range query optional parameters]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/range/#range-query). To see all date formats, see [Formats]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/#formats). +{: .tip } + Assume that you have a `products` index and you want to find all the products that were added in the year 2019: ```json diff --git a/_search-plugins/async/security.md b/_search-plugins/async/security.md index 198d9c9ab2..c7cd058cbe 100644 --- a/_search-plugins/async/security.md +++ b/_search-plugins/async/security.md @@ -10,7 +10,7 @@ has_children: false You can use the security plugin with asynchronous searches to limit non-admin users to specific actions. For example, you might want some users to only be able to submit or delete asynchronous searches, while you might want others to only view the results. -All asynchronous search indices are protected as system indices. Only a super admin user or an admin user with a Transport Layer Security (TLS) certificate can access system indices. For more information, see [System indexes]({{site.url}}{{site.baseurl}}/security/configuration/system-indexes/). +All asynchronous search indices are protected as system indices. Only a super admin user or an admin user with a Transport Layer Security (TLS) certificate can access system indices. 
For more information, see [System indices]({{site.url}}{{site.baseurl}}/security/configuration/system-indices/). ## Basic permissions diff --git a/_search-plugins/knn/api.md b/_search-plugins/knn/api.md index ff667c480a..46c20d3b04 100644 --- a/_search-plugins/knn/api.md +++ b/_search-plugins/knn/api.md @@ -1,7 +1,7 @@ --- layout: default title: API -nav_order: 5 +nav_order: 30 parent: k-NN has_children: false --- @@ -331,7 +331,7 @@ POST /_plugins/_knn/models/{model_id}/_train?preference={node_id} "engine":"faiss", "space_type": "l2", "parameters":{ - "nlists":128, + "nlist":128, "encoder":{ "name":"pq", "parameters":{ @@ -361,7 +361,7 @@ POST /_plugins/_knn/models/_train?preference={node_id} "engine":"faiss", "space_type": "l2", "parameters":{ - "nlists":128, + "nlist":128, "encoder":{ "name":"pq", "parameters":{ diff --git a/_search-plugins/knn/approximate-knn.md b/_search-plugins/knn/approximate-knn.md index 722e74a5a2..913d7a956d 100644 --- a/_search-plugins/knn/approximate-knn.md +++ b/_search-plugins/knn/approximate-knn.md @@ -1,7 +1,7 @@ --- layout: default title: Approximate search -nav_order: 2 +nav_order: 10 parent: k-NN has_children: false has_math: true @@ -9,23 +9,34 @@ has_math: true # Approximate k-NN search -The approximate k-NN search method uses nearest neighbor algorithms from *nmslib* and *faiss* to power -k-NN search. To see the algorithms that the plugin currently supports, check out the [k-NN Index documentation]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions). -In this case, approximate means that for a given search, the neighbors returned are an estimate of the true k-nearest neighbors. Of the three search methods the plugin provides, this method offers the best search scalability for large data sets. Generally speaking, once the data set gets into the hundreds of thousands of vectors, this approach is preferred. 
+Standard k-NN search methods compute similarity using a brute-force approach that measures the nearest distance between a query and a number of points, which produces exact results. This works well in many applications. However, in the case of extremely large datasets with high dimensionality, this creates a scaling problem that reduces the efficiency of the search. Approximate k-NN search methods can overcome this by employing tools that restructure indexes more efficiently and reduce the dimensionality of searchable vectors. Using this approach requires a sacrifice in accuracy but increases search processing speeds appreciably. -The k-NN plugin builds a native library index of the vectors for each "knn-vector field"/ "Lucene segment" pair during indexing that can be used to efficiently find the k-nearest neighbors to a query vector during search. To learn more about Lucene segments, see the [Apache Lucene documentation](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/codecs/lucene87/package-summary.html#package.description). -These native library indices are loaded into native memory during search and managed by a cache. To learn more about -pre-loading native library indices into memory, refer to the [warmup API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#warmup-operation). Additionally, you can see what native library indices are already loaded in memory, which you can learn more about in the [stats API section]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#stats). +The approximate k-NN search methods leveraged by OpenSearch use approximate nearest neighbor (ANN) algorithms from the [nmslib](https://github.com/nmslib/nmslib), [faiss](https://github.com/facebookresearch/faiss), and [Lucene](https://lucene.apache.org/) libraries to power k-NN search. These search methods employ ANN to improve search latency for large datasets.
Of the three search methods the k-NN plugin provides, this method offers the best search scalability for large datasets. This approach is the preferred method when a dataset reaches hundreds of thousands of vectors. -Because the native library indices are constructed during indexing, it is not possible to apply a filter on an index +For details on the algorithms the plugin currently supports, see [k-NN Index documentation]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#method-definitions). +{: .note} + +The k-NN plugin builds a native library index of the vectors for each knn-vector field/Lucene segment pair during indexing, which can be used to efficiently find the k-nearest neighbors to a query vector during search. To learn more about Lucene segments, see the [Apache Lucene documentation](https://lucene.apache.org/core/8_9_0/core/org/apache/lucene/codecs/lucene87/package-summary.html#package.description). These native library indexes are loaded into native memory during search and managed by a cache. To learn more about preloading native library indexes into memory, refer to the [warmup API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#warmup-operation). Additionally, you can see which native library indexes are already loaded in memory. To learn more about this, see the [stats API section]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#stats). + +Because the native library indexes are constructed during indexing, it is not possible to apply a filter on an index and then use this search method. All filters are applied on the results produced by the approximate nearest neighbor search. +### Recommendations for engines and cluster node sizing + +Each of the three engines used for approximate k-NN search has its own attributes that make one more sensible to use than the others in a given situation. You can follow the general information below to help determine which engine will best meet your requirements. 
+ +* The faiss engine performs exceptionally well (by orders of magnitude) with hardware that includes a GPU. When cost is not the first concern, this is the recommended engine. +* When only a CPU is available, nmslib is a good choice. In general, it outperforms both faiss and Lucene. +* For relatively small datasets (up to a few million vectors), the Lucene engine demonstrates better latencies and recall. At the same time, the size of the index is smallest compared to the other engines, which allows it to use smaller AWS instances for data nodes.
Also, the Lucene engine uses a pure Java implementation and does not share any of the limitations that engines using platform-native code experience. However, one exception to this is that the maximum number of vector dimensions for the Lucene engine is 1024, compared with 10000 for the other engines. Refer to the sample mapping parameters in the following section to see where this is configured. + +When considering cluster node sizing, a general approach is to first establish an even distribution of the index across the cluster. However, there are other considerations. To help make these choices, you can refer to the OpenSearch managed service guidance in the section [Sizing domains](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/sizing-domains.html). + ## Get started with approximate k-NN -To use the k-NN plugin's approximate search functionality, you must first create a k-NN index with setting `index.knn` to `true`. This setting tells the plugin to create native library indices for the index. +To use the k-NN plugin's approximate search functionality, you must first create a k-NN index with `index.knn` set to `true`. This setting tells the plugin to create native library indexes for the index. Next, you must add one or more fields of the `knn_vector` data type. This example creates an index with two -`knn_vector`'s, one using *faiss*, the other using *nmslib*, fields: +`knn_vector` fields, one using `faiss` and the other using `nmslib`: ```json PUT my-knn-index-1 @@ -69,12 +80,11 @@ PUT my-knn-index-1 } ``` -In the example above, both `knn_vector`s are configured from method definitions. Additionally, `knn_vector`s can also be configured from models. Learn more about it [here]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#knn_vector-data-type)! +In the example above, both `knn_vector` fields are configured from method definitions. Additionally, `knn_vector` fields can also be configured from models.
You can learn more about this in the [knn_vector data type]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index#knn_vector-data-type) section. -The `knn_vector` data type supports a vector of floats that can have a dimension of up to 10,000, as set by the -dimension mapping parameter. +The `knn_vector` data type supports a vector of floats that can have a dimension of up to 10000 for the nmslib and faiss engines, as set by the dimension mapping parameter. The maximum dimension for the Lucene library is 1024. -In OpenSearch, codecs handle the storage and retrieval of indices. The k-NN plugin uses a custom codec to write vector data to native library indices so that the underlying k-NN search library can read it. +In OpenSearch, codecs handle the storage and retrieval of indexes. The k-NN plugin uses a custom codec to write vector data to native library indexes so that the underlying k-NN search library can read it. {: .tip } After you create the index, you can add some data to it: @@ -133,24 +143,24 @@ any `knn_vector` field that has a dimension matching the dimension of the model ```json PUT /train-index { - "settings" : { - "number_of_shards" : 3, - "number_of_replicas" : 0 + "settings": { + "number_of_shards": 3, + "number_of_replicas": 0 }, "mappings": { - "properties": { - "train-field": { - "type": "knn_vector", - "dimension": 4 + "properties": { + "train-field": { + "type": "knn_vector", + "dimension": 4 } - } + } } } ``` -Notice that `index.knn` is not set in the index settings. This ensures that we do not create native library indices for this index. +Notice that `index.knn` is not set in the index settings. This ensures that you do not create native library indexes for this index. 
-Next, let's add some data to it: +You can now add some data to the index: ```json POST _bulk @@ -176,17 +186,17 @@ POST /_plugins/_knn/models/my-model/_train "description": "My models description", "search_size": 500, "method": { - "name":"hnsw", - "engine":"faiss", - "parameters":{ - "encoder":{ - "name":"pq", - "parameters":{ - "code_size": 8, - "m": 8 - } + "name": "hnsw", + "engine": "faiss", + "parameters": { + "encoder": { + "name": "pq", + "parameters": { + "code_size": 8, + "m": 8 } } + } } } ``` @@ -200,24 +210,24 @@ GET /_plugins/_knn/models/my-model?filter_path=state&pretty } ``` -Once the model enters the "created" state, we can create an index that will use this model to initialize it's native -library indices: +Once the model enters the "created" state, you can create an index that will use this model to initialize its native +library indexes: ```json PUT /target-index { - "settings" : { - "number_of_shards" : 3, - "number_of_replicas" : 1, + "settings": { + "number_of_shards": 3, + "number_of_replicas": 1, "index.knn": true }, "mappings": { - "properties": { - "target-field": { - "type": "knn_vector", - "model_id": "my-model" + "properties": { + "target-field": { + "type": "knn_vector", + "model_id": "my-model" } - } + } } } ``` @@ -295,11 +305,11 @@ A space corresponds to the function used to measure the distance between two poi cosinesimil \[ d(\mathbf{x}, \mathbf{y}) = 1 - cos { \theta } = 1 - {\mathbf{x} · \mathbf{y} \over \|\mathbf{x}\| · \|\mathbf{y}\|}\]\[ = 1 - {\sum_{i=1}^n x_i y_i \over \sqrt{\sum_{i=1}^n x_i^2} · \sqrt{\sum_{i=1}^n y_i^2}}\] - where \(\|\mathbf{x}\|\) and \(\|\mathbf{y}\|\) represent normalized vectors. - \[ score = {1 \over 1 + d } \] + where \(\|\mathbf{x}\|\) and \(\|\mathbf{y}\|\) represent the norms of vectors x and y respectively. + nmslib and faiss:\[ score = {1 \over 1 + d } \]
Lucene:\[ score = {2 - d \over 2}\] - innerproduct + innerproduct (not supported for Lucene) \[ d(\mathbf{x}, \mathbf{y}) = - {\mathbf{x} · \mathbf{y}} = - \sum_{i=1}^n x_i y_i \] \[ \text{If} d \ge 0, \] \[score = {1 \over 1 + d }\] \[\text{If} d < 0, score = −d + 1\] diff --git a/_search-plugins/knn/filter-search-knn.md b/_search-plugins/knn/filter-search-knn.md new file mode 100644 index 0000000000..6e02b610f1 --- /dev/null +++ b/_search-plugins/knn/filter-search-knn.md @@ -0,0 +1,649 @@ +--- +layout: default +title: Search with k-NN filters +nav_order: 15 +parent: k-NN +has_children: false +has_math: true +--- + +# Search with k-NN filters +Introduced 2.4 +{: .label .label-purple } + +You can create custom filters using Query domain-specific language (DSL) search options to refine your k-NN searches. You define the filter criteria within the `knn_vector` field's `filter` subsection in your query. You can use any of the OpenSearch query DSL query types as a filter. This includes the common query types: `term`, `range`, `regexp`, and `wildcard`, as well as custom query types. To include or exclude results, use Boolean query clauses. You can also specify a query point with the `knn_vector` type and search for nearest neighbors that match your filter criteria. +To run k-NN queries with a filter, the Lucene search engine and Hierarchical Navigable Small World (HNSW) method are required. + +To learn more about how to use query DSL Boolean query clauses, see [Boolean queries]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/compound/bool). For more details about the `knn_vector` data type definition, see [k-NN Index]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/). +{: .note } + +## How does a k-NN filter work? + +The OpenSearch k-NN plugin version 2.2 introduced support for the Lucene engine in order to process k-NN searches. The Lucene engine provides a search that is based on the HNSW algorithm in order to represent a multi-layered graph.
The OpenSearch k-NN plugin version 2.4 can incorporate filters for searches based on Lucene 9.4. + +After a filter is applied to a set of documents to be searched, the algorithm decides whether to perform pre-filtering for an exact k-NN search or modified post-filtering for an approximate search. The approximate search with filtering ensures that the top `k` closest vectors are returned in the results. + +Lucene also provides the capability to operate its `KnnVectorQuery` across a subset of documents. To learn more about this capability, see the [Apache Lucene Documentation](https://issues.apache.org/jira/browse/LUCENE-10382). + +To learn more about all available k-NN search approaches, including approximate k-NN, exact k-NN with script score, and pre-filtering with painless extensions, see [k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/). + +### Filtered search performance + +Filtering that is tightly integrated with the Lucene HNSW algorithm implementation allows you to apply k-NN searches more efficiently, both in terms of relevancy of search results and performance. Consider, for example, an exact search using post-filtering on a large dataset that returns results slowly and does not ensure the required number of results specified by `k`. +With this new capability, you can create an approximate k-NN search, apply filters, and get the number of results that you need. To learn more about approximate searches, see [Approximate k-NN search]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/). + +The HNSW algorithm decides which type of filtering to apply to a search based on the volume of documents and number of `k` points in the index that you search with a filter. + +![How the algorithm evaluates a doc set]({{site.url}}{{site.baseurl}}/images/hsnw-algorithm.png) + +| Variable | Description | +| -- | -- | +N | The number of documents in the index.
+P | The number of documents in the search set after the filter is applied, where P <= N. +q | The search vector. +k | The maximum number of vectors to return in the response. + +To learn more about k-NN performance tuning, see [Performance tuning]({{site.url}}{{site.baseurl}}/search-plugins/knn/performance-tuning/). + +## Filter approaches by use case + +Depending on the dataset that you are searching, you might choose a different approach to maximize recall or minimize latency. You can create filters that are: + +* Very restrictive: Returns the lowest number of documents (for example, 2.5%). +* Somewhat restrictive: Returns some documents (for example, 38%). +* Not very restrictive: Returns the highest number of documents (for example, 80%). + +The restrictive percentage indicates the percentage of documents the filter returns for any given document set in an index. + +Number of Vectors | Filter Restrictive Percentage | k | Recall | Latency +-- | -- | -- | -- | -- +10M | 2.5 | 100 | Scoring script | Scoring script +10M | 38 | 100 | Lucene filter | Boolean filter +10M | 80 | 100 | Scoring script | Lucene filter +1M | 2.5 | 100 | Lucene filter | Scoring script +1M | 38 | 100 | Lucene filter | Lucene filter / Scoring script +1M | 80 | 100 | Boolean filter | Lucene filter + +In this context, *Scoring script* is essentially a brute force search, whereas a Boolean filter is an approximate k-NN search with post-filtering. + +To learn more about the dynamic searches you can perform with the score script plugin, see [Exact k-NN with scoring script]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-score-script/). + +### Boolean filter with approximate k-NN search + +In a Boolean query that uses post-filtering, you can join a k-NN query with a filter using a `bool` `must` query clause.
+ +#### Example request + +The following k-NN query uses a Boolean query clause to filter results: + +```json +POST /hotels-index/_search +{ + "size": 3, + "query": { + "bool": { + "filter": { + "bool": { + "must": [ + { + "range": { + "rating": { + "gte": 8, + "lte": 10 + } + } + }, + { + "term": { + "parking": "true" + } + } + ] + } + }, + "must": [ + { + "knn": { + "location": { + "vector": [ + 5.0, + 4.0 + ], + "k": 20 + } + } + } + ] + } + } +} +``` +#### Example response + +The Boolean query filter returns the following results in the response: + +```json +{ + "took" : 95, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 5, + "relation" : "eq" + }, + "max_score" : 0.72992706, + "hits" : [ + { + "_index" : "hotels-index", + "_id" : "3", + "_score" : 0.72992706, + "_source" : { + "location" : [ + 4.9, + 3.4 + ], + "parking" : "true", + "rating" : 9 + } + }, + { + "_index" : "hotels-index", + "_id" : "6", + "_score" : 0.3012048, + "_source" : { + "location" : [ + 6.4, + 3.4 + ], + "parking" : "true", + "rating" : 9 + } + }, + { + "_index" : "hotels-index", + "_id" : "5", + "_score" : 0.24154587, + "_source" : { + "location" : [ + 3.3, + 4.5 + ], + "parking" : "true", + "rating" : 8 + } + } + ] + } +} +``` + +### Use case 1: Very restrictive 2.5% filter + +A very restrictive filter returns the lowest number of documents in your dataset. For example, the following filter criteria specifies hotels with feedback ratings less than or equal to 3. This 2.5% filter only returns 1 document: + +```json + "filter": { + "bool": { + "must": [ + { + "range": { + "rating": { + "lte": 3 + } + } + } + ] + } + } +``` + +### Use case 2: Somewhat restrictive 38% filter + +A somewhat restrictive filter returns 38% of the documents in the data set that you search. 
For example, the following filter criteria specifies hotels with parking and feedback ratings less than or equal to 8 and returns 5 documents: + +```json + "filter": { + "bool": { + "must": [ + { + "range": { + "rating": { + "lte": 8 + } + } + }, + { + "term": { + "parking": "true" + } + } + ] + } + } +``` + +### Use case 3: Not very restrictive 80% filter + +A filter that is not very restrictive will return 80% of the documents that you search. For example, the following filter criteria specifies hotels with feedback ratings greater than or equal to 5 and returns 10 documents: + +```json + "filter": { + "bool": { + "must": [ + { + "range": { + "rating": { + "gte": 5 + } + } + } + ] + } + } +``` + +## Overview: How to use filters in a k-NN search + +You can search with a filter by following these three steps: +1. Create an index and specify the requirements for the Lucene engine and HNSW requirements in the mapping. +1. Add your data to the index. +1. Search the index and specify these three items in your query: +* One or more filters defined by query DSL +* A vector reference point defined by the `vector` field +* The number of matches you want returned with the `k` field + +You can use a range query to specify hotel feedback ratings and a term query to require that parking is available. The criteria is processed with Boolean clauses to indicate whether or not the document contains the criteria. + +Consider a dataset that contains 12 documents, a search reference point, and documents that meet two filter criteria. + +![Graph of documents with filter criteria]({{site.url}}{{site.baseurl}}/images/knn-two-filters.png) + +## Step 1: Create a new index with a Lucene mapping + +Before you can run a k-NN search with a filter, you need to create an index, specify the Lucene engine in a mapping, and add data to the index. + +You need to add a `location` field to represent the location and specify it as the `knn_vector` type. The most basic vector can be two-dimensional. 
For example: + +``` + "type": "knn_vector", + "dimension": 2, +``` + +### Requirement: Lucene engine with HNSW method + +Make sure to specify the "hnsw" method and the "lucene" engine in the `knn_vector` field description, as follows: + +```json +"my_field": { + "type": "knn_vector", + "dimension": 2, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "lucene" + } + } +``` + +#### Example request + +The following request creates a new index called "hotels-index": + +```json +PUT /hotels-index +{ + "settings": { + "index": { + "knn": true, + "knn.algo_param.ef_search": 100, + "number_of_shards": 1, + "number_of_replicas": 0 + } + }, + "mappings": { + "properties": { + "location": { + "type": "knn_vector", + "dimension": 2, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "lucene", + "parameters": { + "ef_construction": 100, + "m": 16 + } + } + } + } + } +} +``` +#### Example response + +Upon success, you should receive a "200-OK" status with the following response: + +```json +{ + "acknowledged" : true, + "shards_acknowledged" : true, + "index" : "hotels-index" +} +``` + +## Step 2: Add data to your index + +Next, add data to your index with a POST HTTP request. Make sure that the search criteria are defined in the body of the request.
+ +#### Example request + +The following request adds 12 hotel documents that contain criteria such as feedback ratings and whether or not parking is available: + +```json +POST /_bulk +{ "index": { "_index": "hotels-index", "_id": "1" } } +{ "location": [5.2, 4.4], "parking" : "true", "rating" : 5 } +{ "index": { "_index": "hotels-index", "_id": "2" } } +{ "location": [5.2, 3.9], "parking" : "false", "rating" : 4 } +{ "index": { "_index": "hotels-index", "_id": "3" } } +{ "location": [4.9, 3.4], "parking" : "true", "rating" : 9 } +{ "index": { "_index": "hotels-index", "_id": "4" } } +{ "location": [4.2, 4.6], "parking" : "false", "rating" : 6} +{ "index": { "_index": "hotels-index", "_id": "5" } } +{ "location": [3.3, 4.5], "parking" : "true", "rating" : 8 } +{ "index": { "_index": "hotels-index", "_id": "6" } } +{ "location": [6.4, 3.4], "parking" : "true", "rating" : 9 } +{ "index": { "_index": "hotels-index", "_id": "7" } } +{ "location": [4.2, 6.2], "parking" : "true", "rating" : 5 } +{ "index": { "_index": "hotels-index", "_id": "8" } } +{ "location": [2.4, 4.0], "parking" : "true", "rating" : 8 } +{ "index": { "_index": "hotels-index", "_id": "9" } } +{ "location": [1.4, 3.2], "parking" : "false", "rating" : 5 } +{ "index": { "_index": "hotels-index", "_id": "10" } } +{ "location": [7.0, 9.9], "parking" : "true", "rating" : 9 } +{ "index": { "_index": "hotels-index", "_id": "11" } } +{ "location": [3.0, 2.3], "parking" : "false", "rating" : 6 } +{ "index": { "_index": "hotels-index", "_id": "12" } } +{ "location": [5.0, 1.0], "parking" : "true", "rating" : 3 } +``` + +#### Example response + +Upon success, you should receive a "200-OK" status with entries for each document ID added to the index. 
The following response is truncated to only show one document: + +```json +{ + "took" : 140, + "errors" : false, + "items" : [ + { + "index" : { + "_index" : "hotels-index", + "_id" : "1", + "_version" : 2, + "result" : "updated", + "_shards" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + }, + "_seq_no" : 12, + "_primary_term" : 3, + "status" : 200 + } + } + ] +} + +``` + +## Step 3: Search your data with a filter + +Now you can create a k-NN search that specifies filters by using query DSL Boolean clauses. You need to include your reference point to search for nearest neighbors. Provide an x-y coordinate for the point within the `vector` field, such as `"vector": [ 5.0, 4.0]`. + + To learn more about how to specify ranges with query DSL, see [Range query]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/term/#range). +{: .note } + +#### Example request + +The following request creates a k-NN query that only returns the top hotels rated between 8 and 10 and that provide parking. 
The filter criteria to indicate the range for the feedback ratings uses a `range` query and a `term` query clause to indicate "parking": + +```json +POST /hotels-index/_search +{ + "size": 3, + "query": { + "knn": { + "location": { + "vector": [ + 5.0, + 4.0 + ], + "k": 3, + "filter": { + "bool": { + "must": [ + { + "range": { + "rating": { + "gte": 8, + "lte": 10 + } + } + }, + { + "term": { + "parking": "true" + } + } + ] + } + } + } + } + } +} +``` + + +#### Sample Response + +The following response indicates that only three hotels met the filter criteria: + + +```json +{ + "took" : 47, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 0.72992706, + "hits" : [ + { + "_index" : "hotels-index", + "_id" : "3", + "_score" : 0.72992706, + "_source" : { + "location" : [ + 4.9, + 3.4 + ], + "parking" : "true", + "rating" : 9 + } + }, + { + "_index" : "hotels-index", + "_id" : "6", + "_score" : 0.3012048, + "_source" : { + "location" : [ + 6.4, + 3.4 + ], + "parking" : "true", + "rating" : 9 + } + }, + { + "_index" : "hotels-index", + "_id" : "5", + "_score" : 0.24154587, + "_source" : { + "location" : [ + 3.3, + 4.5 + ], + "parking" : "true", + "rating" : 8 + } + } + ] + } +} + +``` + +## Additional complex filter query + +Depending on how restrictive you want your filter to be, you can add multiple query types to a single request, such as `term`, `wildcard`, `regexp`, or `range`. You can then filter out the search results with the Boolean clauses `must`, `should`, and `must_not`. + +#### Example request + +The following request returns hotels that provide parking. This request illustrates multiple alternative mechanisms to obtain the parking filter criteria. 
It uses a regular expression for the value `true`, a term query for the key-value pair `"parking":"true"`, a wildcard for the characters that spell "true", and the `must_not` clause to eliminate hotels with "parking" set to `false`: + +```json +POST /hotels-index/_search +{ + "size": 3, + "query": { + "knn": { + "location": { + "vector": [ + 5.0, + 4.0 + ], + "k": 3, + "filter": { + "bool": { + "must": { + "range": { + "rating": { + "gte": 1, + "lte": 6 + } + } + }, + "should": [ + { + "term": { + "parking": "true" + } + }, + { + "wildcard": { + "parking": { + "value": "t*e" + } + } + }, + { + "regexp": { + "parking": "[a-zA-Z]rue" + } + } + ], + "must_not": [ + { + "term": { + "parking": "false" + } + } + ], + "minimum_should_match": 1 + } + } + } + } + } +} +``` +#### Example response + +The following response indicates a few results for the search with filters: + +```json +{ + "took" : 94, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 3, + "relation" : "eq" + }, + "max_score" : 0.8333333, + "hits" : [ + { + "_index" : "hotels-index", + "_id" : "1", + "_score" : 0.8333333, + "_source" : { + "location" : [ + 5.2, + 4.4 + ], + "parking" : "true", + "rating" : 5 + } + }, + { + "_index" : "hotels-index", + "_id" : "7", + "_score" : 0.154321, + "_source" : { + "location" : [ + 4.2, + 6.2 + ], + "parking" : "true", + "rating" : 5 + } + }, + { + "_index" : "hotels-index", + "_id" : "12", + "_score" : 0.1, + "_source" : { + "location" : [ + 5.0, + 1.0 + ], + "parking" : "true", + "rating" : 3 + } + } + ] + } +} +``` diff --git a/_search-plugins/knn/index.md b/_search-plugins/knn/index.md index d8e5c1c3f9..d360507105 100644 --- a/_search-plugins/knn/index.md +++ b/_search-plugins/knn/index.md @@ -22,7 +22,7 @@ This plugin supports three different methods for obtaining the k-nearest neighbo Approximate k-NN is the best choice for searches over large indices (i.e. 
hundreds of thousands of vectors or more) that require low latency. You should not use approximate k-NN if you want to apply a filter on the index before the k-NN search, which greatly reduces the number of vectors to be searched. In this case, you should use either the script scoring method or painless extensions. - For more details about this method, see [Approximate k-NN search]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/). + For more details about this method, including recommendations for which engine to use, see [Approximate k-NN search]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/). 2. **Script Score k-NN** diff --git a/_search-plugins/knn/jni-libraries.md b/_search-plugins/knn/jni-libraries.md index 052a789510..25d1556908 100644 --- a/_search-plugins/knn/jni-libraries.md +++ b/_search-plugins/knn/jni-libraries.md @@ -1,14 +1,17 @@ --- layout: default title: JNI libraries -nav_order: 6 +nav_order: 35 parent: k-NN has_children: false --- # JNI libraries -To integrate [*nmslib*'s](https://github.com/nmslib/nmslib/) and [*faiss*'s](https://github.com/facebookresearch/faiss/) Approximate k-NN functionality (implemented in C++) into the k-NN plugin (implemented in Java), we created a Java Native Interface, which lets the k-NN plugin make calls to the native libraries. We create 3 libraries: `libopensearchknn_nmslib`, the JNI library that interfaces with nmslib, `libopensearchknn_faiss`, the JNI library that interfaces with faiss, and `libopensearchknn_common`, a library containing common shared functionality between native libraries. +To integrate [nmslib](https://github.com/nmslib/nmslib/) and [faiss](https://github.com/facebookresearch/faiss/) approximate k-NN functionality (implemented in C++) into the k-NN plugin (implemented in Java), we created a Java Native Interface, which lets the k-NN plugin make calls to the native libraries. 
The interface includes three libraries: `libopensearchknn_nmslib`, the JNI library that interfaces with nmslib, `libopensearchknn_faiss`, the JNI library that interfaces with faiss, and `libopensearchknn_common`, a library containing common shared functionality between native libraries. + +The Lucene library is not implemented using a native library. +{: .note} The libraries `libopensearchknn_faiss` and `libopensearchknn_nmslib` are lazily loaded when they are first called in the plugin. This means that if you are only planning on using one of the libraries, the plugin never loads the other library. diff --git a/_search-plugins/knn/knn-index.md b/_search-plugins/knn/knn-index.md index 59460d8347..90f08f415a 100644 --- a/_search-plugins/knn/knn-index.md +++ b/_search-plugins/knn/knn-index.md @@ -1,7 +1,7 @@ --- layout: default title: k-NN Index -nav_order: 1 +nav_order: 5 parent: k-NN has_children: false --- @@ -53,54 +53,56 @@ However, if you intend to just use painless scripting or a k-NN score script, yo A method definition refers to the underlying configuration of the Approximate k-NN algorithm you want to use. Method definitions are used to either create a `knn_vector` field (when the method does not require training) or [create a model during training]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#train-model) that can then be used to [create a `knn_vector` field]({{site.url}}{{site.baseurl}}/search-plugins/knn/approximate-knn/#building-a-k-nn-index-from-a-model). A method definition will always contain the name of the method, the space_type the method is built for, the engine -(the native library) to use, and a map of parameters. +(the library) to use, and a map of parameters. Mapping Parameter | Required | Default | Updatable | Description :--- | :--- | :--- | :--- | :--- `name` | true | n/a | false | The identifier for the nearest neighbor method. -`space_type` | false | "l2" | false | The vector space used to calculate the distance between vectors. 
-`engine` | false | "nmslib" | false | The approximate k-NN library to use for indexing and search. Either "faiss" or "nmslib". +`space_type` | false | l2 | false | The vector space used to calculate the distance between vectors. +`engine` | false | nmslib | false | The approximate k-NN library to use for indexing and search. The available libraries are faiss, nmslib, and Lucene. `parameters` | false | null | false | The parameters used for the nearest neighbor method. ### Supported nmslib methods Method Name | Requires Training? | Supported Spaces | Description :--- | :--- | :--- | :--- -`hnsw` | false | "l2", "innerproduct", "cosinesimil", "l1", "linf" | Hierarchical proximity graph approach to Approximate k-NN search. For more details on the algorithm, [checkout this paper](https://arxiv.org/abs/1603.09320)! +`hnsw` | false | l2, innerproduct, cosinesimil, l1, linf | Hierarchical proximity graph approach to Approximate k-NN search. For more details on the algorithm, see this [abstract](https://arxiv.org/abs/1603.09320). -#### HNSW Parameters +#### HNSW parameters -Paramater Name | Required | Default | Updatable | Description +Parameter Name | Required | Default | Updatable | Description :--- | :--- | :--- | :--- | :--- -`ef_construction` | false | 512 | false | The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph, but slower indexing speed. -`m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2-100. +`ef_construction` | false | 512 | false | The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph but slower indexing speed. +`m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. 
Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2 and 100. -**Note** --- For *nmslib*, *ef_search* is set in the [index settings](#index-settings). +For nmslib, *ef_search* is set in the [index settings](#index-settings). +{: .note} ### Supported faiss methods Method Name | Requires Training? | Supported Spaces | Description :--- | :--- | :--- | :--- -`hnsw` | false | "l2", "innerproduct"* | Hierarchical proximity graph approach to Approximate k-NN search. -`ivf` | true | "l2", "innerproduct" | Bucketing approach where vectors are assigned different buckets based on clustering and, during search, only a subset of the buckets are searched. +`hnsw` | false | l2, innerproduct | Hierarchical proximity graph approach to Approximate k-NN search. +`ivf` | true | l2, innerproduct | Bucketing approach where vectors are assigned different buckets based on clustering and, during search, only a subset of the buckets is searched. -**Note** --- For *hnsw*, "innerproduct" is not available when PQ is used. +For hnsw, "innerproduct" is not available when PQ is used. +{: .note} -#### HNSW Parameters +#### HNSW parameters -Paramater Name | Required | Default | Updatable | Description +Parameter Name | Required | Default | Updatable | Description :--- | :--- | :--- | :--- | :--- `ef_search` | false | 512 | false | The size of the dynamic list used during k-NN searches. Higher values lead to more accurate but slower searches. -`ef_construction` | false | 512 | false | The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph, but slower indexing speed. -`m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2-100. +`ef_construction` | false | 512 | false | The size of the dynamic list used during k-NN graph creation. 
Higher values lead to a more accurate graph but slower indexing speed. +`m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2 and 100. `encoder` | false | flat | false | Encoder definition for encoding vectors. Encoders can reduce the memory footprint of your index, at the expense of search accuracy. -#### IVF Parameters +#### IVF parameters -Paramater Name | Required | Default | Updatable | Description +Parameter Name | Required | Default | Updatable | Description :--- | :--- | :--- | :--- | :--- -`nlists` | false | 4 | false | Number of buckets to partition vectors into. Higher values may lead to more accurate searches, at the expense of memory and training latency. For more information about choosing the right value, refer to [*faiss*'s documentation](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index). -`nprobes` | false | 1 | false | Number of buckets to search over during query. Higher values lead to more accurate but slower searches. +`nlist` | false | 4 | false | Number of buckets to partition vectors into. Higher values may lead to more accurate searches at the expense of memory and training latency. For more information about choosing the right value, refer to [Guidelines to choose an index](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index). +`nprobes` | false | 1 | false | Number of buckets to search during query. Higher values lead to more accurate but slower searches. `encoder` | false | flat | false | Encoder definition for encoding vectors. Encoders can reduce the memory footprint of your index, at the expense of search accuracy. For more information about setting these parameters, please refer to [*faiss*'s documentation](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes). 
@@ -109,13 +111,45 @@ For more information about setting these parameters, please refer to [*faiss*'s The IVF algorithm requires a training step. To create an index that uses IVF, you need to train a model with the [Train API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api#train-model), passing the IVF method definition. IVF requires that, at a minimum, there should be `nlist` training -data points, but it is [recommended to use more](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#how-big-is-the-dataset). -Training data can either the same data that is going to be ingested or a separate set of data. +data points, but it is [recommended that you use more](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#how-big-is-the-dataset). +Training data can be composed of either the same data that is going to be ingested or a separate dataset. + +### Supported Lucene methods + +Method Name | Requires Training? | Supported Spaces | Description +:--- | :--- | :--- | :--- +`hnsw` | false | l2, cosinesimil | Hierarchical proximity graph approach to Approximate k-NN search. + +#### HNSW parameters + +Parameter Name | Required | Default | Updatable | Description +:--- | :--- | :--- | :--- | :--- +`ef_construction` | false | 512 | false | The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph but slower indexing speed.
The Lucene engine uses the proprietary term "beam_width" to describe this function, which corresponds directly to "ef_construction". To be consistent throughout OpenSearch documentation, we retain the term "ef_construction" to label this parameter. +`m` | false | 16 | false | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2 and 100.
The Lucene engine uses the proprietary term "max_connections" to describe this function, which corresponds directly to "m". To be consistent throughout OpenSearch documentation, we retain the term "m" to label this parameter. + +Lucene HNSW implementation ignores `ef_search` and dynamically sets it to the value of "k" in the search request. Therefore, there is no need to make settings for `ef_search` when using the Lucene engine. +{: .note} + +```json +{ + "type": "knn_vector", + "dimension": 100, + "method": { + "name":"hnsw", + "engine":"lucene", + "space_type": "l2", + "parameters":{ + "m":2048, + "ef_construction": 245 + } + } +} +``` ### Supported faiss encoders -You can use encoders to reduce the memory footprint of a k-NN index at the expense of search accuracy. *faiss* has -several encoder types, but currently, the plugin only supports *flat* and *pq* encoding. +You can use encoders to reduce the memory footprint of a k-NN index at the expense of search accuracy. faiss has +several encoder types, but the plugin currently only supports *flat* and *pq* encoding. An example method definition that specifies an encoder may look something like this: @@ -140,7 +174,7 @@ Encoder Name | Requires Training? | Description `flat` | false | Encode vectors as floating point arrays. This encoding does not reduce memory footprint. `pq` | true | Short for product quantization, it is a lossy compression technique that encodes a vector into a fixed size of bytes using clustering, with the goal of minimizing the drop in k-NN search accuracy. From a high level, vectors are broken up into `m` subvectors, and then each subvector is represented by a `code_size` code obtained from a code book produced during training. For more details on product quantization, here is a [great blog post](https://medium.com/dotstar/understanding-faiss-part-2-79d90b1e5388)! 
-#### PQ Parameters +#### PQ parameters Paramater Name | Required | Default | Updatable | Description :--- | :--- | :--- | :--- | :--- @@ -160,7 +194,7 @@ If memory is a concern, consider adding a PQ encoder to your HNSW or IVF index. ### Memory Estimation In a typical OpenSearch cluster, a certain portion of RAM is set aside for the JVM heap. The k-NN plugin allocates -native library indices to a portion of the remaining RAM. This portion's size is determined by +native library indexes to a portion of the remaining RAM. This portion's size is determined by the `circuit_breaker_limit` cluster setting. By default, the limit is set at 50%. Having a replica doubles the total number of vectors. @@ -196,7 +230,7 @@ At the moment, several parameters defined in the settings are in the deprecation Setting | Default | Updateable | Description :--- | :--- | :--- | :--- `index.knn` | false | false | Whether the index should build native library indices for the `knn_vector` fields. If set to false, the `knn_vector` fields will be stored in doc values, but Approximate k-NN search functionality will be disabled. -`index.knn.algo_param.ef_search` | 512 | true | The size of the dynamic list used during k-NN searches. Higher values lead to more accurate but slower searches. Only available for *nmslib*. -`index.knn.algo_param.ef_construction` | 512 | false | (Deprecated in 1.0.0. Use the mapping parameters to set this value instead.) Only available for *nmslib*. Refer to mapping definition. -`index.knn.algo_param.m` | 16 | false | (Deprecated in 1.0.0. Use the mapping parameters to set this value instead.) Only available for *nmslib*. Refer to mapping definition. -`index.knn.space_type` | "l2" | false | (Deprecated in 1.0.0. Use the mapping parameters to set this value instead.) Only available for *nmslib*. Refer to mapping definition. +`index.knn.algo_param.ef_search` | 512 | true | The size of the dynamic list used during k-NN searches. 
Higher values lead to more accurate but slower searches. Only available for nmslib. +`index.knn.algo_param.ef_construction` | 512 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. +`index.knn.algo_param.m` | 16 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. +`index.knn.space_type` | l2 | false | Deprecated in 1.0.0. Use the [mapping parameters](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions) to set this value instead. diff --git a/_search-plugins/knn/knn-score-script.md b/_search-plugins/knn/knn-score-script.md index 1f77d8ff4f..5a87cdf7f7 100644 --- a/_search-plugins/knn/knn-score-script.md +++ b/_search-plugins/knn/knn-score-script.md @@ -1,7 +1,7 @@ --- layout: default title: Exact k-NN with scoring script -nav_order: 3 +nav_order: 20 parent: k-NN has_children: false has_math: true diff --git a/_search-plugins/knn/painless-functions.md b/_search-plugins/knn/painless-functions.md index 593fddbf22..223c192eb7 100644 --- a/_search-plugins/knn/painless-functions.md +++ b/_search-plugins/knn/painless-functions.md @@ -1,7 +1,7 @@ --- layout: default title: k-NN Painless extensions -nav_order: 4 +nav_order: 25 parent: k-NN has_children: false has_math: true diff --git a/_search-plugins/knn/performance-tuning.md b/_search-plugins/knn/performance-tuning.md index f6e28165c2..d179d99685 100644 --- a/_search-plugins/knn/performance-tuning.md +++ b/_search-plugins/knn/performance-tuning.md @@ -2,7 +2,7 @@ layout: default title: Performance tuning parent: k-NN -nav_order: 8 +nav_order: 45 --- # Performance tuning diff --git a/_search-plugins/knn/settings.md b/_search-plugins/knn/settings.md index bbcb37c6e9..cdd2e86dd6 100644 --- a/_search-plugins/knn/settings.md +++ 
b/_search-plugins/knn/settings.md @@ -2,7 +2,7 @@ layout: default title: Settings parent: k-NN -nav_order: 7 +nav_order: 40 --- # k-NN settings diff --git a/_search-plugins/neural-search.md b/_search-plugins/neural-search.md new file mode 100644 index 0000000000..6cf199c52a --- /dev/null +++ b/_search-plugins/neural-search.md @@ -0,0 +1,204 @@ +--- +layout: default +title: Neural Search plugin +nav_order: 200 +has_children: false +has_toc: false +redirect_from: + - /neural-search-plugin/index/ +--- + +# Neural Search plugin + +The Neural Search plugin is an experimental feature. For updates on the progress of the Neural Search plugin, or if you want to leave feedback that could help improve the feature, join the discussion in the [Neural Search forum](https://forum.opensearch.org/t/feedback-neural-search-plugin-experimental-release/11501). +{: .warning} + +The OpenSearch Neural Search plugin enables the integration of machine learning (ML) language models into your search workloads. During ingestion and search, the Neural Search plugin transforms text into vectors. Then, Neural Search uses the transformed vectors in vector-based search. + +The Neural Search plugin comes bundled with OpenSearch. For more information, see [Managing plugins]({{site.url}}{{site.baseurl}}/opensearch/install/plugins#managing-plugins). + +## Ingest data with Neural Search + +In order to ingest vectorized documents, you need to create a Neural Search _pipeline_. A pipeline consists of a series of processors that manipulate documents during ingestion, allowing the documents to be vectorized. The following API operation creates a Neural Search pipeline: + +``` +PUT _ingest/pipeline/<pipeline_name> +``` + +In the pipeline request body, the `text_embedding` processor, the only processor supported by Neural Search, converts a document's text to vector embeddings. The `text_embedding` processor uses a `field_map` to determine the fields from which to generate vector embeddings and the fields in which to store the embeddings.
+ +### Path parameter + +Use `pipeline_name` to create a name for your Neural Search pipeline. + +### Request fields + +Field | Data type | Description +:--- | :--- | :--- +description | string | A description of the processor. +model_id | string | The ID of the model that will be used in the embedding interface. The model must be indexed in OpenSearch before it can be used in Neural Search. For more information, see [Model Serving Framework]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-serving-framework/). +input_field_name | string | The field name used to cache text for text embeddings. +output_field_name | string | The name of the field in which output text is stored. + +### Example request + +Use the following example request to create a pipeline: + +``` +PUT _ingest/pipeline/nlp-pipeline +{ + "description": "An example neural search pipeline", + "processors" : [ + { + "text_embedding": { + "model_id": "bxoDJ7IHGM14UqatWc_2j", + "field_map": { + "passage_text": "passage_embedding" + } + } + } + ] +} +``` + +### Example response + +OpenSearch responds with an acknowledgment of the pipeline's creation. + +```json +PUT _ingest/pipeline/nlp-pipeline +{ + "acknowledged" : true +} +``` + +## Create an index for ingestion + +In order to use the text embedding processor defined in your pipelines, create an index with mapping data that aligns with the maps specified in your pipeline. For example, the `output_fields` defined in the `field_map` field of your processor request must map to the k-NN vector fields with a dimension that matches the model. Similarly, the `text_fields` defined in your processor should map to the `text_fields` in your index. + +### Example request + +The following example request creates an index that attaches to a Neural Search pipeline. Because the index maps to k-NN vector fields, the index setting field `index.knn` is set to `true`.
Furthermore, `mapping` settings use [k-NN method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#method-definitions) to match the maps defined in the Neural Search pipeline. + +```json +PUT /my-nlp-index-1 +{ + "settings": { + "index.knn": true, + "default_pipeline": "<pipeline_name>" + }, + "mappings": { + "properties": { + "passage_embedding": { + "type": "knn_vector", + "dimension": int, + "method": { + "name": "string", + "space_type": "string", + "engine": "string", + "parameters": json_object + } + }, + "passage_text": { + "type": "text" + } + } + } +} +``` + +### Example response + +OpenSearch responds with information about your new index: + +```json +{ + "acknowledged" : true, + "shards_acknowledged" : true, + "index" : "my-nlp-index-1" +} +``` + +## Ingest documents into Neural Search + +Document ingestion is managed by OpenSearch's [Ingest API]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/), similarly to other OpenSearch indexes. For example, you can ingest a document that contains the `passage_text: "Hello world"` with a simple POST method: + +```json +POST /my-nlp-index-1/_doc +{ + "passage_text": "Hello world" +} +``` + +With the `text_embedding` processor in place through a Neural Search pipeline, the example indexes "Hello world" as a `text_field` and converts "Hello world" into an associated k-NN vector field. + +## Search a neural index + +If you want to use a language model to convert a text query into a k-NN vector query, use the `neural` query fields in your query. The neural query request fields can be used in both the [k-NN plugin API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api/#search-model) and [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/). Furthermore, you can use a [k-NN search filter]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/) to refine your neural search query.
+ + + +### Neural request fields + +Include the following request fields under the `neural` field in your query: + +Field | Data type | Description +:--- | :--- | :--- +vector_field | string | The vector field against which to run a search query. +query_text | string | The query text from which to produce queries. +model_id | string | The ID of the model that will be used in the embedding interface. The model must be indexed in OpenSearch before it can be used in Neural Search. +k | integer | The number of results the k-NN search returns. + + +### Example request + +The following example request uses a search query that returns vectors for the "Hello World" query text: + + +```json +GET my_index/_search +{ + "query": { + "bool" : { + "filter": { + "range": { + "distance": { "lte" : 20 } + } + }, + "should" : [ + { + "script_score": { + "query": { + "neural": { + "passage_vector": { + "query_text": "Hello world", + "model_id": "xzy76xswsd", + "k": 100 + } + } + }, + "script": { + "source": "_score * 1.5" + } + } + } + , + { + "script_score": { + "query": { + "match": { "passage_text": "Hello world" } + }, + "script": { + "source": "_score * 1.7" + } + } + } + ] + } + } +} +``` + + + + diff --git a/_search-plugins/point-in-time-api.md b/_search-plugins/point-in-time-api.md new file mode 100644 index 0000000000..69824f1671 --- /dev/null +++ b/_search-plugins/point-in-time-api.md @@ -0,0 +1,272 @@ +--- +layout: default +title: Point in Time API +nav_order: 59 +has_children: false +parent: Point in Time +redirect_from: + - /opensearch/point-in-time-api/ +--- + +# Point in Time API + +Use the [Point in Time (PIT)]({{site.url}}{{site.baseurl}}/opensearch/point-in-time/) API to manage PITs. + +--- + +#### Table of contents +- TOC +{:toc} + +--- + +## Create a PIT +Introduced 2.4 +{: .label .label-purple } + +Creates a PIT. The `keep_alive` query parameter is required; it specifies how long to keep a PIT. 
+ +### Path and HTTP methods + +```json +POST //_search/point_in_time?keep_alive=1h&routing=&expand_wildcards=&preference= +``` + +### Path parameters + +Parameter | Data type | Description +:--- | :--- | :--- +target_indexes | String | The name(s) of the target index(es) for the PIT. May contain a comma-separated list or a wildcard index pattern. + +### Query parameters + +Parameter | Data type | Description +:--- | :--- | :--- +keep_alive | Time | The amount of time to keep the PIT. Every time you access a PIT by using the Search API, the PIT lifetime is extended by the amount of time equal to the `keep_alive` parameter. Required. +preference | String | The node or the shard used to perform the search. Optional. Default is random. +routing | String | Specifies to route search requests to a specific shard. Optional. Default is the document's `_id`. +expand_wildcards | String | The type of index that can match the wildcard pattern. Supports comma-separated values. Valid values are the following:
- `all`: Match any index or data stream, including hidden ones.
- `open`: Match open, non-hidden indexes or non-hidden data streams.
- `closed`: Match closed, non-hidden indexes or non-hidden data streams.
- `hidden`: Match hidden indexes or data streams. Must be combined with `open`, `closed`, or both `open` and `closed`.
- `none`: No wildcard patterns are accepted.
Optional. Default is `open`. +allow_partial_pit_creation | Boolean | Specifies whether to create a PIT with partial failures. Optional. Default is `true`. + +#### Example request + +```json +POST /my-index-1/_search/point_in_time?keep_alive=100m +``` + +#### Example response + +```json +{ + "pit_id": "o463QQEPbXktaW5kZXgtMDAwMDAxFnNOWU43ckt3U3IyaFVpbGE1UWEtMncAFjFyeXBsRGJmVFM2RTB6eVg1aVVqQncAAAAAAAAAAAIWcDVrM3ZIX0pRNS1XejE5YXRPRFhzUQEWc05ZTjdyS3dTcjJoVWlsYTVRYS0ydwAA", + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "creation_time": 1658146050064 +} +``` + +### Response fields + +Field | Data type | Description +:--- | :--- | :--- +pit_id | [Base64 encoded binary]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/binary) | The PIT ID. +creation_time | long | The time the PIT was created, in milliseconds since the epoch. + +## Extend a PIT time + +You can extend a PIT time by providing a `keep_alive` parameter in the `pit` object when you perform a search: + +```json +GET /_search +{ + "size": 10000, + "query": { + "match" : { + "user.id" : "elkbee" + } + }, + "pit": { + "id": "46ToAwMDaWR5BXV1aWQyKwZub2RlXzMAAAAAAAAAACoBYwADaWR4BXV1aWQxAgZub2RlXzEAAAAAAAAAAAEBYQADaWR5BXV1aWQyKgZub2RlXzIAAAAAAAAAAAwBYgACBXV1aWQyAAAFdXVpZDEAAQltYXRjaF9hbGw_gAAAAA==", + "keep_alive": "100m" + }, + "sort": [ + {"@timestamp": {"order": "asc", "format": "strict_date_optional_time_nanos"}}, + {"_shard_doc": "desc"} + ], + "search_after": [ + "2021-05-20T05:30:04.832Z" + ] +} +``` + +The `keep_alive` parameter in a search request is optional. It specifies the amount by which to extend the time to keep a PIT. +{: .note} + +## List all PITs +Introduced 2.4 +{: .label .label-purple } + +Returns all PITs in the OpenSearch cluster. + +### Cross-cluster behavior + +The List All PITs API returns only local PITs or mixed PITs (PITs created in both local and remote clusters). It does not return fully remote PITs. 
+ +#### Example request + +```json +GET /_search/point_in_time/_all +``` + +#### Example response + +```json +{ + "pits": [ + { + "pit_id": "o463QQEPbXktaW5kZXgtMDAwMDAxFnNOWU43ckt3U3IyaFVpbGE1UWEtMncAFjFyeXBsRGJmVFM2RTB6eVg1aVVqQncAAAAAAAAAAAEWcDVrM3ZIX0pRNS1XejE5YXRPRFhzUQEWc05ZTjdyS3dTcjJoVWlsYTVRYS0ydwAA", + "creation_time": 1658146048666, + "keep_alive": 6000000 + }, + { + "pit_id": "o463QQEPbXktaW5kZXgtMDAwMDAxFnNOWU43ckt3U3IyaFVpbGE1UWEtMncAFjFyeXBsRGJmVFM2RTB6eVg1aVVqQncAAAAAAAAAAAIWcDVrM3ZIX0pRNS1XejE5YXRPRFhzUQEWc05ZTjdyS3dTcjJoVWlsYTVRYS0ydwAA", + "creation_time": 1658146050064, + "keep_alive": 6000000 + } + ] +} +``` + +### Response fields + +Field | Data type | Description +:--- | :--- | :--- +pits | Array of JSON objects | The list of all PITs. + +Each PIT object contains the following fields. + +Field | Data type | Description +:--- | :--- | :--- +pit_id | [Base64 encoded binary]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/binary) | The PIT ID. +creation_time | long | The time the PIT was created, in milliseconds since the epoch. +keep_alive | long | The amount of time to keep the PIT, in milliseconds. + +## Delete PITs +Introduced 2.4 +{: .label .label-purple } + +Deletes one, several, or all PITs. PITs are automatically deleted when the `keep_alive` time period elapses. However, to deallocate resources, you can delete a PIT using the Delete PIT API. The Delete PIT API supports deleting a list of PITs by ID or deleting all PITs at once. + +### Cross-cluster behavior + +The Delete PITs by ID API fully supports deleting cross-cluster PITs. + +The Delete All PITs API deletes only local PITs or mixed PITs (PITs created in both local and remote clusters). It does not delete fully remote PITs. + +#### Sample Request: Delete all PITs + +```json +DELETE /_search/point_in_time/_all +``` + +If you want to delete one or several PITs, specify their PIT IDs in the request body. 
+ +### Request fields + +Field | Data type | Description +:--- | :--- | :--- +pit_id | [Base64 encoded binary]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/binary) or an array of binaries | The PIT IDs of the PITs to be deleted. Required. + +#### Example request: Delete PITs by ID + +```json +DELETE /_search/point_in_time + +{ + "pit_id": [ + "o463QQEPbXktaW5kZXgtMDAwMDAxFkhGN09fMVlPUkVPLXh6MUExZ1hpaEEAFjBGbmVEZHdGU1EtaFhhUFc4ZkR5cWcAAAAAAAAAAAEWaXBPNVJtZEhTZDZXTWFFR05waXdWZwEWSEY3T18xWU9SRU8teHoxQTFnWGloQQAA", + "o463QQEPbXktaW5kZXgtMDAwMDAxFkhGN09fMVlPUkVPLXh6MUExZ1hpaEEAFjBGbmVEZHdGU1EtaFhhUFc4ZkR5cWcAAAAAAAAAAAIWaXBPNVJtZEhTZDZXTWFFR05waXdWZwEWSEY3T18xWU9SRU8teHoxQTFnWGloQQAA" + ] +} +``` + +#### Example response + +For each PIT, the response contains a JSON object with a PIT ID and a `successful` field that specifies whether the deletion was successful. Partial failures are treated as failures. + +```json +{ + "pits": [ + { + "successful": true, + "pit_id": "o463QQEPbXktaW5kZXgtMDAwMDAxFkhGN09fMVlPUkVPLXh6MUExZ1hpaEEAFjBGbmVEZHdGU1EtaFhhUFc4ZkR5cWcAAAAAAAAAAAEWaXBPNVJtZEhTZDZXTWFFR05waXdWZwEWSEY3T18xWU9SRU8teHoxQTFnWGloQQAA" + }, + { + "successful": false, + "pit_id": "o463QQEPbXktaW5kZXgtMDAwMDAxFkhGN09fMVlPUkVPLXh6MUExZ1hpaEEAFjBGbmVEZHdGU1EtaFhhUFc4ZkR5cWcAAAAAAAAAAAIWaXBPNVJtZEhTZDZXTWFFR05waXdWZwEWSEY3T18xWU9SRU8teHoxQTFnWGloQQAA" + } + ] +} +``` + +### Response fields + +Field | Data type | Description +:--- | :--- | :--- +successful | Boolean | Whether the delete operation was successful. +pit_id | [Base64 encoded binary]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/binary) | The PIT ID of the PIT to be deleted. + +## PIT segments +Introduced 2.4 +{: .label .label-purple } + +Similarly to the [CAT Segments API]({{site.url}}{{site.baseurl}}/api-reference/cat/cat-segments), the PIT Segments API provides low-level information about the disk utilization of a PIT by describing its Lucene segments. 
The PIT Segments API supports listing segment information of a specific PIT by ID or of all PITs at once. + +#### Example request: PIT segments of all PITs + +```json +GET /_cat/pit_segments/_all +``` + +If you want to list segments for one or several PITs, specify their PIT IDs in the request body. + +### Request fields + +Field | Data type | Description +:--- | :--- | :--- +pit_id | [Base64 encoded binary]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/binary) or an array of binaries | The PIT IDs of the PITs whose segments are to be listed. Required. + +#### Example request: PIT segments of PITs by ID + +```json +GET /_cat/pit_segments + +{ + "pit_id": [ + "o463QQEPbXktaW5kZXgtMDAwMDAxFkhGN09fMVlPUkVPLXh6MUExZ1hpaEEAFjBGbmVEZHdGU1EtaFhhUFc4ZkR5cWcAAAAAAAAAAAEWaXBPNVJtZEhTZDZXTWFFR05waXdWZwEWSEY3T18xWU9SRU8teHoxQTFnWGloQQAA", + "o463QQEPbXktaW5kZXgtMDAwMDAxFkhGN09fMVlPUkVPLXh6MUExZ1hpaEEAFjBGbmVEZHdGU1EtaFhhUFc4ZkR5cWcAAAAAAAAAAAIWaXBPNVJtZEhTZDZXTWFFR05waXdWZwEWSEY3T18xWU9SRU8teHoxQTFnWGloQQAA" + ] +} +``` + +#### Example response + +```json +index shard prirep ip segment generation docs.count docs.deleted size size.memory committed searchable version compound +index1 0 r 10.212.36.190 _0 0 4 0 3.8kb 1364 false true 8.8.2 true +index1 1 p 10.212.36.190 _0 0 3 0 3.7kb 1364 false true 8.8.2 true +index1 2 r 10.212.74.139 _0 0 2 0 3.6kb 1364 false true 8.8.2 true +``` + +## PIT settings + +You can specify the following settings for a PIT. + +Setting | Description | Default +:--- | :--- | :--- +point_in_time.max_keep_alive | A cluster-level setting that specifies the maximum value for the `keep_alive` parameter. | 24h +search.max_open_pit_context | A node-level setting that specifies the maximum number of open PIT contexts for the node. 
| 300 \ No newline at end of file diff --git a/_search-plugins/point-in-time.md b/_search-plugins/point-in-time.md new file mode 100644 index 0000000000..4453dde3dd --- /dev/null +++ b/_search-plugins/point-in-time.md @@ -0,0 +1,159 @@ +--- +layout: default +title: Point in Time +nav_order: 58 +has_children: true +has_toc: false +redirect_from: + - /opensearch/point-in-time/ +--- + +# Point in Time + +Point in Time (PIT) lets you run different queries against a dataset that is fixed in time. + +Normally, if you run a query on an index multiple times, the same query may return different results because documents are continually indexed, updated, and deleted. If you need to run a query against the same data, you can preserve that data's state by creating a PIT. The main use of the PIT feature is to couple it with the `search_after` functionality for deep pagination of search results. + +## Paginating search results + +Besides the PIT functionality, there are three ways to [paginate search results]({{site.url}}{{site.baseurl}}/opensearch/search/paginate) in OpenSearch: using the Scroll API, specifying `from` and `size` parameters for your search, and using the `search_after` functionality. However, all three have limitations: + +- The Scroll API's search results are frozen at the moment of the request, but they are bound to a particular query. Additionally, scroll can only move forward in the search, so if a request for a page fails, the subsequent request skips that page and returns the following one. +- If you specify the `from` and `size` parameters for your search, the search results are not frozen in time, so they may be inconsistent because of documents being indexed or deleted. The `from` and `size` feature is not recommended for deep pagination because every page request requires processing of all results and filtering them for the requested page. 
+- The `search_after` search results are not frozen in time, so they may be inconsistent because of concurrent document indexing or deletion. + +The PIT functionality does not have the limitations of other pagination methods, because PIT search is not bound to a query, and it supports consistent pagination going forward and backward. If you have looked at page one of your results and are now on page two, you will see the same page one if you go back. + +## PIT search + +PIT search has the same capabilities as regular search, except PIT search acts on an older dataset, while a regular search acts on a live dataset. PIT search is not bound to a query, so you can run different queries on the same dataset, which is frozen in time. + +You can use the [Create PIT API]({{site.url}}{{site.baseurl}}/opensearch/point-in-time-api#create-a-pit) to create a PIT. When you create a PIT for a set of indexes, OpenSearch locks a set of segments for those indexes, freezing them in time. On a lower level, none of the resources required for this PIT are modified or deleted. If the segments that are part of a PIT are merged, OpenSearch retains a copy of those segments for the period of time specified at PIT creation by the `keep_alive` parameter. + +The create PIT operation returns a PIT ID, which you can use to run multiple queries on the frozen dataset. Even though the indexes continue to ingest data and modify or delete documents, the PIT references the data that has not changed since the PIT creation. When your query contains a PIT ID, you don't need to pass the indexes to the search because it will use that PIT. A search with a PIT ID will produce exactly the same result when you run it multiple times. + +In case of a cluster or node failure, all PIT data is lost. +{: .note} + +## Pagination with PIT and search_after + +When you run a query with a PIT ID, you can use the `search_after` parameter to retrieve the next page of results. 
This gives you control over the order of documents in the pages of results. + +Run a search query with a PIT ID: + +```json +GET /_search +{ + "size": 10000, + "query": { + "match" : { + "user.id" : "elkbee" + } + }, + "pit": { + "id": "46ToAwMDaWR5BXV1aWQyKwZub2RlXzMAAAAAAAAAACoBYwADaWR4BXV1aWQxAgZub2RlXzEAAAAAAAAAAAEBYQADaWR5BXV1aWQyKgZub2RlXzIAAAAAAAAAAAwBYgACBXV1aWQyAAAFdXVpZDEAAQltYXRjaF9hbGw_gAAAAA==", + "keep_alive": "100m" + }, + "sort": [ + {"@timestamp": {"order": "asc", "format": "strict_date_optional_time_nanos"}}, + {"_shard_doc": "desc"} + ] +} +``` + +The response contains the first 10,000 documents that match the query. To get the next set of documents, run the same query with the last document's sort values as the `search_after` parameter, keeping the same `sort` and `pit.id`. You can use the optional `keep_alive` parameter to extend the PIT time: + +```json +GET /_search +{ + "size": 10000, + "query": { + "match" : { + "user.id" : "elkbee" + } + }, + "pit": { + "id": "46ToAwMDaWR5BXV1aWQyKwZub2RlXzMAAAAAAAAAACoBYwADaWR4BXV1aWQxAgZub2RlXzEAAAAAAAAAAAEBYQADaWR5BXV1aWQyKgZub2RlXzIAAAAAAAAAAAwBYgACBXV1aWQyAAAFdXVpZDEAAQltYXRjaF9hbGw_gAAAAA==", + "keep_alive": "100m" + }, + "sort": [ + {"@timestamp": {"order": "asc", "format": "strict_date_optional_time_nanos"}}, + {"_shard_doc": "desc"} + ], + "search_after": [ + "2021-05-20T05:30:04.832Z" + ] +} +``` + +## Search slicing + +Using `search_after` with PIT for pagination gives you control over ordering of the results. If you don't need results in any specific order, or if you want the ability to jump from a page to a non-consecutive page, you can use search slicing. Search slicing splits a PIT search into multiple slices that can be consumed independently by a client application. + +For example, if you have a PIT search query that has 1,000,000 results and you want to return 50,000 results at a time, your client application has to make 20 consecutive calls to receive each batch of results. 
If you use search slicing, you can parallelize these 20 calls. In your multithreaded client application, you can use five slices for each PIT. As a result, you will have five 10,000-hit slices that can be consumed by five different threads in your client, instead of having a single thread consume 50,000 results. + +To use search slicing, you have to specify two parameters: +- `slice.id` is the slice ID you are requesting. +- `slice.max` is the number of slices to break the search response into. + +The following PIT search query illustrates search slicing: + +```json + +GET /_search +{ + "slice": { + "id": 0, // id is the slice (page) number being requested. In every request we can only query for one slice + "max": 2 // max is the total number of slices (pages) the search response will be broken down into + }, + "query": { + "match": { + "message": "foo" + } + }, + "pit": { + "id": "46ToAwMDaWR5BXV1aWQyKwZub2RlXzMAAAAAAAAAACoBYwADaWR4BXV1aWQxAgZub2RlXzEAAAAAAAAAAAEBYQADaWR5BXV1aWQyKgZub2RlXzIAAAAAAAAAAAwBYgACBXV1aWQyAAAFdXVpZDEAAQltYXRjaF9hbGw_gAAAAA==" + } +} +``` + +In every request you can only query for one slice, so the next query will be the same as the previous one, except the `slice.id` will be `1`. + +## Security model + +This section describes the permissions needed to use PIT API operations if you are running OpenSearch with the security plugin enabled. + +Users can access all PIT API operations using the `point_in_time_full_access` role. If this role doesn't meet your needs, mix and match individual PIT permissions to suit your use case. Each action corresponds to an operation in the REST API. For example, the `indices:data/read/point_in_time/create` permission lets you create a PIT.
The following are the possible permissions: + +- `indices:data/read/point_in_time/create` – Create API +- `indices:data/read/point_in_time/delete` – Delete API +- `indices:data/read/point_in_time/readall` – List All PITs API +- `indices:data/read/search` – Search API +- `indices:monitor/point_in_time/segments` – PIT Segments API + +For `all` API operations, such as list all and delete all, the user needs the all indexes (*) permission. For API operations such as search, create PIT, or delete list, the user only needs individual index permissions. + +The PIT IDs always contain the underlying (resolved) indexes when saved. The following sections describe the required permissions for aliases and data streams. + +### Alias permissions + +For aliases, users must have either index **or** alias permissions for any PIT operation. + +### Data stream permissions + +For data streams, users must have both the data stream **and** the data stream's backing index permissions for any PIT operation. For example, the user must have permissions for the `my-data-stream11` data stream and for its backing index `.ds-my-data-stream11-000001`. + +If users have the data stream permissions only, they will be able to create a PIT, but they will not be able to use the PIT ID for other operations, such as search, without the backing index permissions. + +## API + +The following table lists all [Point in Time API]({{site.url}}{{site.baseurl}}/opensearch/point-in-time-api) functions. + +Function | API | Description +:--- | :--- | :--- +[Create PIT]({{site.url}}{{site.baseurl}}/opensearch/point-in-time-api#create-a-pit) | `POST /<target_indexes>/_search/point_in_time?keep_alive=1h` | Creates a PIT. +[List PIT]({{site.url}}{{site.baseurl}}/opensearch/point-in-time-api#list-all-pits) | `GET /_search/point_in_time/_all` | Lists all PITs. +[Delete PIT]({{site.url}}{{site.baseurl}}/opensearch/point-in-time-api#delete-pits) | `DELETE /_search/point_in_time`
`DELETE /_search/point_in_time/_all` | Deletes a PIT or all PITs. +[PIT segments]({{site.url}}{{site.baseurl}}/opensearch/point-in-time-api#pit-segments) | `GET /_cat/pit_segments/_all` | Provides information about the disk utilization of a PIT by describing its Lucene segments. + +For information about the relevant cluster and node settings, see [PIT Settings]({{site.url}}{{site.baseurl}}/opensearch/point-in-time-api#pit-settings). diff --git a/_search-plugins/querqy/index.md b/_search-plugins/querqy/index.md index 7f97fada8f..3abd12dcbd 100644 --- a/_search-plugins/querqy/index.md +++ b/_search-plugins/querqy/index.md @@ -4,7 +4,7 @@ title: Querqy has_children: false redirect_from: - /search-plugins/querqy/ -nav_order: 10 +nav_order: 210 --- # Querqy @@ -13,51 +13,34 @@ Querqy is a community plugin for query rewriting that helps to solve relevance i ## Querqy plugin installation -Querqy is currently only compatible with OpenSearch 1.3.1 -{: .note } +The Querqy plugin is now available for OpenSearch 2.3.0. Run the following command to install the Querqy plugin. -1. The Querqy plugin code is located here: [querqy-opensearch](https://github.com/querqy/querqy-opensearch). To download the plugin code ZIP file, select the green "Code" button, then select "Download ZIP" +````bash +./bin/opensearch-plugin install \ + "https://repo1.maven.org/maven2/org/querqy/opensearch-querqy/1.0.os2.3.0/opensearch-querqy-1.0.os2.3.0.zip" +```` -1. Install JDK 11. On Amazon Linux 2, install JDK11 with the following command: +Answer `yes` to the security prompts during the installation as Querqy requires additional permissions to load query rewriters. - ```bash - sudo yum install java-11-amazon-corretto - ``` +After installing the Querqy plugin you can find comprehensive documentation on the Querqy.org site: [Querqy](https://docs.querqy.org/querqy/index.html) -1. 
Uncompress the ZIP file: +## Path and HTTP methods - ```bash - unzip querqy-opensearch-main.zip - ``` +``` +POST /myindex/_search +``` -1. Change to the uncompressed Querqy directory: +## Example query - ```bash - cd querqy-opensearch-main - ``` - -1. Compile the plugin: - - ```bash - ./gradlew build - ``` - -1. The compiled plugin is stored in this directory: - - ```bash - /path/to/file/querqy-opensearch-main/build/distributions/opensearch-querqy-1.3.1.0.zip` - ``` - -1. The compiled Querqy plugin is installed the same as [any OpenSearch plugin](https://opensearch.org/docs/latest/opensearch/install/plugins/#install-a-plugin): - - ```bash - /path/to/opensearch/bin/opensearch-plugin install file:///path/to/file/opensearch-querqy-1.3.1.0.zip - ``` - -1. Reboot the OpenSearch node: - - ```bash - sudo reboot - ``` - -After installing the Querqy plugin you can find comprehensive documentation on the Querqy.org site: [Querqy](https://docs.querqy.org/querqy/index.html) \ No newline at end of file +````json +{ + "query": { + "querqy": { + "matching_query": { + "query": "books" + }, + "query_fields": [ "title^3.0", "words^2.1", "shortSummary"] + } + } +} +```` \ No newline at end of file diff --git a/_search-plugins/search-relevance/index.md b/_search-plugins/search-relevance/index.md new file mode 100644 index 0000000000..7c020952ea --- /dev/null +++ b/_search-plugins/search-relevance/index.md @@ -0,0 +1,157 @@ +--- +layout: default +title: Search relevance +nav_order: 55 +has_children: false +has_toc: false +redirect_from: + - /search-plugins/search-relevance/ +--- + +# Compare search results + +Compare Search Results is an experimental feature. For updates on the progress of Compare Search Results and other search relevance features, or if you want to leave feedback that could help improve the feature, join the [discussion on the OpenSearch forum](https://forum.opensearch.org/t/feedback-experimental-feature-compare-search-results/11331). 
+{: .warning} + +Compare Search Results is the first search relevance feature in OpenSearch. It lets you compare search results from two queries side by side to determine whether one query produces better results than the other. Using this tool, you can evaluate search quality by experimenting with queries. + +For example, you can see how results change when you apply one of the following query changes: + +- Weighting different fields differently +- Different stemming or lemmatization strategies +- Shingling + +## Prerequisites + +Before you get started, you must index data in OpenSearch. To learn how to create a new index, see [Index data]({{site.url}}{{site.baseurl}}/opensearch/index-data). + +Alternatively, you can add sample data in OpenSearch Dashboards using the following steps: + +1. On the top menu bar, go to **OpenSearch Dashboards > Overview**. +1. Select **View app directory**. +1. Select **Add sample data**. +1. Choose one of the built-in datasets and select **Add data**. + +## Using Compare Search Results in OpenSearch Dashboards + +To compare search results in OpenSearch Dashboards, perform the following steps. + +**Step 1:** On the top menu bar, go to **OpenSearch Plugins > Search Relevance**. + +**Step 2:** Enter the search text in the search bar. + +**Step 3:** Select an index for **Query 1** and enter a query (request body only) in [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl). The `GET` HTTP method and the `_search` endpoint are implicit. Use the `%SearchText%` variable to refer to the text in the search bar. + +The following is an example query: + +```json +{ + "query": { + "multi_match": { + "query": "%SearchText%", + "fields": [ "description", "item_name" ] + } + } +} +``` + +**Step 4:** Select an index for **Query 2** and enter a query (request body only). 
+ +The following example query boosts the `item_name` field in search results: + +```json +{ + "query": { + "multi_match": { + "query": "%SearchText%", + "fields": [ "description", "item_name^3" ] + } + } +} +``` + +**Step 5:** Select **Search** and compare the results in **Result 1** and **Result 2**. + +The following example screen shows a search for the word "cup" in the `description` and `item_name` fields with and without boosting the `item_name`: + +Compare search results{: .img-fluid } + +If a result in Result 1 appears in Result 2, the `Up` and `Down` indicators below the result number signify how many places the result moved up or down compared to the same result in Result 2. In this example, the document with the ID 2 is `Up 1` place in Result 2 compared to Result 1 and `Down 1` place in Result 1 compared to Result 2. + +## Changing the number of results + +By default, OpenSearch returns the top 10 results. To change the number of returned results to a different value, specify the `size` parameter in the query: + +```json +{ + "size": 15, + "query": { + "multi_match": { + "query": "%SearchText%", + "fields": [ "title^3", "text" ] + } + } +} +``` + +Setting `size` to a high value (for example, larger than 250 documents) may degrade performance. +{: .note} + +You cannot save a given comparison for future use, so Compare Search Results is not suitable for systematic testing. +{: .note} + +## Comparing OpenSearch search results with re-ranked results + +One use case for Compare Search Results is to compare raw OpenSearch results with the same results processed by a re-ranking application. An example of such a re-ranker is **Kendra Intelligent Ranking for OpenSearch**, contributed by the Amazon Kendra team. This plugin takes search results from OpenSearch and applies Amazon Kendra’s semantic relevance rankings calculated using vector embeddings and other semantic search techniques. For many applications, this provides better result rankings.
+ +To try Kendra Intelligent Ranking, you must first set up the Amazon Kendra service. To get started, see [Amazon Kendra](https://aws.amazon.com/kendra/). For detailed information, including plugin setup instructions, see [Intelligently ranking OpenSearch (self managed) results using Amazon Kendra](https://docs.aws.amazon.com/kendra/latest/dg/opensearch-rerank.html). + +Once you've set up Kendra Intelligent Ranking, enter a query in **Query 1** and enter the same query using Kendra Intelligent Ranking in **Query 2**. Then compare the search results from OpenSearch and Amazon Kendra. + +### Example + +The following example searches for the text "snacking nuts" in the `abo` index. The documents in the index contain snack descriptions in the `bullet_point` array. + +OpenSearch Intelligent Ranking query{: .img-fluid } + +1. Enter `snacking nuts` in the search bar. +1. Enter the following query, which searches the `bullet_point` field for the search text "snacking nuts", in **Query 1**: + + ```json + { + "query": { + "match": { + "bullet_point": "%SearchText%" + } + }, + "size": 25 + } + ``` +1. Enter the same query with intelligent ranking in **Query 2**: + + ```json + { + "query" : { + "match" : { + "bullet_point": "%SearchText%" + } + }, + "size": 25, + "ext": { + "search_configuration":{ + "result_transformer" : { + "kendra_intelligent_ranking": { + "order": 1, + "properties": { + "title_field": "item_name", + "body_field": "bullet_point" + } + } + } + } + } + } + ``` + + In the preceding query, `body_field` refers to the body field of the documents in the index, which Kendra Intelligent Ranking uses to rank the results. The `body_field` is required, while the `title_field` is optional. +1. Select **Search** and compare the results in **Result 1** and **Result 2**. 
\ No newline at end of file diff --git a/_opensearch/search-template.md b/_search-plugins/search-template.md similarity index 97% rename from _opensearch/search-template.md rename to _search-plugins/search-template.md index 476e804932..3b9bc7cc7b 100644 --- a/_opensearch/search-template.md +++ b/_search-plugins/search-template.md @@ -2,6 +2,8 @@ layout: default title: Search templates nav_order: 50 +redirect_from: + - /opensearch/search-template/ --- # Search templates @@ -205,6 +207,15 @@ POST _render/template } ``` +The following render operations are supported: + +```json +GET /_render/template +POST /_render/template +GET /_render/template/<id> +POST /_render/template/<id> +``` + ## Advanced parameter conversion with search templates You have a lot of different syntax options in Mustache to transpose the input parameters into a query. diff --git a/_opensearch/search/autocomplete.md b/_search-plugins/searching-data/autocomplete.md similarity index 99% rename from _opensearch/search/autocomplete.md rename to _search-plugins/searching-data/autocomplete.md index 36276ba477..ce867ed415 100644 --- a/_opensearch/search/autocomplete.md +++ b/_search-plugins/searching-data/autocomplete.md @@ -3,6 +3,8 @@ layout: default title: Autocomplete parent: Searching data nav_order: 24 +redirect_from: + - /opensearch/search/autocomplete/ --- # Autocomplete functionality diff --git a/_opensearch/search/did-you-mean.md b/_search-plugins/searching-data/did-you-mean.md similarity index 100% rename from _opensearch/search/did-you-mean.md rename to _search-plugins/searching-data/did-you-mean.md diff --git a/_opensearch/search/highlight.md b/_search-plugins/searching-data/highlight.md similarity index 99% rename from _opensearch/search/highlight.md rename to _search-plugins/searching-data/highlight.md index 52db512cbb..7b312e563e 100644 --- a/_opensearch/search/highlight.md +++ b/_search-plugins/searching-data/highlight.md @@ -3,6 +3,8 @@ layout: default title: Highlight query matches parent:
Searching data nav_order: 23 +redirect_from: + - /opensearch/search/highlight/ --- # Highlight query matches diff --git a/_opensearch/search/index.md b/_search-plugins/searching-data/index.md similarity index 98% rename from _opensearch/search/index.md rename to _search-plugins/searching-data/index.md index 35c6671cd6..7e1c5a7eea 100644 --- a/_opensearch/search/index.md +++ b/_search-plugins/searching-data/index.md @@ -1,7 +1,7 @@ --- layout: default title: Searching data -nav_order: 20 +nav_order: 5 has_children: true has_toc: false redirect_from: /opensearch/ux/ diff --git a/_opensearch/search/paginate.md b/_search-plugins/searching-data/paginate.md similarity index 99% rename from _opensearch/search/paginate.md rename to _search-plugins/searching-data/paginate.md index 660a99f2a5..a43cfac782 100644 --- a/_opensearch/search/paginate.md +++ b/_search-plugins/searching-data/paginate.md @@ -3,6 +3,8 @@ layout: default title: Paginate results parent: Searching data nav_order: 21 +redirect_from: + - /opensearch/search/paginate/ --- ## Paginate results diff --git a/_opensearch/search/sort.md b/_search-plugins/searching-data/sort.md similarity index 99% rename from _opensearch/search/sort.md rename to _search-plugins/searching-data/sort.md index dac96d175a..fa4875d32f 100644 --- a/_opensearch/search/sort.md +++ b/_search-plugins/searching-data/sort.md @@ -3,6 +3,8 @@ layout: default title: Sort results parent: Searching data nav_order: 22 +redirect_from: + - /opensearch/search/sort/ --- ## Sort results diff --git a/_search-plugins/sql/full-text.md b/_search-plugins/sql/full-text.md index 9c60692801..ce72cc149c 100644 --- a/_search-plugins/sql/full-text.md +++ b/_search-plugins/sql/full-text.md @@ -148,7 +148,7 @@ You can specify the following options for `MULTI_MATCH` in any order: - `zero_terms_query` - `boost` -Please, refer to `multi_match` query [documentation]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text/#multi-match) for parameter description and 
supported values. +Please, refer to `multi_match` query [documentation](#multi-match) for parameter description and supported values. ### For example, REST API search for `Dale` in either the `firstname` or `lastname` fields: diff --git a/_clients/cli.md b/_tools/cli.md similarity index 98% rename from _clients/cli.md rename to _tools/cli.md index 01a16593b5..04371d67cd 100644 --- a/_clients/cli.md +++ b/_tools/cli.md @@ -1,8 +1,10 @@ --- layout: default title: OpenSearch CLI -nav_order: 52 +nav_order: 70 has_children: false +redirect_from: + - /clients/cli/ --- # OpenSearch CLI diff --git a/_clients/grafana.md b/_tools/grafana.md similarity index 95% rename from _clients/grafana.md rename to _tools/grafana.md index 97e35de40e..16a899d82e 100644 --- a/_clients/grafana.md +++ b/_tools/grafana.md @@ -1,7 +1,7 @@ --- layout: default title: Grafana -nav_order: 150 +nav_order: 200 has_children: false --- diff --git a/_clients/agents-and-ingestion-tools/index.md b/_tools/index.md similarity index 55% rename from _clients/agents-and-ingestion-tools/index.md rename to _tools/index.md index 4eab146acf..b669d45d0b 100644 --- a/_clients/agents-and-ingestion-tools/index.md +++ b/_tools/index.md @@ -1,25 +1,25 @@ --- layout: default -title: Agents and ingestion tools -nav_order: 140 +title: Tools +nav_order: 50 has_children: false -has_toc: false redirect_from: - /clients/agents-and-ingestion-tools/ --- -# Agents and ingestion tools +# OpenSearch tools -Historically, many multiple popular agents and ingestion tools have worked with Elasticsearch OSS, such as Beats, Logstash, Fluentd, FluentBit, and OpenTelemetry. OpenSearch aims to continue to support a broad set of agents and ingestion tools, but not all have been tested or have explicitly added OpenSearch compatibility. +This section provides documentation for OpenSearch-supported tools, including: -Previously, an intermediate compatibility solution was available. 
OpenSearch had a setting that instructed the cluster to return version 7.10.2 rather than its actual version. +- [Agents and ingestion tools](#agents-and-ingestion-tools) +- [OpenSearch CLI](#opensearch-cli) +- [OpenSearch Kubernetes operator](#opensearch-kubernetes-operator) -The override main response setting `compatibility.override_main_response_version` is deprecated from OpenSearch version 1.x and removed from OpenSearch 2.0.0. This setting is no longer supported for compatibility with legacy clients. -{: .note} +## Agents and ingestion tools - + Logstash OSS 8.0 introduces a breaking change where all plugins run in ECS compatibility mode by default. If you use a compatible [OSS client](#compatibility-matrices) you must override the default value to maintain legacy behavior: ```yml ecs_compatibility => disabled ``` -## Downloads +### Downloads You can download the OpenSearch output plugin for Logstash from [OpenSearch downloads](https://opensearch.org/downloads.html). The Logstash output plugin is compatible with OpenSearch and Elasticsearch OSS (7.10.2 or lower). 
@@ -70,30 +70,41 @@ Some users report compatibility issues with ingest pipelines on these versions o ### Compatibility Matrix for Logstash -| | Logstash OSS 7.x to 7.11.x | Logstash OSS 7.12.x\* | Logstash 7.13.x-7.16.x without OpenSearch output plugin | Logstash 7.13.x-7.16.x with OpenSearch output plugin | Logstash 8.x+ with OpenSearch output plugin +| | Logstash OSS 7.0.0 to 7.11.x | Logstash OSS 7.12.x\* | Logstash 7.13.x-7.16.x without OpenSearch output plugin | Logstash 7.13.x-7.16.x with OpenSearch output plugin | Logstash 8.x+ with OpenSearch output plugin | :---| :--- | :--- | :--- | :--- | :--- | -| Elasticsearch OSS 7.x to 7.9.x | *Yes* | *Yes* | *No* | *Yes* | *Yes* | +| Elasticsearch OSS 7.0.0 to 7.9.x | *Yes* | *Yes* | *No* | *Yes* | *Yes* | | Elasticsearch OSS 7.10.2 | *Yes* | *Yes* | *No* | *Yes* | *Yes* | -| ODFE 1.x to 1.12 | *Yes* | *Yes* | *No* | *Yes* | *Yes* | +| ODFE 1.0 to 1.12 | *Yes* | *Yes* | *No* | *Yes* | *Yes* | | ODFE 1.13 | *Yes* | *Yes* | *No* | *Yes* | *Yes* | -| OpenSearch 1.x | Yes via version setting | Yes via version setting | *No* | *Yes* | Yes, with Elastic Common Schema Setting | +| OpenSearch 1.x to 2.x | Yes via version setting | Yes via version setting | *No* | *Yes* | Yes, with Elastic Common Schema Setting | \* Most current compatible version with Elasticsearch OSS. 
### Compatibility Matrix for Beats -| | Beats OSS 7.x to 7.11.x\*\* | Beats OSS 7.12.x\* | Beats 7.13.x | +| | Beats OSS 7.0.0 to 7.11.x\*\* | Beats OSS 7.12.x\* | Beats 7.13.x | | :--- | :--- | :--- | :--- | -| Elasticsearch OSS 7.x to 7.9.x | *Yes* | *Yes* | No | +| Elasticsearch OSS 7.0.0 to 7.9.x | *Yes* | *Yes* | No | | Elasticsearch OSS 7.10.2 | *Yes* | *Yes* | No | -| ODFE 1.x to 1.12 | *Yes* | *Yes* | No | +| ODFE 1.0 to 1.12 | *Yes* | *Yes* | No | | ODFE 1.13 | *Yes* | *Yes* | No | -| OpenSearch 1.x | Yes via version setting | Yes via version setting | No | -| Logstash OSS 7.x to 7.11.x | *Yes* | *Yes* | *Yes* | +| OpenSearch 1.x to 2.x | Yes via version setting | Yes via version setting | No | +| Logstash OSS 7.0.0 to 7.11.x | *Yes* | *Yes* | *Yes* | | Logstash OSS 7.12.x\* | *Yes* | *Yes* | *Yes* | | Logstash 7.13.x with OpenSearch output plugin | *Yes* | *Yes* | *Yes* | \* Most current compatible version with Elasticsearch OSS. \*\* Beats OSS includes all Apache 2.0 Beats agents (i.e. Filebeat, Metricbeat, Auditbeat, Heartbeat, Winlogbeat, Packetbeat). + +Beats versions newer than 7.12.x are not supported by OpenSearch. If you must update the Beats agent(s) in your environment to a newer version, you can work around the incompatibility by directing traffic from Beats to Logstash and using the Logstash Output plugin to ingest the data to OpenSearch. +{: .warning } + +## OpenSearch CLI + +The OpenSearch command line interface (opensearch-cli) lets you manage your OpenSearch cluster from the command line and automate tasks. For more information on OpenSearch CLI, see [OpenSearch CLI]({{site.url}}{{site.baseurl}}/tools/cli/). + +## OpenSearch Kubernetes operator + +The OpenSearch Kubernetes (K8s) Operator is an open-source Kubernetes operator that helps automate the deployment and provisioning of OpenSearch and OpenSearch Dashboards in a containerized environment.
For information on how to use the K8s operator, see [OpenSearch Kubernetes operator]({{site.url}}{{site.baseurl}}/tools/k8s-operator/). \ No newline at end of file diff --git a/_tools/k8s-operator.md b/_tools/k8s-operator.md new file mode 100644 index 0000000000..3f9f8512f7 --- /dev/null +++ b/_tools/k8s-operator.md @@ -0,0 +1,147 @@ +--- +layout: default +title: OpenSearch Kubernetes Operator +nav_order: 80 +has_children: false +--- + +The OpenSearch Kubernetes Operator is an open-source Kubernetes operator that helps automate the deployment and provisioning of OpenSearch and OpenSearch Dashboards in a containerized environment. The operator can manage multiple OpenSearch clusters that can be scaled up and down depending on your needs. + + +## Installation + +There are two ways to get started with the operator: + +- [Use a Helm chart](#use-a-helm-chart). +- [Use a local installation](#use-a-local-installation). + +### Use a Helm chart + +If you use Helm to manage your Kubernetes cluster, you can use the OpenSearch Kubernetes Operator's Cloud Native Computing Foundation (CNCF) project stored in Artifact Hub, a web-based application for finding, installing, and publishing CNCF packages. + +To begin, log in to your Kubernetes cluster and add the Helm repository (repo) from [Artifact Hub](https://opster.github.io/opensearch-k8s-operator/). + +``` +helm repo add opensearch-operator https://opster.github.io/opensearch-k8s-operator/ +``` + +Make sure that the repo is included in your Kubernetes cluster. + +``` +helm repo list | grep opensearch +``` + +Both the `opensearch` and `opensearch-operator` repos appear in the list of repos. + + +Install the manager that operates all of the OpenSearch Kubernetes Operator's actions. + +``` +helm install opensearch-operator opensearch-operator/opensearch-operator +``` + +After the installation completes, the operator returns information on the deployment with `STATUS: deployed`.
Then you can configure and start your [OpenSearch cluster](#deploy-a-new-opensearch-cluster). + +### Use a local installation + +If you want to create a new Kubernetes cluster on your existing machine, use a local installation. + +If this is your first time running Kubernetes and you intend to run through these instructions on your laptop, make sure that you have the following installed: + +- [Kubernetes](https://kubernetes.io/docs/tasks/tools/) +- [Docker](https://docs.docker.com/engine/install/) +- [minikube](https://minikube.sigs.k8s.io/docs/start/) + +Before running through the installation steps, make sure that you have a Kubernetes environment running locally. When using minikube, open a new terminal window and enter `minikube start`. Kubernetes will now use a containerized minikube cluster with a namespace called `default`. + +Then install the OpenSearch Kubernetes Operator using the following steps: + +1. In your preferred directory, clone the [OpenSearch Kubernetes Operator repo](https://github.com/Opster/opensearch-k8s-operator). Navigate into the repo's directory using `cd`. +2. Go to the `opensearch-operator` folder. +3. Enter `make build manifests`. +4. Start a Kubernetes cluster. When using minikube, open a new terminal window and enter `minikube start`. Kubernetes will now use a containerized minikube cluster with a namespace called `default`. Make sure that `~/.kube/config` points to the cluster.
+ +```yml +apiVersion: v1 +clusters: +- cluster: + certificate-authority: /Users/naarcha/.minikube/ca.crt + extensions: + - extension: + last-update: Mon, 29 Aug 2022 10:11:47 CDT + provider: minikube.sigs.k8s.io + version: v1.26.1 + name: cluster_info + server: https://127.0.0.1:61661 + name: minikube +contexts: +- context: + cluster: minikube + extensions: + - extension: + last-update: Mon, 29 Aug 2022 10:11:47 CDT + provider: minikube.sigs.k8s.io + version: v1.26.1 + name: context_info + namespace: default + user: minikube + name: minikube +current-context: minikube +kind: Config +preferences: {} +users: +- name: minikube + user: + client-certificate: /Users/naarcha/.minikube/profiles/minikube/client.crt + client-key: /Users/naarcha/.minikube/profiles/minikube/client.key +``` + +5. Enter `make install` to create the CustomResourceDefinition that runs in your Kubernetes cluster. +6. Start the OpenSearch Kubernetes Operator. Enter `make run`. + +## Verify Kubernetes deployment + +To ensure that Kubernetes recognizes the OpenSearch Kubernetes Operator as a namespace, enter `kubectl get ns | grep opensearch`. Both `opensearch` and `opensearch-operator-system` should appear as `Active`. + +With the operator active, use `kubectl get pod -n opensearch-operator-system` to make sure that the operator's pods are running. + +``` +NAME READY STATUS RESTARTS AGE +opensearch-operator-controller-manager- 2/2 Running 0 25m +``` + +With the Kubernetes cluster running, you can now run OpenSearch inside the cluster. + +## Deploy a new OpenSearch cluster + +From your cloned OpenSearch Kubernetes Operator repo, navigate to the `opensearch-operator/examples` directory. There you'll find the `opensearch-cluster.yaml` file, which can be customized to the needs of your cluster, including the `clusterName` that acts as the namespace in which your new OpenSearch cluster will reside. + +With your cluster configured, run the `kubectl apply` command.
+ +``` +kubectl apply -f opensearch-cluster.yaml +``` + +The operator creates several pods, including a bootstrap pod, three OpenSearch cluster pods, and one Dashboards pod. To connect to your cluster, use the `port-forward` command. + +``` +kubectl port-forward svc/my-cluster-dashboards 5601 +``` + +Open http://localhost:5601 in your preferred browser and log in with the default demo credentials `admin / admin`. You can also run curl commands against the OpenSearch REST API by forwarding to port 9200. + +``` +kubectl port-forward svc/my-cluster 9200 +``` + +In order to delete the OpenSearch cluster, delete the cluster resources. The following command deletes the cluster namespace and all its resources. + +``` +kubectl delete -f opensearch-cluster.yaml +``` + +## Next steps + +To learn more about how to customize your Kubernetes OpenSearch cluster, including data persistence, authentication methods, and scaling, see the [OpenSearch Kubernetes Operator User Guide](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/userguide/main.md). + +If you want to contribute to the development of the OpenSearch Kubernetes Operator, see the repo [design documents](https://github.com/Opster/opensearch-k8s-operator/blob/main/docs/designs/high-level.md). 
\ No newline at end of file diff --git a/_clients/logstash/advanced-config.md b/_tools/logstash/advanced-config.md similarity index 100% rename from _clients/logstash/advanced-config.md rename to _tools/logstash/advanced-config.md diff --git a/_clients/logstash/common-filters.md b/_tools/logstash/common-filters.md similarity index 100% rename from _clients/logstash/common-filters.md rename to _tools/logstash/common-filters.md diff --git a/_clients/logstash/execution-model.md b/_tools/logstash/execution-model.md similarity index 100% rename from _clients/logstash/execution-model.md rename to _tools/logstash/execution-model.md diff --git a/_clients/logstash/index.md b/_tools/logstash/index.md similarity index 99% rename from _clients/logstash/index.md rename to _tools/logstash/index.md index 2947ff7340..deb447045b 100644 --- a/_clients/logstash/index.md +++ b/_tools/logstash/index.md @@ -1,7 +1,7 @@ --- layout: default title: Logstash -nav_order: 200 +nav_order: 150 has_children: true has_toc: true redirect_from: diff --git a/_clients/logstash/read-from-opensearch.md b/_tools/logstash/read-from-opensearch.md similarity index 100% rename from _clients/logstash/read-from-opensearch.md rename to _tools/logstash/read-from-opensearch.md diff --git a/_clients/logstash/ship-to-opensearch.md b/_tools/logstash/ship-to-opensearch.md similarity index 86% rename from _clients/logstash/ship-to-opensearch.md rename to _tools/logstash/ship-to-opensearch.md index 050c8a4336..2728ee98dd 100644 --- a/_clients/logstash/ship-to-opensearch.md +++ b/_tools/logstash/ship-to-opensearch.md @@ -9,7 +9,7 @@ nav_order: 220 You can Ship Logstash events to an OpenSearch cluster and then visualize your events with OpenSearch Dashboards. -Make sure you have [Logstash]({{site.url}}{{site.baseurl}}/tools/logstash/index#install-logstash), [OpenSearch]({{site.url}}{{site.baseurl}}/opensearch/install/index/), and [OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/install/index/). 
+Make sure you have [Logstash]({{site.url}}{{site.baseurl}}/tools/logstash/index#install-logstash), [OpenSearch]({{site.url}}{{site.baseurl}}/install-and-configure/install-opensearch/index/), and [OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/index/). {: .note } ## OpenSearch output plugin @@ -117,7 +117,8 @@ output { type => 'aws_iam' aws_access_key_id => 'ACCESS_KEY' aws_secret_access_key => 'SECRET_KEY' - region => 'us-west-2' + region => 'us-west-2' + service_name => 'es' } index => "logstash-logs-%{+YYYY.MM.dd}" } @@ -142,8 +143,11 @@ output { - Environment variables - AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY (RECOMMENDED since they are recognized by all the AWS SDKs and CLI except for .NET), or AWS_ACCESS_KEY and AWS_SECRET_KEY (only recognized by Java SDK) - Credential profiles file at the default location (~/.aws/credentials) shared by all AWS SDKs and the AWS CLI - Instance profile credentials delivered through the Amazon EC2 metadata service -- template (path) - You can set the path to your own template here, if you so desire. If not set, the included template will be used. -- template_name (string, default => "logstash") - defines how the template is named inside Opensearch +- template (path) - You can set the path to your own template here. If no template is specified, the plugin uses the default template. +- template_name (string, default => "logstash") - Defines how the template is named inside OpenSearch. +- service_name (string, default => "es") - Defines the service name to be used for `aws_iam` authentication. +- legacy_template (boolean, default => true) - Selects the OpenSearch template API. When `true`, uses legacy templates via the _template API. When `false`, uses composable templates via the _index_template API. +- default_server_major_version (number) - The OpenSearch server major version to use when it's not available from the OpenSearch root URL.
If not set, the plugin throws an exception when the version can't be fetched. ## Data streams From e6e713abfed50bcd88fa98e0c3b01655e63fdc1b Mon Sep 17 00:00:00 2001 From: Naarcha-AWS Date: Wed, 8 Mar 2023 15:03:09 -0600 Subject: [PATCH 2/3] Fix 1.3 TOC Signed-off-by: Naarcha-AWS --- _api-reference/nodes-apis/nodes-stats.md | 15 - _api-reference/snapshots/restore-snapshot.md | 3 +- _clients/index.md | 1 - .../get-started/quickstart-dashboards.md | 3 + _dashboards/search-telemetry.md | 5 - .../install-dashboards/index.md | 1 - _install-and-configure/plugins.md | 3 - .../upgrade-opensearch/index.md | 2 +- _monitoring-your-cluster/pa/index.md | 2 +- .../availability-and-recovery/remote.md | 234 +++++++++ .../segment-replication/configuration.md | 84 ---- .../segment-replication/index.md | 27 - .../shard-indexing-backpressure.md | 33 -- .../shard-indexing-settings.md | 52 -- .../snapshots/index.md | 30 ++ .../snapshots/searchable_snapshot.md | 125 +++++ .../snapshots/sm-api.md | 463 ++++++++++++++++++ .../snapshots/snapshot-management.md | 81 +++ .../snapshots/snapshot-restore.md | 1 + .../availability-and-recovery/stats-api.md | 12 +- .../cluster-manager-task-throttling.md | 107 ++++ _tuning-your-cluster/cluster.md | 39 +- .../replication-plugin/api.md | 394 +++++++++++++++ 23 files changed, 1480 insertions(+), 237 deletions(-) create mode 100644 _tuning-your-cluster/availability-and-recovery/remote.md delete mode 100644 _tuning-your-cluster/availability-and-recovery/segment-replication/configuration.md delete mode 100644 _tuning-your-cluster/availability-and-recovery/segment-replication/index.md delete mode 100644 _tuning-your-cluster/availability-and-recovery/shard-indexing-backpressure.md delete mode 100644 _tuning-your-cluster/availability-and-recovery/shard-indexing-settings.md create mode 100644 _tuning-your-cluster/availability-and-recovery/snapshots/index.md create mode 100644 _tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md 
create mode 100644 _tuning-your-cluster/availability-and-recovery/snapshots/sm-api.md create mode 100644 _tuning-your-cluster/availability-and-recovery/snapshots/snapshot-management.md create mode 100644 _tuning-your-cluster/cluster-manager-task-throttling.md create mode 100644 _tuning-your-cluster/replication-plugin/api.md diff --git a/_api-reference/nodes-apis/nodes-stats.md b/_api-reference/nodes-apis/nodes-stats.md index ca6f9797e9..8e2f1bb50c 100644 --- a/_api-reference/nodes-apis/nodes-stats.md +++ b/_api-reference/nodes-apis/nodes-stats.md @@ -620,8 +620,6 @@ http.total_opened | Integer | The total number of HTTP connections the node has [ingest](#ingest) | Object | Ingest statistics for the node. [adaptive_selection](#adaptive_selection) | Object | Statistics about adaptive selections for the node. [indexing_pressure](#indexing_pressure) | Object | Statistics related to the node's indexing pressure. -[shard_indexing_pressure](#shard_indexing_pressure) | Object | Statistics related to indexing pressure at the shard level. -[search_backpressure]({{site.url}}{{site.baseurl}}/opensearch/search-backpressure#search-backpressure-stats-api) | Object | Statistics related to search backpressure. ### `indices` @@ -962,19 +960,6 @@ memory.current.primary_in_bytes | Integer | The total memory consumed by indexin memory.current.replica_in_bytes | Integer | The total memory consumed by indexing requests in the replica stage, in bytes. memory.current.all_in_bytes | Integer | The total memory consumed by indexing requests in the coordinating, primary, or replica stages. -### `shard_indexing_pressure` - -The `shard_indexing_pressure` object contains the [shard indexing pressure]({{site.url}}{{site.baseurl}}/opensearch/shard-indexing-backpressure) statistics and has the following properties. - -Field | Field type | Description -:--- | :--- | :--- -[stats]({{site.url}}{{site.baseurl}}/opensearch/stats-api/) | Object | Statistics about shard indexing pressure. 
-total_rejections_breakup_shadow_mode | Object | If running in shadow mode, the `total_rejections_breakup_shadow_mode` object contains statistics about the request rejection criteria of all shards in the node. -total_rejections_breakup_shadow_mode.node_limits | Integer | The total number of rejections due to the node memory limit. When all shards reach the memory limit assigned to the node (for example, 10% of heap size), the shard is unable to take in more traffic on the node, and the indexing request is rejected. -total_rejections_breakup_shadow_mode.no_successful_request_limits | Integer | The total number of rejections when the node occupancy level is breaching its soft limit and the shard has multiple outstanding requests that are waiting to be executed. In this case, additional indexing requests are rejected until the system recovers. -total_rejections_breakup_shadow_mode.throughput_degradation_limits | Integer | The total number of rejections when the node occupancy level is breaching its soft limit and there is a constant deterioration in the request turnaround at the shard level. In this case, additional indexing requests are rejected until the system recovers. -enabled | Boolean | Specifies whether the shard indexing pressure feature is turned on for the node. -enforced | Boolean | If true, the shard indexing pressure runs in enforced mode (there are rejections). If false, the shard indexing pressure runs in shadow mode (there are no rejections, but statistics are recorded and can be retrieved in the `total_rejections_breakup_shadow_mode` object). Only applicable if shard indexing pressure is enabled. 
## Required permissions diff --git a/_api-reference/snapshots/restore-snapshot.md b/_api-reference/snapshots/restore-snapshot.md index 1322933e73..2eb91d3a3b 100644 --- a/_api-reference/snapshots/restore-snapshot.md +++ b/_api-reference/snapshots/restore-snapshot.md @@ -10,9 +10,8 @@ nav_order: 9 Restores a snapshot of a cluster or specified data streams and indices. -* For information about indices and clusters, see [Introduction to OpenSearch]({{site.url}}{{site.baseurl}}/opensearch/index). +For information about indices and clusters, see [Introduction to OpenSearch]({{site.url}}{{site.baseurl}}/opensearch/index). -* For information about data streams, see [Data streams]({{site.url}}{{site.baseurl}}/opensearch/data-streams). If open indices with the same name that you want to restore already exist in the cluster, you must close, delete, or rename the indices. See [Sample Request](#example-request) for information about renaming an index. See [Close index]({{site.url}}{{site.baseurl}}/api-reference/index-apis/close-index) for information about closing an index. 
{: .note} diff --git a/_clients/index.md b/_clients/index.md index cb1ad66c99..cc9097dc1e 100644 --- a/_clients/index.md +++ b/_clients/index.md @@ -18,7 +18,6 @@ OpenSearch provides clients for the following programming languages and platform * **Python** * [OpenSearch high-level Python client]({{site.url}}{{site.baseurl}}/clients/python-high-level/) * [OpenSearch low-level Python client]({{site.url}}{{site.baseurl}}/clients/python-low-level/) - * [`opensearch-py-ml` client]({{site.url}}{{site.baseurl}}/clients/opensearch-py-ml/) * **Java** * [OpenSearch Java client]({{site.url}}{{site.baseurl}}/clients/java/) * **JavaScript** diff --git a/_dashboards/get-started/quickstart-dashboards.md b/_dashboards/get-started/quickstart-dashboards.md index 5cae8b05fa..51f46373d2 100644 --- a/_dashboards/get-started/quickstart-dashboards.md +++ b/_dashboards/get-started/quickstart-dashboards.md @@ -3,6 +3,9 @@ layout: default title: Quickstart guide for OpenSearch Dashboards nav_order: 20 has_children: false +redirect_from: + - /dashboards/index/ + - /dashboards/quickstart/ --- # Quickstart guide for OpenSearch Dashboards diff --git a/_dashboards/search-telemetry.md b/_dashboards/search-telemetry.md index 6cf6d73300..f255608e60 100644 --- a/_dashboards/search-telemetry.md +++ b/_dashboards/search-telemetry.md @@ -1,12 +1,7 @@ --- layout: default -<<<<<<< HEAD -title: Search telemetry -nav_order: 30 -======= title: Managing search telemetry settings nav_order: 100 ->>>>>>> 7ba0d6d1 (Updates title (#2762)) --- diff --git a/_install-and-configure/install-dashboards/index.md b/_install-and-configure/install-dashboards/index.md index c9cb4469a5..5abcb1cc4b 100644 --- a/_install-and-configure/install-dashboards/index.md +++ b/_install-and-configure/install-dashboards/index.md @@ -19,7 +19,6 @@ OpenSearch Dashboards has the following installation options: - [Docker]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/docker/) - 
[Tarball]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/tar/) - [RPM]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/rpm/) -- [Debian]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/debian/) - [Helm]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/helm/) - [Windows]({{site.url}}{{site.baseurl}}/install-and-configure/install-dashboards/windows/) diff --git a/_install-and-configure/plugins.md b/_install-and-configure/plugins.md index dabac1a0d6..48bce82ff9 100644 --- a/_install-and-configure/plugins.md +++ b/_install-and-configure/plugins.md @@ -292,7 +292,6 @@ Members of the OpenSearch community have built countless plugins for the service ## Related links - [About Observability]({{site.url}}{{site.baseurl}}/observability-plugin/index/) -- [About security analytics]({{site.url}}{{site.baseurl}}/security-analytics/index/) - [About the security plugin]({{site.url}}{{site.baseurl}}/security/index/) - [Alerting]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/index/) - [Anomaly detection]({{site.url}}{{site.baseurl}}/monitoring-plugins/ad/index/) @@ -301,8 +300,6 @@ Members of the OpenSearch community have built countless plugins for the service - [Index State Management]({{site.url}}{{site.baseurl}}/im-plugin/ism/index/) - [k-NN]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/) - [ML Commons Plugin]({{site.url}}{{site.baseurl}}/ml-commons-plugin/index/) -- [Neural Search]({{site.url}}{{site.baseurl}}/neural-search-plugin/index/) -- [Notifications]({{site.url}}{{site.baseurl}}/notifications-plugin/index/) - [OpenSearch Dashboards]({{site.url}}{{site.baseurl}}/dashboards/index/) - [Performance Analyzer]({{site.url}}{{site.baseurl}}/monitoring-plugins/pa/index/) - [SQL]({{site.url}}{{site.baseurl}}/search-plugins/sql/index/) diff --git a/_install-and-configure/upgrade-opensearch/index.md b/_install-and-configure/upgrade-opensearch/index.md index 
f41aaf12dd..917cd34885 100644 --- a/_install-and-configure/upgrade-opensearch/index.md +++ b/_install-and-configure/upgrade-opensearch/index.md @@ -69,7 +69,7 @@ Choose an appropriate method for upgrading your cluster to a new version of Open - A [rolling upgrade](#rolling-upgrade) upgrades nodes one at a time without stopping the cluster. - A [cluster restart upgrade](#cluster-restart-upgrade) upgrades services while the cluster is stopped. -Upgrades spanning more than a single major version of OpenSearch will require additional effort due to the need for reindexing. For more information, refer to the [Reindex]({{site.url}}{{site.baseurl}}/api-reference/document-apis/reindex/) API. See the [Lucene version reference](#lucene-version-reference) table included later in this guide for help planning your data migration. +Upgrades spanning more than a single major version of OpenSearch will require additional effort due to the need for reindexing. For more information, refer to the [Reindex]({{site.url}}{{site.baseurl}}/api-reference/document-apis/reindex/) API. See the [Index compatibility reference](#index-compatibility-reference) table included later in this guide for help planning your data migration. ### Rolling upgrade diff --git a/_monitoring-your-cluster/pa/index.md b/_monitoring-your-cluster/pa/index.md index 6b5a19c147..f0a7c2e4d4 100644 --- a/_monitoring-your-cluster/pa/index.md +++ b/_monitoring-your-cluster/pa/index.md @@ -22,7 +22,7 @@ npm install -g @aws/opensearch-perftop ![PerfTop screenshot]({{site.url}}{{site.baseurl}}/images/perftop.png) -For enabling Performance Analyzer with tarball installations of OpenSearch, see [Configure Performance Analyzer for Tarball Installation](#configure-performance-analyzer-for-tarball-installation). +For enabling Performance Analyzer with tarball installations of OpenSearch, see [Configure Performance Analyzer for Tarball Installation](#configure-performance-analyzer-for-tarball-installations). 
## Get started with PerfTop diff --git a/_tuning-your-cluster/availability-and-recovery/remote.md b/_tuning-your-cluster/availability-and-recovery/remote.md new file mode 100644 index 0000000000..99f5e787cf --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/remote.md @@ -0,0 +1,234 @@ +--- +layout: default +title: Remote-backed storage +nav_order: 40 +parent: Availability and Recovery +redirect_from: + - /opensearch/remote/ +--- + +# Remote-backed storage + +Remote-backed storage is an experimental feature. Therefore, we do not recommend the use of remote-backed storage in a production environment. For updates on the progress of remote-backed storage, or if you want to leave feedback that could help improve the feature, refer to the issue on [GitHub](https://github.com/opensearch-project/OpenSearch/issues/1968). +{: .warning} + +Remote-backed storage offers OpenSearch users a new way to protect against data loss by automatically creating backups of all index transactions and sending them to remote storage. In order to expose this feature, segment replication must also be enabled. See [Segment replication]({{site.url}}{{site.baseurl}}/opensearch/segment-replication/) for additional information. + +## Translog + +Any index changes, such as indexing or deleting documents, are written to disk during a Lucene commit. However, Lucene commits are expensive operations, so they cannot be performed after every change to the index. Instead, each shard records every indexing operation in a transaction log called the *translog*. When a document is indexed, it is added to the memory buffer and recorded in the translog. Frequent refresh operations write the documents in the memory buffer to a segment and then clear the memory buffer. Periodically, a flush performs a Lucene commit, which includes writing the segments to disk using fsync, purging the old translog, and starting a new translog. Thus, a translog contains all operations that have not yet been flushed.
+ +## Segment replication and remote-backed storage + +When neither segment replication nor remote-backed storage is enabled, OpenSearch uses document replication. In document replication, when a write request lands on the primary shard, the request is indexed to Lucene and stored in the translog. After this, the request is sent to the replicas, where, in turn, it is indexed to Lucene and stored in the translog for durability. + +With segment replication, segments are created on the primary shard only and then copied to all replicas. The replicas do not index requests to Lucene, but they do create and maintain a translog. + +With remote-backed storage, when a write request lands on the primary shard, the request is indexed to Lucene on the primary shard only. The corresponding translog is then uploaded to the remote store. OpenSearch does not send the write request to the replicas, but rather performs a primary term validation to confirm that the request originator shard is still the primary shard. Primary term validation ensures that the acting primary shard fails if it becomes isolated and is unaware of the cluster manager electing a new primary. + +## The `index.translog.durability` translog setting + +Without remote-backed storage, indexing operations are only persisted to disk when the translog is fsynced. Therefore, any data that has not been written to disk can potentially be lost. + +The `index.translog.durability` setting controls how frequently OpenSearch fsyncs the translog to disk: + +- By default, `index.translog.durability` is set to `request`. This means that fsync happens after every request, and all acknowledged write requests persist in case of failure. + +- If you set `index.translog.durability` to `async`, fsync happens periodically at the specified `sync_interval` (5 seconds by default). The fsync operation is asynchronous, so the acknowledgment is sent without waiting for fsync.
Consequently, all acknowledged writes since the last commit are lost in case of failure. + +With remote-backed storage, the translog is uploaded to a remote store for durability. + +`index.translog.durability` is a dynamic setting. To update it, use the following query: + +```json +PUT my_index/_settings +{ + "index" : { + "translog.durability" : "request" + } +} +``` + +## Refresh-level and request-level durability + +The remote store feature supports two levels of durability: + +- Refresh-level durability: Segment files are uploaded to the remote store after every refresh. Set the `remote_store` flag to `true` to achieve refresh-level durability. Commit-level durability is inherent, and uploads are asynchronous. + + If you need to refresh an index manually, you can use the `_refresh` API. For example, to refresh the `my_index` index, use the following request: + + ```json + POST my_index/_refresh + ``` + +- Request-level durability: Translogs are uploaded before acknowledging the request. Set the `translog` flag to `true` to achieve request-level durability. In this scenario, we recommend batching as many requests as possible in a bulk request. Batching requests will improve indexing throughput and latency compared to sending individual write requests. + +## Enable the feature flag + +There are several methods for enabling the remote store feature, depending on the install type. You will also need to enable the `remote_store` property when creating the index. + +Segment replication must also be enabled to use remote-backed storage. +{: .note} + +### Enable on a node using a tarball install + +The flag is toggled using a new JVM parameter that is set either in `OPENSEARCH_JAVA_OPTS` or in `config/jvm.options`.
+ +#### Option 1: Modify jvm.options + +Add the following lines to `config/jvm.options` before starting the OpenSearch process to enable the feature and its dependency: + +``` +-Dopensearch.experimental.feature.replication_type.enabled=true +-Dopensearch.experimental.feature.remote_store.enabled=true +``` + +Run OpenSearch: + +```bash +./bin/opensearch +``` + +#### Option 2: Enable from an environment variable + +As an alternative to directly modifying `config/jvm.options`, you can define the properties by using an environment variable. This can be done in a single command when you start OpenSearch or by defining the variable with `export`. + +To add these flags inline when starting OpenSearch, run the following command: + +```bash +OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.replication_type.enabled=true -Dopensearch.experimental.feature.remote_store.enabled=true" ./opensearch-{{site.opensearch_version}}/bin/opensearch +``` + +If you want to define the environment variable separately, prior to running OpenSearch, run the following commands: + +```bash +export OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.replication_type.enabled=true -Dopensearch.experimental.feature.remote_store.enabled=true" +./bin/opensearch +``` + +### Enable with Docker containers + +If you're running Docker, add the following line to docker-compose.yml underneath the `opensearch-node` and `environment` section: + +````json +OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.replication_type.enabled=true -Dopensearch.experimental.feature.remote_store.enabled=true" +```` + +### Enable for OpenSearch development + +To create new indexes with remote-backed storage enabled, you must first enable these features by adding the correct properties to `run.gradle` before building OpenSearch. See the [developer guide](https://github.com/opensearch-project/OpenSearch/blob/main/DEVELOPER_GUIDE.md) for information about how to use Gradle to build OpenSearch.
+ +Add the following properties to `run.gradle` to enable the feature: + +```bash +testClusters { + runTask { + testDistribution = 'archive' + if (numZones > 1) numberOfZones = numZones + if (numNodes > 1) numberOfNodes = numNodes + systemProperty 'opensearch.experimental.feature.replication_type.enabled', 'true' + systemProperty 'opensearch.experimental.feature.remote_store.enabled', 'true' + } +} +``` + +## Register a remote repository + +Now that your deployment is running with the feature flags enabled, the next step is to register a remote repository where backups will be stored. See [Register repository]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#register-repository) for more information. + +## Create an index + +Remote-backed storage is enabled for an index when it is created. This feature cannot be enabled for indexes that already exist. + +For refresh-level durability, include the `remote_store` property to enable the feature and specify a segment repository: + +```bash +curl -X PUT "https://localhost:9200/my-index?pretty" -ku admin:admin -H 'Content-Type: application/json' -d' +{ + "settings": { + "index": { + "number_of_shards": 1, + "number_of_replicas": 0, + "replication": { + "type": "SEGMENT" + }, + "remote_store": { + "enabled": true, + "repository": "segment-repo" + } + } + } +} +' +``` + +For request-level durability, in addition to the `remote_store` and segment repository, include the `translog` property and specify a translog repository: + +```bash +curl -X PUT "https://localhost:9200/my-index?pretty" -ku admin:admin -H 'Content-Type: application/json' -d' +{ + "settings": { + "index": { + "number_of_shards": 1, + "number_of_replicas": 1, + "replication": { + "type": "SEGMENT" + }, + "remote_store": { + "enabled": true, + "repository": "segment-repo", + "translog": { + "enabled": true, + "repository": "translog-repo", + "buffer_interval": "300ms" + } + } + } + } +} +' +``` + +You can have the same repository serve as 
both the segment repository and translog repository. +{: .note} + +As data is added to the index, it also will be continuously uploaded to remote storage in the form of segment and translog files because of refreshes, flushes, and translog fsyncs to disk. Along with data, other metadata files will be uploaded. +The `buffer_interval` setting specifies the time interval during which translog operations are buffered. Instead of uploading individual translog files, OpenSearch creates a single translog file with all the write operations received during the configured interval. Bundling translog files leads to higher throughput but also increases latency. The default `buffer_interval` value is 100 ms. + +Setting `translog.enabled` to `true` is currently an irreversible operation. +{: .warning} + +### Restoring from a backup + +To restore an index from a remote backup, such as in the event of a node failure, you must first close the index: + +```bash +curl -X POST "https://localhost:9200/my-index/_close" -ku admin:admin +``` + +Restore the index from the backup stored on the remote repository: + +```bash +curl -X POST "https://localhost:9200/_remotestore/_restore" -ku admin:admin -H 'Content-Type: application/json' -d' +{ + "indices": ["my-index"] +} +' +``` + +If the security plugin is enabled, a user must have the `cluster:admin/remotestore/restore` permission. See [Access control](/security-plugin/access-control/index/) for information about configuring user permissions. +{: .note} + +## Potential use cases + +You can use remote-backed storage for the following purposes: + +- To restore red clusters or indexes +- To recover all data up to the last acknowledged write, regardless of replica count, if `index.translog.durability` is set to `request` + +## Known limitations + +The following are known limitations of the remote-backed storage feature: + +- Writing data to a remote store can be a high-latency operation when compared to writing data on the local file system. 
This may impact the indexing throughput and latency. For performance benchmarking results, see [issue #6376](https://github.com/opensearch-project/OpenSearch/issues/6376). + diff --git a/_tuning-your-cluster/availability-and-recovery/segment-replication/configuration.md b/_tuning-your-cluster/availability-and-recovery/segment-replication/configuration.md deleted file mode 100644 index b336df6985..0000000000 --- a/_tuning-your-cluster/availability-and-recovery/segment-replication/configuration.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -layout: default -title: Segment replication configuration -nav_order: 12 -parent: Segment replication -grand_parent: Availability and Recovery ---- - -# Segment replication configuration - -Segment replication is an experimental feature. Therefore, we do not recommend the use of segment replication in a production environment. For updates on the progress of segment replication or if you want to leave feedback that could help improve the feature, see the [Segment replication issue](https://github.com/opensearch-project/OpenSearch/issues/2194). -{: .warning } - -To enable the segment replication type, reference the steps below. - -## Enabling the feature flag - -There are several methods for enabling segment replication, depending on the install type. You will also need to set the replication strategy to `SEGMENT` when creating the index. - -### Enable on a node using a tarball install - -The flag is toggled using a new jvm parameter that is set either in `OPENSEARCH_JAVA_OPTS` or in config/jvm.options. - -1. Option 1: Update config/jvm.options by adding the following line: - - ````json - -Dopensearch.experimental.feature.replication_type.enabled=true - ```` - -1. Option 2: Use the `OPENSEARCH_JAVA_OPTS` environment variable: - - ````json - export OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.replication_type.enabled=true" - ```` -1. 
Option 3: For developers using Gradle, update run.gradle by adding the following lines: - - ````json - testClusters { - runTask { - testDistribution = 'archive' - if (numZones > 1) numberOfZones = numZones - if (numNodes > 1) numberOfNodes = numNodes - systemProperty 'opensearch.experimental.feature.replication_type.enabled', 'true' - } - } - ```` - -### Enable with Docker containers - -If you're running Docker, add the following line to docker-compose.yml underneath the `opensearch-node` and `environment` section: - -````json -OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.replication_type.enabled=true" # Enables segment replication -```` - -### Setting the replication strategy on the index - -To set the replication strategy to segment replication, create an index with replication.type set to `SEGMENT`: - -````json -PUT /my-index1 -{ - "settings": { - "index": { - "replication.type": "SEGMENT" - } - } -} -```` - -## Known limitations - -1. Enabling segment replication for an existing index requires [reindexing](https://github.com/opensearch-project/OpenSearch/issues/3685). -1. Rolling upgrades are currently not supported. Full cluster restarts are required when upgrading indexes using segment replication. [Issue 3881](https://github.com/opensearch-project/OpenSearch/issues/3881). -1. [Cross-cluster replication](https://github.com/opensearch-project/OpenSearch/issues/4090) does not currently use segment replication to copy between clusters. -1. Increased network congestion on primary shards. [Issue - Optimize network bandwidth on primary shards](https://github.com/opensearch-project/OpenSearch/issues/4245). -1. Shard allocation algorithms have not been updated to evenly spread primary shards across nodes. -1. Integration with remote-backed storage as the source of replication is [currently unsupported](https://github.com/opensearch-project/OpenSearch/issues/4448). - -### Further resources regarding segment replication - -1. 
[Known issues](https://github.com/opensearch-project/OpenSearch/issues/2194). -1. Steps for testing (link coming soon). -1. Segment replication blog post (link coming soon). \ No newline at end of file diff --git a/_tuning-your-cluster/availability-and-recovery/segment-replication/index.md b/_tuning-your-cluster/availability-and-recovery/segment-replication/index.md deleted file mode 100644 index b7641f8192..0000000000 --- a/_tuning-your-cluster/availability-and-recovery/segment-replication/index.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -layout: default -title: Segment replication -nav_order: 70 -has_children: true -parent: Availability and Recovery -redirect_from: - - /opensearch/segment-replication/ - - /opensearch/segment-replication/index/ ---- - -# Segment replication - -Segment replication is an experimental feature with OpenSearch 2.3. Therefore, we do not recommend the use of segment replication in a production environment. For updates on the progress of segment replication or if you want leave feedback that could help improve the feature, see the [Segment replication git issue](https://github.com/opensearch-project/OpenSearch/issues/2194). -{: .warning} - -With segment replication, segment files are copied across shards instead of documents being indexed on each shard copy. This improves indexing throughput and lowers resource utilization at the expense of increased network utilization. - -As an experimental feature, segment replication will be behind a feature flag and must be enabled on **each node** of a cluster and pass a new setting during index creation. -{: .note } - -### Potential use cases - -- Users who have high write loads but do not have high search requirements and are comfortable with longer refresh times. -- Users with very high loads who want to add new nodes, as you do not need to index all nodes when adding a new node to the cluster. 
- -This is the first step in a series of features designed to decouple reads and writes in order to lower compute costs. \ No newline at end of file diff --git a/_tuning-your-cluster/availability-and-recovery/shard-indexing-backpressure.md b/_tuning-your-cluster/availability-and-recovery/shard-indexing-backpressure.md deleted file mode 100644 index cde2f125cb..0000000000 --- a/_tuning-your-cluster/availability-and-recovery/shard-indexing-backpressure.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -layout: default -title: Shard indexing backpressure -nav_order: 62 -has_children: true -parent: Availability and Recovery -redirect_from: - - /opensearch/shard-indexing-backpressure/ ---- - -# Shard indexing backpressure - -Shard indexing backpressure is a smart rejection mechanism at a per-shard level that dynamically rejects indexing requests when your cluster is under strain. It propagates a backpressure that transfers requests from an overwhelmed node or shard to other nodes or shards that are still healthy. - -With shard indexing backpressure, you can prevent nodes in your cluster from running into cascading failures due to performance degradation caused by slow nodes, stuck tasks, resource-intensive requests, traffic surges, skewed shard allocations, and so on. - -Shard indexing backpressure comes into effect only when one primary and one secondary parameter is breached. - -## Primary parameters - -Primary parameters are early indicators that a cluster is under strain: - -- Shard memory limit breach: If the memory usage of a shard exceeds 95% of its allocated memory, this limit is breached. -- Node memory limit breach: If the memory usage of a node exceeds 70% of its allocated memory, this limit is breached. - -The breach of primary parameters doesn’t cause any actual request rejections, it just triggers an evaluation of the secondary parameters. 
- -## Secondary parameters - -Secondary parameters check the performance at the shard level to confirm that the cluster is under strain: - -- Throughput: If the throughput at the shard level decreases significantly in its historic view, this limit is breached. -- Successful Request: If the number of pending requests increases significantly in its historic view, this limit is breached. diff --git a/_tuning-your-cluster/availability-and-recovery/shard-indexing-settings.md b/_tuning-your-cluster/availability-and-recovery/shard-indexing-settings.md deleted file mode 100644 index 88b0ea70b4..0000000000 --- a/_tuning-your-cluster/availability-and-recovery/shard-indexing-settings.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -layout: default -title: Settings -parent: Shard indexing backpressure -nav_order: 50 -grand_parent: Availability and Recovery -redirect_from: - - /opensearch/shard-indexing-settings/ ---- - -# Settings - -Shard indexing backpressure adds several settings to the standard OpenSearch cluster settings. They are dynamic, so you can change the default behavior of this feature without restarting your cluster. - -## High-level controls - -The high-level controls allow you to turn the shard indexing backpressure feature on or off. - -Setting | Default | Description -:--- | :--- | :--- -`shard_indexing_pressure.enabled` | False | Change to `true` to enable shard indexing backpressure. -`shard_indexing_pressure.enforced` | False | Run shard indexing backpressure in shadow mode or enforced mode. In shadow mode (value set as `false`), shard indexing backpressure tracks all granular-level metrics, but it doesn't actually reject any indexing requests. In enforced mode (value set as `true`), shard indexing backpressure rejects any requests to the cluster that might cause a dip in its performance. - -## Node-level limits - -Node-level limits allow you to control memory usage on a node. 
- -Setting | Default | Description -:--- | :--- | :--- -`shard_indexing_pressure.primary_parameter.node.soft_limit` | 70% | Define the percentage of the node-level memory threshold that acts as a soft indicator for strain on a node. - -## Shard-level limits - -Shard-level limits allow you to control memory usage on a shard. - -Setting | Default | Description -:--- | :--- | :--- -`shard_indexing_pressure.primary_parameter.shard.min_limit` | 0.001d | Specify the minimum assigned quota for a new shard in any role (coordinator, primary, or replica). Shard indexing backpressure increases or decreases this allocated quota based on the inflow of traffic for the shard. -`shard_indexing_pressure.operating_factor.lower` | 75% | Specify the lower occupancy limit of the allocated quota of memory for the shard. If the total memory usage of a shard is below this limit, shard indexing backpressure decreases the current allocated memory for that shard. -`shard_indexing_pressure.operating_factor.optimal` | 85% | Specify the optimal occupancy of the allocated quota of memory for the shard. If the total memory usage of a shard is at this level, shard indexing backpressure doesn't change the current allocated memory for that shard. -`shard_indexing_pressure.operating_factor.upper` | 95% | Specify the upper occupancy limit of the allocated quota of memory for the shard. If the total memory usage of a shard is above this limit, shard indexing backpressure increases the current allocated memory for that shard. - -## Performance degradation factors - -The performance degradation factors allow you to control the dynamic performance thresholds for a shard. - -Setting | Default | Description -:--- | :--- | :--- -`shard_indexing_pressure.secondary_parameter.throughput.request_size_window` | 2,000 | The number of requests in the sampling window size on a shard. 
Shard indexing backpressure compares the overall performance of requests with the requests in the sample window to detect any performance degradation. -`shard_indexing_pressure.secondary_parameter.throughput.degradation_factor` | 5x | The degradation factor per unit byte for a request. This parameter determines the threshold for any latency spikes. The default value is 5x, which implies that if the latency shoots up 5 times in the historic view, shard indexing backpressure marks it as a performance degradation. -`shard_indexing_pressure.secondary_parameter.successful_request.elapsed_timeout` | 300000 ms | The amount of time a request is pending in a cluster. This parameter helps identify any stuck-request scenarios. -`shard_indexing_pressure.secondary_parameter.successful_request.max_outstanding_requests` | 100 | The maximum number of pending requests in a cluster. diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/index.md b/_tuning-your-cluster/availability-and-recovery/snapshots/index.md new file mode 100644 index 0000000000..3fde2804b7 --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/index.md @@ -0,0 +1,30 @@ +--- +layout: default +title: Snapshots +nav_order: 5 +has_children: true +parent: Availability and Recovery +redirect_from: + - /opensearch/snapshots/ + - /opensearch/snapshots/index/ +has_toc: false +--- + +# Snapshots + +Snapshots are backups of a cluster's indexes and state. State includes cluster settings, node information, index metadata (mappings, settings, templates, etc.), and shard allocation. + +Snapshots have two main uses: + +- **Recovering from failure** + + For example, if cluster health goes red, you might restore the red indexes from a snapshot. + +- **Migrating from one cluster to another** + + For example, if you're moving from a proof-of-concept to a production cluster, you might take a snapshot of the former and restore it on the latter. 
+ + +You can take and restore snapshots using the [snapshot API]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore). + +If you need to automate taking snapshots, you can use the [snapshot management]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-management) feature. diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md new file mode 100644 index 0000000000..f7ef3c981d --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/searchable_snapshot.md @@ -0,0 +1,125 @@ +--- +layout: default +title: Searchable snapshots +parent: Snapshots +nav_order: 40 +grand_parent: Availability and Recovery +redirect_from: + - /opensearch/snapshots/searchable_snapshot/ +--- + +# Searchable snapshots + +Searchable snapshots is an experimental feature released in OpenSearch 2.4. Therefore, we do not recommend the use of this feature in a production environment. For updates on progress, follow us on [GitHub](https://github.com/opensearch-project/OpenSearch/issues/3739). If you have any feedback please [submit a new issue](https://github.com/opensearch-project/OpenSearch/issues/new/choose). +{: .warning } + +A searchable snapshot is an index where data is read from a [snapshot repository]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore/#register-repository) on demand at search time rather than all index data being downloaded to cluster storage at restore time. Because the index data remains in the snapshot format in the repository, searchable snapshot indexes are inherently read-only. Any attempt to write to a searchable snapshot index will result in an error. + +To enable the searchable snapshots feature, reference the following steps. + +## Enabling the feature flag + +There are several methods for enabling searchable snapshots, depending on the installation type. 
+ +### Enable on a node using a tarball installation + +The flag is toggled using a new jvm parameter that is set either in `OPENSEARCH_JAVA_OPTS` or in config/jvm.options: + +- Option 1: Update config/jvm.options by adding the following line: + + ```json + -Dopensearch.experimental.feature.searchable_snapshot.enabled=true + ``` + +- Option 2: Use the `OPENSEARCH_JAVA_OPTS` environment variable: + + ```json + export OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.searchable_snapshot.enabled=true" + ``` +- Option 3: For developers using Gradle, update run.gradle by adding the following lines: + + ```json + testClusters { + runTask { + testDistribution = 'archive' + if (numZones > 1) numberOfZones = numZones + if (numNodes > 1) numberOfNodes = numNodes + systemProperty 'opensearch.experimental.feature.searchable_snapshot.enabled', 'true' + } + } + ``` + +- Finally, create a node in your opensearch.yml file and define the node role as `search`: + + ```bash + node.name: snapshots-node + node.roles: [ search ] + ``` + +### Enable with Docker containers + +If you're running Docker, add the following line to docker-compose.yml underneath the `opensearch-node` and `environment` sections: + +```json +OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.searchable_snapshot.enabled=true" # Enables searchable snapshot +``` + +To create a node with the `search` node role, add the line `- node.roles: [ search ]` to your docker-compose.yml file: + +```bash +version: '3' +services: + opensearch-node1: + image: opensearchproject/opensearch:2.4.0 + container_name: opensearch-node1 + environment: + - cluster.name=opensearch-cluster + - node.name=opensearch-node1 + - node.roles: [ search ] +``` + +## Create a searchable snapshot index + +A searchable snapshot index is created by specifying the `remote_snapshot` storage type using the [restore snapshots API]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore/#restore-snapshots). 
+ +Request Field | Description +:--- | :--- +`storage_type` | `local` indicates that all snapshot metadata and index data will be downloaded to local storage.

`remote_snapshot` indicates that snapshot metadata will be downloaded to the cluster, but the remote repository will remain the authoritative store of the index data. Data will be downloaded and cached as necessary to service queries. At least one node in the cluster must be configured with the `search` node role in order to restore a snapshot using the `remote_snapshot` type.

Defaults to `local`. + +## Listing indexes + +To determine whether an index is a searchable snapshot index, look for a store type with the value of `remote_snapshot`: + +``` +GET /my-index/_settings?pretty +``` + +```json +{ + "my-index": { + "settings": { + "index": { + "store": { + "type": "remote_snapshot" + } + } + } + } +} +``` + +## Potential use cases + +The following are potential use cases for the searchable snapshots feature: + +- The ability to offload indexes from cluster-based storage but retain the ability to search them. +- The ability to have a large number of searchable indexes in lower-cost media. + +## Known limitations + +The following are known limitations of the searchable snapshots feature: + +- Accessing data from a remote repository is slower than local disk reads, so higher latencies on search queries are expected. +- Data is discarded immediately after being read. Subsequent searches for the same data will have to be downloaded again. This will be addressed in the future by implementing a disk-based cache for storing frequently accessed data. +- Many remote object stores charge on a per-request basis for retrieval, so users should closely monitor any costs incurred. +- Searching remote data can impact the performance of other queries running on the same node. We recommend that users provision dedicated nodes with the `search` role for performance-critical applications. 
\ No newline at end of file diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/sm-api.md b/_tuning-your-cluster/availability-and-recovery/snapshots/sm-api.md new file mode 100644 index 0000000000..c664b39ad6 --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/sm-api.md @@ -0,0 +1,463 @@ +--- +layout: default +title: Snapshot management API +parent: Snapshots +nav_order: 30 +has_children: false +grand_parent: Availability and Recovery +redirect_from: + - /opensearch/snapshots/sm-api/ +--- + +# Snapshot Management API + +Use the snapshot management (SM) API to automate [taking snapshots]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#take-snapshots). + +--- + +#### Table of contents +- TOC +{:toc} + + +--- + +## Create or update a policy +Introduced 2.1 +{: .label .label-purple } + +Creates or updates an SM policy. + +#### Request + +Create: + +```json +POST _plugins/_sm/policies/<policy_name> +``` + +Update: + +```json +PUT _plugins/_sm/policies/<policy_name>?if_seq_no=0&if_primary_term=1 +``` + +You must provide the `if_seq_no` and `if_primary_term` parameters for an update request.
+ +### Example + +```json +POST _plugins/_sm/policies/daily-policy +{ + "description": "Daily snapshot policy", + "creation": { + "schedule": { + "cron": { + "expression": "0 8 * * *", + "timezone": "UTC" + } + }, + "time_limit": "1h" + }, + "deletion": { + "schedule": { + "cron": { + "expression": "0 1 * * *", + "timezone": "America/Los_Angeles" + } + }, + "condition": { + "max_age": "7d", + "max_count": 21, + "min_count": 7 + }, + "time_limit": "1h" + }, + "snapshot_config": { + "date_format": "yyyy-MM-dd-HH:mm", + "timezone": "America/Los_Angeles", + "indices": "*", + "repository": "s3-repo", + "ignore_unavailable": "true", + "include_global_state": "false", + "partial": "true", + "metadata": { + "any_key": "any_value" + } + }, + "notification": { + "channel": { + "id": "NC3OpoEBzEoHMX183R3f" + }, + "conditions": { + "creation": true, + "deletion": false, + "failure": false, + "time_limit_exceeded": false + } + } +} +``` + +### Response + +```json +{ + "_id" : "daily-policy-sm-policy", + "_version" : 5, + "_seq_no" : 54983, + "_primary_term" : 21, + "sm_policy" : { + "name" : "daily-policy", + "description" : "Daily snapshot policy", + "schema_version" : 15, + "creation" : { + "schedule" : { + "cron" : { + "expression" : "0 8 * * *", + "timezone" : "UTC" + } + }, + "time_limit" : "1h" + }, + "deletion" : { + "schedule" : { + "cron" : { + "expression" : "0 1 * * *", + "timezone" : "America/Los_Angeles" + } + }, + "condition" : { + "max_age" : "7d", + "min_count" : 7, + "max_count" : 21 + }, + "time_limit" : "1h" + }, + "snapshot_config" : { + "indices" : "*", + "metadata" : { + "any_key" : "any_value" + }, + "ignore_unavailable" : "true", + "timezone" : "America/Los_Angeles", + "include_global_state" : "false", + "date_format" : "yyyy-MM-dd-HH:mm", + "repository" : "s3-repo", + "partial" : "true" + }, + "schedule" : { + "interval" : { + "start_time" : 1656425122909, + "period" : 1, + "unit" : "Minutes" + } + }, + "enabled" : true, + "last_updated_time" : 
1656425122909, + "enabled_time" : 1656425122909, + "notification" : { + "channel" : { + "id" : "NC3OpoEBzEoHMX183R3f" + }, + "conditions" : { + "creation" : true, + "deletion" : false, + "failure" : false, + "time_limit_exceeded" : false + } + } + } +} +``` + +### Parameters + +You can specify the following parameters to create/update an SM policy. + +Parameter | Type | Description +:--- | :--- |:--- +`description` | String | The description of the SM policy. Optional. +`enabled` | Boolean | Should this SM policy be enabled at creation? Optional. +`snapshot_config` | Object | The configuration options for snapshot creation. Required. +`snapshot_config.date_format` | String | Snapshot names have the format `<policy_name>-<date>-<random string>`. `date_format` specifies the format for the date in the snapshot name. Supports all date formats supported by OpenSearch. Optional. Default is "yyyy-MM-dd'T'HH:mm:ss". +`snapshot_config.date_format_timezone` | String | Snapshot names have the format `<policy_name>-<date>-<random string>`. `date_format_timezone` specifies the time zone for the date in the snapshot name. Optional. Default is UTC. +`snapshot_config.indices` | String | The names of the indexes in the snapshot. Multiple index names are separated by `,`. Supports wildcards (`*`). Optional. Default is `*` (all indexes). +`snapshot_config.repository` | String | The repository in which to store snapshots. Required. +`snapshot_config.ignore_unavailable` | Boolean | Do you want to ignore unavailable indexes? Optional. Default is `false`. +`snapshot_config.include_global_state` | Boolean | Do you want to include cluster state? Optional. Default is `true` because of [Security plugin considerations]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore#security-considerations). +`snapshot_config.partial` | Boolean | Do you want to allow partial snapshots? Optional. Default is `false`. +`snapshot_config.metadata` | Object | Metadata in the form of key/value pairs. Optional.
+`creation` | Object | Configuration for snapshot creation. Required. +`creation.schedule` | String | The cron schedule used to create snapshots. Required. +`creation.time_limit` | String | Sets the maximum time to wait for snapshot creation to finish. If time_limit is longer than the scheduled time interval for taking snapshots, no scheduled snapshots are taken until time_limit elapses. For example, if time_limit is set to 35 minutes and snapshots are taken every 30 minutes starting at midnight, the snapshots at 00:00 and 01:00 are taken, but the snapshot at 00:30 is skipped. Optional. +`deletion` | Object | Configuration for snapshot deletion. Optional. Default is to retain all snapshots. +`deletion.schedule` | String | The cron schedule used to delete snapshots. Optional. Default is to use `creation.schedule`, which is required. +`deletion.time_limit` | String | Sets the maximum time to wait for snapshot deletion to finish. Optional. +`deletion.condition` | Object | Conditions for snapshot deletion. Optional. +`deletion.condition.max_count` | Integer | The maximum number of snapshots to be retained. Optional. +`deletion.condition.max_age` | String | The maximum time a snapshot is retained. Optional. +`deletion.condition.min_count` | Integer | The minimum number of snapshots to be retained. Optional. Default is one. +`notification` | Object | Defines notifications for SM events. Optional. +`notification.channel` | Object | Defines a channel for notifications. Required. +`notification.channel.id` | String | The channel ID of the channel used for notifications. To get the channel IDs of all created channels, use `GET _plugins/_notifications/configs`. Required. +`notification.conditions` | Object | SM events you want to be notified about. Set the ones you are interested in to `true`. +`notification.conditions.creation` | Boolean | Do you want notifications about snapshot creation? Optional. Default is `true`.
+`notification.conditions.deletion` | Boolean | Do you want notifications about snapshot deletion? Optional. Default is `false`. +`notification.conditions.failure` | Boolean | Do you want notifications about creation or deletion failure? Optional. Default is `false`. +`notification.conditions.time_limit_exceeded` | Boolean | Do you want notifications when snapshot operations take longer than time_limit? Optional. Default is `false`. + +## Get policies +Introduced 2.1 +{: .label .label-purple } + +Gets SM policies. + +#### Request + +Get all SM policies: + +```json +GET _plugins/_sm/policies +``` +You can use a [query string]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/full-text/index#query-string) and specify pagination, the field to be sorted by, and sort order: + +```json +GET _plugins/_sm/policies?from=0&size=20&sortField=sm_policy.name&sortOrder=desc&queryString=* +``` + +Get a specific SM policy: + +``` +GET _plugins/_sm/policies/ +``` + +### Example + +```json +GET _plugins/_sm/policies/daily-policy +``` + +### Response + +```json +{ + "_id" : "daily-policy-sm-policy", + "_version" : 6, + "_seq_no" : 44696, + "_primary_term" : 19, + "sm_policy" : { + "name" : "daily-policy", + "description" : "Daily snapshot policy", + "schema_version" : 15, + "creation" : { + "schedule" : { + "cron" : { + "expression" : "0 8 * * *", + "timezone" : "UTC" + } + }, + "time_limit" : "1h" + }, + "deletion" : { + "schedule" : { + "cron" : { + "expression" : "0 1 * * *", + "timezone" : "America/Los_Angeles" + } + }, + "condition" : { + "max_age" : "7d", + "min_count" : 7, + "max_count" : 21 + }, + "time_limit" : "1h" + }, + "snapshot_config" : { + "metadata" : { + "any_key" : "any_value" + }, + "ignore_unavailable" : "true", + "include_global_state" : "false", + "date_format" : "yyyy-MM-dd-HH:mm", + "repository" : "s3-repo", + "partial" : "true" + }, + "schedule" : { + "interval" : { + "start_time" : 1656341042874, + "period" : 1, + "unit" : "Minutes" + } + }, + "enabled" : 
true, + "last_updated_time" : 1656341042874, + "enabled_time" : 1656341042874 + } +} +``` + +## Explain +Introduced 2.1 +{: .label .label-purple } + +Provides the enabled/disabled status and the metadata for all policies specified. Multiple policy names are separated with `,`. You can also specify desired policies with a wildcard pattern. + +SM State Machine + +SM uses a state machine for snapshot creation and deletion. The image on the left shows one execution period of the creation workflow, from the CREATION_START state to the CREATION_FINISHED state. Deletion workflow follows the same pattern as creation workflow. + +The creation workflow starts in the CREATION_START state and continuously checks if the conditions in the creation cron schedule are met. After the conditions are met, the creation workflow switches to the CREATION_CONDITION_MET state and continues to the CREATING state. The CREATING state calls the create snapshot API asynchronously and then waits for snapshot creation to end in the CREATION_FINISHED state. Once snapshot creation ends, the creation workflow goes back to the CREATION_START state, and the cycle continues. The `current_state` field of `metadata.creation` and `metadata.deletion` returns the current state of the state machine. + +#### Request + +```json +GET _plugins/_sm/policies//_explain +``` + +### Example + +```json +GET _plugins/_sm/policies/daily*/_explain +``` + +### Response + +```json +{ + "policies" : [ + { + "name" : "daily-policy", + "creation" : { + "current_state" : "CREATION_START", + "trigger" : { + "time" : 1656403200000 + } + }, + "deletion" : { + "current_state" : "DELETION_START", + "trigger" : { + "time" : 1656403200000 + } + }, + "policy_seq_no" : 44696, + "policy_primary_term" : 19, + "enabled" : true + } + ] +} +``` + +The following table lists all fields for each policy in the response. + +Field | Description +:--- |:--- +`name` | The name of the SM policy. 
+`creation` | Information about the latest creation operation. See subfields below. +`deletion` | Information about the latest deletion operation. See subfields below. +`policy_seq_no`
`policy_primary_term` | The version of the SM policy. +`enabled` | Is the policy running? + +The following table lists all fields in the `creation` and `deletion` objects of each policy. + +Field | Description +:--- |:--- +`current_state` | The current state of the state machine that runs snapshot creation/deletion as described above. +`trigger.time` | The next creation/deletion execution time in milliseconds since the epoch. +`latest_execution` | Describes the latest creation/deletion execution. +`latest_execution.status` | The execution status of the latest creation/deletion. Possible values are:
`IN_PROGRESS`: Snapshot creation/deletion has started.
`SUCCESS`: Snapshot creation/deletion has finished successfully.
`RETRYING`: The creation/deletion attempt has failed. It will be retried three times.
`FAILED`: The creation/deletion attempt failed after three retries. End the current execution period and go to the next execution period.
`TIME_LIMIT_EXCEEDED`: The creation/deletion time exceeded the time_limit set in the policy. End the current execution period and go to the next execution period. +`latest_execution.start_time` | The start time of the latest execution in milliseconds since the epoch. +`latest_execution.end_time` | The end time of the latest execution in milliseconds since the epoch. +`latest_execution.info.message` | A user-friendly message describing the status of the latest execution. +`latest_execution.info.cause` | Contains the failure reason if the latest execution fails. +`retry.count` | The number of remaining execution retry attempts. + + +## Start a policy +Introduced 2.1 +{: .label .label-purple } + +Starts the policy by setting its `enabled` flag to `true`. + +#### Request + +```json +POST _plugins/_sm/policies/<policy_name>/_start +``` + +### Example + +```json +POST _plugins/_sm/policies/daily-policy/_start +``` + +### Response + +```json +{ + "acknowledged" : true +} +``` + +## Stop a policy +Introduced 2.1 +{: .label .label-purple } + +Sets the `enabled` flag to `false` for an SM policy. The policy will not run until you [start](#start-a-policy) it. + +#### Request + +```json +POST _plugins/_sm/policies/<policy_name>/_stop +``` + +### Example + +```json +POST _plugins/_sm/policies/daily-policy/_stop +``` + +### Response + +```json +{ + "acknowledged" : true +} +``` + +## Delete a policy +Introduced 2.1 +{: .label .label-purple } + +Deletes the specified SM policy.
+ +#### Request + +```json +DELETE _plugins/_sm/policies/ +``` + +### Example + +```json +DELETE _plugins/_sm/policies/daily-policy +``` + +### Response + +```json +{ + "_index" : ".opendistro-ism-config", + "_id" : "daily-policy-sm-policy", + "_version" : 8, + "result" : "deleted", + "forced_refresh" : true, + "_shards" : { + "total" : 2, + "successful" : 2, + "failed" : 0 + }, + "_seq_no" : 45366, + "_primary_term" : 20 +} +``` \ No newline at end of file diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-management.md b/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-management.md new file mode 100644 index 0000000000..9a25b28683 --- /dev/null +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-management.md @@ -0,0 +1,81 @@ +--- +layout: default +title: Snapshot management +parent: Snapshots +nav_order: 20 +has_children: false +grand_parent: Availability and Recovery +redirect_from: + - /opensearch/snapshots/snapshot-management/ +--- + +# Snapshot management + +Snapshot management (SM) lets you automate [taking snapshots]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#take-snapshots). To use this feature, you need to install the [Index Management (IM) Plugin]({{site.url}}{{site.baseurl}}/im-plugin). Snapshots store only incremental changes since the last snapshot. Thus, while taking an initial snapshot may be a heavy operation, subsequent snapshots have minimal overhead. To set up automatic snapshots, you have to create an SM policy with a desired SM schedule and configuration. + +When you create an SM policy, its document ID is given the name `-sm-policy`. Because of this, SM policies have to obey the following rules: + +- SM policies must have unique names. + +- You cannot update the policy name after its creation. + +SM-created snapshots have names in the format `--`. 
Two snapshots created by different policies at the same time always have different names because of the `<policy_name>` prefix. To avoid name collisions within the same policy, each snapshot's name contains a random string suffix. + +Each policy has associated metadata that stores the policy status. Snapshot management saves SM policies and metadata in the system index and reads them from the system index. Thus, Snapshot Management depends on the OpenSearch cluster's indexing and searching functions. The policy's metadata keeps information about the latest creation and deletion only. The metadata is read before running every scheduled job so that SM can continue execution from the previous job's state. You can view the metadata using the [explain API]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api#explain). + +An SM schedule is a custom [cron]({{site.url}}{{site.baseurl}}/monitoring-plugins/alerting/cron) expression. It consists of two parts: a creation schedule and a deletion schedule. You must set up a creation schedule that specifies the frequency and timing of snapshot creation. Optionally, you can set up a separate schedule for deleting snapshots. + +An SM configuration includes the indexes and repository for the snapshots and supports all parameters you can define when [creating a snapshot]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#take-snapshots) using the API. Additionally, you can specify the format and time zone for the date used in the snapshot's name. + + +## Performance + +One snapshot can contain as many indexes as there are in the cluster. We expect at most dozens of SM policies in one cluster, but a snapshot repository can safely scale to thousands of snapshots. However, to manage its metadata, a large repository requires more memory on the cluster manager node. + +Snapshot Management depends on the Job Scheduler plugin to schedule a job that is run periodically. Each SM policy corresponds to one SM-scheduled job.
The scheduled job is lightweight, so the burden of SM depends on the snapshot creation frequency and the burden of running the snapshot operation itself. + +## Concurrency + +An SM policy does not support concurrent snapshot operations, since too many such operations may degrade the cluster. Snapshot operations (creation or deletion) are performed asynchronously. SM does not start a new operation until the previous asynchronous operation finishes. + +We don't recommend creating several SM policies with the same schedule and overlapping indexes in one cluster because it leads to concurrent snapshot creation on the same indexes and hinders performance. +{: .warning } + + +We don't recommend setting up the same repository for multiple SM policies with the same schedule in different clusters, since doing so may cause a sudden spike in load on that repository. +{: .warning } + +## Failure management + +If a snapshot operation fails, it is retried a maximum of three times. The failure message is saved in `metadata.latest_execution` and is overwritten when a subsequent snapshot operation starts. You can view the failure message using the [explain API]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api#explain). When using OpenSearch Dashboards, you can view the failure message on the [policy details page]({{site.url}}{{site.baseurl}}/dashboards/admin-ui-index/sm-dashboards#view-edit-or-delete-an-sm-policy). Possible reasons for failure include red index status and shard reallocation. + +## Security + +The Security plugin has two built-in roles for Snapshot Management actions: `snapshot_management_full_access` and `snapshot_management_read_access`. For descriptions of each, see [Predefined roles]({{site.url}}{{site.baseurl}}/security/access-control/users-roles#predefined-roles). + +The following table lists the required permissions for each Snapshot Management API. + +Function | API | Permission +:--- | :--- | :--- +Get policy | GET _plugins/_sm/policies
GET _plugins/_sm/policies/`policy_name` | cluster:admin/opensearch/snapshot_management/policy/get
cluster:admin/opensearch/snapshot_management/policy/search +Create/update policy | POST _plugins/_sm/policies/`policy_name`
PUT _plugins/_sm/policies/`policy_name`?if_seq_no=1&if_primary_term=1 | cluster:admin/opensearch/snapshot_management/policy/write +Delete policy | DELETE _plugins/_sm/policies/`policy_name` | cluster:admin/opensearch/snapshot_management/policy/delete +Explain | GET _plugins/_sm/policies/`policy_names`/_explain | cluster:admin/opensearch/snapshot_management/policy/explain +Start | POST _plugins/_sm/policies/`policy_name`/_start | cluster:admin/opensearch/snapshot_management/policy/start +Stop| POST _plugins/_sm/policies/`policy_name`/_stop | cluster:admin/opensearch/snapshot_management/policy/stop + + +## API + +The following table lists all [Snapshot Management API]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api) functions. + +Function | API | Description +:--- | :--- | :--- +[Create policy]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api#create-or-update-a-policy) | POST _plugins/_sm/policies/`policy_name` | Creates an SM policy. +[Update policy]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api#create-or-update-a-policy) | PUT _plugins/_sm/policies/`policy_name`?if_seq_no=`sequence_number`&if_primary_term=`primary_term` | Modifies the `policy_name` policy. +[Get all policies]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api#get-policies) | GET _plugins/_sm/policies | Returns all SM policies. +[Get the policy `policy_name`]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api#get-policies) | GET _plugins/_sm/policies/`policy_name` | Returns the `policy_name` SM policy. +[Delete policy]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api#delete-a-policy) | DELETE _plugins/_sm/policies/`policy_name` | Deletes the `policy_name` policy. +[Explain]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api#explain) | GET _plugins/_sm/policies/`policy_names`/_explain | Provides the enabled/disabled status and the metadata for all policies specified by `policy_names`. 
+[Start policy]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api#start-a-policy) | POST _plugins/_sm/policies/`policy_name`/_start | Starts the `policy_name` policy. +[Stop policy]({{site.url}}{{site.baseurl}}/opensearch/snapshots/sm-api#stop-a-policy)| POST _plugins/_sm/policies/`policy_name`/_stop | Stops the `policy_name` policy. \ No newline at end of file diff --git a/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md b/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md index c51a581029..238cbac18a 100644 --- a/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md +++ b/_tuning-your-cluster/availability-and-recovery/snapshots/snapshot-restore.md @@ -7,6 +7,7 @@ has_children: false grand_parent: Availability and Recovery redirect_from: - /opensearch/snapshots/snapshot-restore/ + - /availability-and-recovery/snapshots/snapshot-restore/ --- # Take and restore snapshots diff --git a/_tuning-your-cluster/availability-and-recovery/stats-api.md b/_tuning-your-cluster/availability-and-recovery/stats-api.md index a0de57ca16..539703367e 100644 --- a/_tuning-your-cluster/availability-and-recovery/stats-api.md +++ b/_tuning-your-cluster/availability-and-recovery/stats-api.md @@ -47,7 +47,7 @@ If `enforced` is `true`: "roles": [ "data", "ingest", - "master", + "cluster_manager", "remote_cluster_client" ], "attributes": { @@ -154,7 +154,7 @@ If `enforced` is `false`: "roles": [ "data", "ingest", - "master", + "cluster_manager", "remote_cluster_client" ], "attributes": { @@ -267,7 +267,7 @@ GET _nodes/_local/stats/shard_indexing_pressure?include_all "roles": [ "data", "ingest", - "master", + "cluster_manager", "remote_cluster_client" ], "attributes": { @@ -382,7 +382,7 @@ If `enforced` is `true`: "roles": [ "data", "ingest", - "master", + "cluster_manager", "remote_cluster_client" ], "attributes": { @@ -425,7 +425,7 @@ If `enforced` is `false`: "roles": [ "data", "ingest", - "master", + 
"cluster_manager", "remote_cluster_client" ], "attributes": { @@ -474,7 +474,7 @@ GET _nodes/stats/shard_indexing_pressure "roles": [ "data", "ingest", - "master", + "cluster_manager", "remote_cluster_client" ], "attributes": { diff --git a/_tuning-your-cluster/cluster-manager-task-throttling.md b/_tuning-your-cluster/cluster-manager-task-throttling.md new file mode 100644 index 0000000000..ace4547d44 --- /dev/null +++ b/_tuning-your-cluster/cluster-manager-task-throttling.md @@ -0,0 +1,107 @@ +--- +layout: default +title: Cluster manager task throttling +nav_order: 10 +has_children: false +--- + +# Cluster manager task throttling + +For many cluster state updates, such as defining a mapping or creating an index, nodes submit tasks to the cluster manager. The cluster manager maintains a pending task queue for these tasks and runs them in a single-threaded environment. When nodes send tens of thousands of resource-intensive tasks, like `put-mapping` or snapshot tasks, these tasks can pile up in the queue and flood the cluster manager. This affects the cluster manager's performance and may in turn affect the availability of the whole cluster. + +The first line of defense is to implement mechanisms in the caller nodes to avoid task overload on the cluster manager. However, even with those mechanisms in place, the cluster manager needs a built-in way to protect itself: cluster manager task throttling. + +To turn on cluster manager task throttling, you need to set throttling limits. The cluster manager uses the throttling limits to determine whether to reject a task. + +The cluster manager rejects a task based on its type. For any incoming task, the cluster manager evaluates the total number of tasks of the same type in the pending task queue. If this number exceeds the threshold for this task type, the cluster manager rejects the incoming task. Rejecting a task does not affect tasks of a different type. 
For example, if the cluster manager rejects a `put-mapping` task, it can still accept a subsequent `create-index` task. + +When the cluster manager rejects a task, the node performs retries with exponential backoff to resubmit the task to the cluster manager. If retries are unsuccessful within the timeout period, OpenSearch returns a cluster timeout error. + +## Setting throttling limits + +You can set throttling limits by specifying them in the `cluster_manager.throttling.thresholds` object and updating the [OpenSearch cluster settings]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings). The setting is dynamic, so you can change the behavior of this feature without restarting your cluster. + +By default, throttling is disabled for all task types. +{: .note} + +The request has the following format: + +```json +PUT _cluster/settings +{ + "persistent": { + "cluster_manager.throttling.thresholds" : { + "" : { + "value" : + } + } + } +} +``` + +The following table describes the `cluster_manager.throttling.thresholds` object. + +Field Name | Description +:--- | :--- +task-type | The task type. See [supported task types](#supported-task-types) for a list of valid values. +value | The maximum number of tasks of the `task-type` type in the cluster manager's pending task queue. Default is `-1` (no task throttling). 
+ +## Supported task types + +The following task types are supported: + +- `create-index` +- `update-settings` +- `cluster-update-settings` +- `auto-create` +- `delete-index` +- `delete-dangling-index` +- `create-data-stream` +- `remove-data-stream` +- `rollover-index` +- `index-aliases` +- `put-mapping` +- `create-index-template` +- `remove-index-template` +- `create-component-template` +- `remove-component-template` +- `create-index-template-v2` +- `remove-index-template-v2` +- `put-pipeline` +- `delete-pipeline` +- `create-persistent-task` +- `finish-persistent-task` +- `remove-persistent-task` +- `update-task-state` +- `put-script` +- `delete-script` +- `put-repository` +- `delete-repository` +- `create-snapshot` +- `delete-snapshot` +- `update-snapshot-state` +- `restore-snapshot` +- `cluster-reroute-api` + +#### Example request + +The following request sets the throttling threshold for the `put-mapping` task type to 100: + +```json +PUT _cluster/settings +{ + "persistent": { + "cluster_manager.throttling.thresholds": { + "put-mapping": { + "value": 100 + } + } + } +} +``` + +Set the threshold to `-1` to disable throttling for a task type. +{: .note} + + + diff --git a/_tuning-your-cluster/cluster.md b/_tuning-your-cluster/cluster.md index e4ef23d7fa..99d489a3d3 100644 --- a/_tuning-your-cluster/cluster.md +++ b/_tuning-your-cluster/cluster.md @@ -32,6 +32,7 @@ Cluster manager eligible | Elects one node among them as the cluster manager nod Data | Stores and searches data. Performs all data-related operations (indexing, searching, aggregating) on local shards. These are the worker nodes of your cluster and need more disk space than any other node type. | As you add data nodes, keep them balanced between zones. For example, if you have three zones, add data nodes in multiples of three, one for each zone. We recommend using storage and RAM-heavy nodes. Ingest | Pre-processes data before storing it in the cluster. 
Runs an ingest pipeline that transforms your data before adding it to an index. | If you plan to ingest a lot of data and run complex ingest pipelines, we recommend you use dedicated ingest nodes. You can also optionally offload your indexing from the data nodes so that your data nodes are used exclusively for searching and aggregating. Coordinating | Delegates client requests to the shards on the data nodes, collects and aggregates the results into one final result, and sends this result back to the client. | A couple of dedicated coordinating-only nodes is appropriate to prevent bottlenecks for search-heavy workloads. We recommend using CPUs with as many cores as you can. +Dynamic | Delegates a specific node for custom work, such as machine learning (ML) tasks, preventing the consumption of resources from data nodes and therefore not affecting any OpenSearch functionality. By default, each node is a cluster-manager-eligible, data, ingest, and coordinating node. Deciding on the number of nodes, assigning node types, and choosing the hardware for each node type depends on your use case. You must take into account factors like the amount of time you want to hold on to your data, the average size of your documents, your typical workload (indexing, searches, aggregations), your expected price-performance ratio, your risk tolerance, and so on. @@ -72,13 +73,13 @@ After you name the cluster, set node attributes for each node in your cluster. Give your cluster manager node a name. If you don't specify a name, OpenSearch assigns a machine-generated name that makes the node difficult to monitor and troubleshoot. ```yml -node.name: opensearch-master +node.name: opensearch-cluster_manager ``` -You can also explicitly specify that this node is a cluster manager node, even though it is already set to true by default. Set the node role to `master` to make it easier to identify the cluster manager node. 
+You can also explicitly specify that this node is a cluster manager node, even though it is already set to true by default. Set the node role to `cluster_manager` to make it easier to identify the cluster manager node. ```yml -node.roles: [ master ] +node.roles: [ cluster_manager ] ``` #### Data nodes @@ -139,7 +140,7 @@ Zen Discovery is the built-in, default mechanism that uses [unicast](https://en. You can generally just add all of your cluster-manager-eligible nodes to the `discovery.seed_hosts` array. When a node starts up, it finds the other cluster-manager-eligible nodes, determines which one is the cluster manager, and asks to join the cluster. -For example, for `opensearch-master` the line looks something like this: +For example, for `opensearch-cluster_manager` the line looks something like this: ```yml discovery.seed_hosts: ["", "", ""] @@ -169,8 +170,8 @@ curl -XGET https://:9200/_cat/nodes?v -u 'admin:admin' --insecure ``` ``` -ip heap.percent ram.percent cpu load_1m load_5m load_15m node.role master name -x.x.x.x 13 61 0 0.02 0.04 0.05 mi * opensearch-master +ip heap.percent ram.percent cpu load_1m load_5m load_15m node.role cluster_manager name +x.x.x.x 13 61 0 0.02 0.04 0.05 mi * opensearch-cluster_manager x.x.x.x 16 60 0 0.06 0.05 0.05 md - opensearch-d1 x.x.x.x 34 38 0 0.12 0.07 0.06 md - opensearch-d2 x.x.x.x 23 38 0 0.12 0.07 0.06 md - opensearch-c1 @@ -180,6 +181,8 @@ To better understand and monitor your cluster, use the [CAT API]({{site.url}}{{s ## (Advanced) Step 6: Configure shard allocation awareness or forced awareness +### Shard allocation awareness + If your nodes are spread across several geographical zones, you can configure shard allocation awareness to allocate all replica shards to a zone that’s different from their primary shard. With shard allocation awareness, if the nodes in one of your zones fail, you can be assured that your replica shards are spread across your other zones. 
It adds a layer of fault tolerance to ensure your data survives a zone failure beyond just individual node failures. @@ -209,6 +212,8 @@ You can either use `persistent` or `transient` settings. We recommend the `persi Shard allocation awareness attempts to separate primary and replica shards across multiple zones. However, if only one zone is available (such as after a zone failure), OpenSearch allocates replica shards to the only remaining zone. +### Forced awareness + Another option is to require that primary and replica shards are never allocated to the same zone. This is called forced awareness. To configure forced awareness, specify all the possible values for your zone attributes: @@ -230,6 +235,28 @@ If that is not the case, and `opensearch-d1` and `opensearch-d2` do not have the Choosing allocation awareness or forced awareness depends on how much space you might need in each zone to balance your primary and replica shards. +### Replica count enforcement + +To enforce an even distribution of shards across all zones and avoid hotspots, you can set the `routing.allocation.awareness.balance` attribute to `true`. This setting can be configured in the opensearch.yml file and dynamically updated using the cluster update settings API: + +```json +PUT _cluster/settings +{ + "persistent": { + "cluster": { + "routing.allocation.awareness.balance": "true" + } + } +} +``` + +The `routing.allocation.awareness.balance` setting is false by default. When it is set to `true`, the total number of shards for the index must be a multiple of the highest count for any awareness attribute. For example, consider a configuration with two awareness attributes—zones and rack IDs. Let's say there are two zones and three rack IDs. The highest count of either the number of zones or the number of rack IDs is three. Therefore, the number of shards must be a multiple of three. If it is not, OpenSearch throws a validation exception. 
+ +`routing.allocation.awareness.balance` takes effect only if `cluster.routing.allocation.awareness.attributes` and `cluster.routing.allocation.awareness.force.zone.values` are set. +{: .note} + +`routing.allocation.awareness.balance` applies to all operations that create or update indices. For example, let's say you're running a cluster with three nodes and three zones in a zone-aware setting. If you try to create an index with one replica or update an index's settings to one replica, the attempt will fail with a validation exception because the number of shards must be a multiple of three. Similarly, if you try to create an index template with one shard and no replicas, the attempt will fail for the same reason. However, in all of those operations, if you set the number of shards to one and the number of replicas to two, the total number of shards is three and the attempt will succeed. + ## (Advanced) Step 7: Set up a hot-warm architecture You can design a hot-warm architecture where you first index your data to hot nodes---fast and expensive---and after a certain period of time move them to warm nodes---slow and cheap. diff --git a/_tuning-your-cluster/replication-plugin/api.md b/_tuning-your-cluster/replication-plugin/api.md new file mode 100644 index 0000000000..bec1721d16 --- /dev/null +++ b/_tuning-your-cluster/replication-plugin/api.md @@ -0,0 +1,394 @@ +--- +layout: default +title: API +nav_order: 50 +parent: Cross-cluster replication +redirect_from: + - /replication-plugin/api/ +--- + +# Cross-cluster replication API + +Use these replication operations to programmatically manage cross-cluster replication. + +#### Table of contents +- TOC +{:toc} + +## Start replication +Introduced 1.1 +{: .label .label-purple } + +Initiate replication of an index from the leader cluster to the follower cluster. Send this request to the follower cluster. 
+ + +#### Request + +```json +PUT /_plugins/_replication/<follower-index>/_start +{ + "leader_alias":"", + "leader_index":"", + "use_roles":{ + "leader_cluster_role":"", + "follower_cluster_role":"" + } +} +``` + +Specify the following options: + +Options | Description | Type | Required +:--- | :--- |:--- |:--- | +`leader_alias` | The name of the cross-cluster connection. You define this alias when you [set up a cross-cluster connection]({{site.url}}{{site.baseurl}}/replication-plugin/get-started/#set-up-a-cross-cluster-connection). | `string` | Yes +`leader_index` | The index on the leader cluster that you want to replicate. | `string` | Yes +`use_roles` | The roles to use for all subsequent backend replication tasks between the indexes. Specify a `leader_cluster_role` and `follower_cluster_role`. See [Map the leader and follower cluster roles]({{site.url}}{{site.baseurl}}/replication-plugin/permissions/#map-the-leader-and-follower-cluster-roles). | `string` | If security plugin is enabled + +#### Example response + +```json +{ + "acknowledged": true +} +``` + +## Stop replication +Introduced 1.1 +{: .label .label-purple } + +Terminates replication and converts the follower index to a standard index. Send this request to the follower cluster. + +#### Request + +```json +POST /_plugins/_replication/<follower-index>/_stop +{} +``` + +#### Example response + +```json +{ + "acknowledged": true +} +``` + +## Pause replication +Introduced 1.1 +{: .label .label-purple } + +Pauses replication of the leader index. Send this request to the follower cluster. + +#### Request + +```json +POST /_plugins/_replication/<follower-index>/_pause +{} +``` + +You can't resume replication after it's been paused for more than 12 hours. You must [stop replication]({{site.url}}{{site.baseurl}}/replication-plugin/api/#stop-replication), delete the follower index, and restart replication of the leader.
+ +#### Example response + +```json +{ + "acknowledged": true +} +``` + +## Resume replication +Introduced 1.1 +{: .label .label-purple } + +Resumes replication of the leader index. Send this request to the follower cluster. + +#### Request + +```json +POST /_plugins/_replication/<follower-index>/_resume +{} +``` + +#### Example response + +```json +{ + "acknowledged": true +} +``` + +## Get replication status +Introduced 1.1 +{: .label .label-purple } + +Gets the status of index replication. Possible statuses are `SYNCING`, `BOOTSTRAPPING`, `PAUSED`, and `REPLICATION NOT IN PROGRESS`. Use the syncing details to measure replication lag. Send this request to the follower cluster. + +#### Request + +```json +GET /_plugins/_replication/<follower-index>/_status +``` + +#### Example response + +```json +{ + "status" : "SYNCING", + "reason" : "User initiated", + "leader_alias" : "my-connection-name", + "leader_index" : "leader-01", + "follower_index" : "follower-01", + "syncing_details" : { + "leader_checkpoint" : 19, + "follower_checkpoint" : 19, + "seq_no" : 0 + } +} +``` +To include shard replication details in the response, add the `?verbose=true` query parameter to the request. + +The leader and follower checkpoint values begin as negative integers and reflect the shard count (-1 for one shard, -5 for five shards, and so on). The values increment toward positive integers with each change that you make. For example, when you make a change on the leader index, the `leader_checkpoint` becomes `0`. The `follower_checkpoint` is initially still `-1` until the follower index pulls the change from the leader, at which point it increments to `0`. If the values are the same, it means the indexes are fully synced. + +## Get leader cluster stats +Introduced 1.1 +{: .label .label-purple } + +Gets information about replicated leader indexes on a specified cluster.
+ +#### Request + +```json +GET /_plugins/_replication/leader_stats +``` + +#### Example response + +```json +{ + "num_replicated_indices": 2, + "operations_read": 15, + "translog_size_bytes": 1355, + "operations_read_lucene": 0, + "operations_read_translog": 15, + "total_read_time_lucene_millis": 0, + "total_read_time_translog_millis": 659, + "bytes_read": 1000, + "index_stats":{ + "leader-index-1":{ + "operations_read": 7, + "translog_size_bytes": 639, + "operations_read_lucene": 0, + "operations_read_translog": 7, + "total_read_time_lucene_millis": 0, + "total_read_time_translog_millis": 353, + "bytes_read":466 + }, + "leader-index-2":{ + "operations_read": 8, + "translog_size_bytes": 716, + "operations_read_lucene": 0, + "operations_read_translog": 8, + "total_read_time_lucene_millis": 0, + "total_read_time_translog_millis": 306, + "bytes_read": 534 + } + } +} +``` + +## Get follower cluster stats +Introduced 1.1 +{: .label .label-purple } + +Gets information about follower (syncing) indexes on a specified cluster. 
+ +#### Request + +```json +GET /_plugins/_replication/follower_stats +``` + +#### Example response + +```json +{ + "num_syncing_indices": 2, + "num_bootstrapping_indices": 0, + "num_paused_indices": 0, + "num_failed_indices": 0, + "num_shard_tasks": 2, + "num_index_tasks": 2, + "operations_written": 3, + "operations_read": 3, + "failed_read_requests": 0, + "throttled_read_requests": 0, + "failed_write_requests": 0, + "throttled_write_requests": 0, + "follower_checkpoint": 1, + "leader_checkpoint": 1, + "total_write_time_millis": 2290, + "index_stats":{ + "follower-index-1":{ + "operations_written": 2, + "operations_read": 2, + "failed_read_requests": 0, + "throttled_read_requests": 0, + "failed_write_requests": 0, + "throttled_write_requests": 0, + "follower_checkpoint": 1, + "leader_checkpoint": 1, + "total_write_time_millis": 1355 + }, + "follower-index-2":{ + "operations_written": 1, + "operations_read": 1, + "failed_read_requests": 0, + "throttled_read_requests": 0, + "failed_write_requests": 0, + "throttled_write_requests": 0, + "follower_checkpoint": 0, + "leader_checkpoint": 0, + "total_write_time_millis": 935 + } + } +} +``` + +## Get auto-follow stats +Introduced 1.1 +{: .label .label-purple } + +Gets information about auto-follow activity and any replication rules configured on the specified cluster. + +#### Request + +```json +GET /_plugins/_replication/autofollow_stats +``` + +#### Example response + +```json +{ + "num_success_start_replication": 2, + "num_failed_start_replication": 0, + "num_failed_leader_calls": 0, + "failed_indices":[ + + ], + "autofollow_stats":[ + { + "name":"my-replication-rule", + "pattern":"movies*", + "num_success_start_replication": 2, + "num_failed_start_replication": 0, + "num_failed_leader_calls": 0, + "failed_indices":[ + + ] + } + ] +} +``` + +## Update settings +Introduced 1.1 +{: .label .label-purple } + +Updates settings on the follower index. 
+ +#### Request + +```json +PUT /_plugins/_replication/<follower-index>/_update +{ + "settings":{ + "index.number_of_shards": 4, + "index.number_of_replicas": 2 + } +} +``` + +#### Example response + +```json +{ + "acknowledged": true +} +``` + +## Create replication rule +Introduced 1.1 +{: .label .label-purple } + +Automatically starts replication on indexes matching a specified pattern. If a new index on the leader cluster matches the pattern, OpenSearch automatically creates a follower index and begins replication. You can also use this API to update existing replication rules. + +Send this request to the follower cluster. + +Make sure to note the names of all auto-follow patterns after you create them. The replication plugin currently does not include an API operation to retrieve a list of existing patterns. +{: .tip } + +#### Request + +```json +POST /_plugins/_replication/_autofollow +{ + "leader_alias" : "", + "name": "", + "pattern": "", + "use_roles":{ + "leader_cluster_role": "", + "follower_cluster_role": "" + } +} +``` + +Specify the following options: + +Options | Description | Type | Required +:--- | :--- |:--- |:--- | +`leader_alias` | The name of the cross-cluster connection. You define this alias when you [set up a cross-cluster connection]({{site.url}}{{site.baseurl}}/replication-plugin/get-started/#set-up-a-cross-cluster-connection). | `string` | Yes +`name` | A name for the auto-follow pattern. | `string` | Yes +`pattern` | An array of index patterns to match against indexes in the specified leader cluster. Supports wildcard characters. For example, `leader-*`. | `string` | Yes +`use_roles` | The roles to use for all subsequent backend replication tasks between the indexes. Specify a `leader_cluster_role` and `follower_cluster_role`. See [Map the leader and follower cluster roles]({{site.url}}{{site.baseurl}}/replication-plugin/permissions/#map-the-leader-and-follower-cluster-roles).
| `string` | If security plugin is enabled + +#### Example response + +```json +{ + "acknowledged": true +} +``` + +## Delete replication rule +Introduced 1.1 +{: .label .label-purple } + +Deletes the specified replication rule. This operation prevents any new indexes from being replicated but does not stop existing replication that the rule has already initiated. Replicated indexes remain read-only until you stop replication. + +Send this request to the follower cluster. + +#### Request + +```json +DELETE /_plugins/_replication/_autofollow +{ + "leader_alias" : "", + "name": "" +} +``` + +Specify the following options: + +Options | Description | Type | Required +:--- | :--- |:--- |:--- | +`leader_alias` | The name of the cross-cluster connection. You define this alias when you [set up a cross-cluster connection]({{site.url}}{{site.baseurl}}/replication-plugin/get-started/#set-up-a-cross-cluster-connection). | `string` | Yes +`name` | The name of the pattern. | `string` | Yes + +#### Example response + +```json +{ + "acknowledged": true +} +``` From 87700d2fcc6b9ff45e904799d4e7b501b054191a Mon Sep 17 00:00:00 2001 From: Naarcha-AWS Date: Wed, 8 Mar 2023 15:08:40 -0600 Subject: [PATCH 3/3] Remove non 1.3 features Signed-off-by: Naarcha-AWS --- .../cluster-api/cluster-allocation.md | 147 ----------- .../cluster-api/cluster-awareness.md | 130 ---------- .../cluster-api/cluster-decommission.md | 86 ------- _search-plugins/neural-search.md | 204 --------------- _search-plugins/search-relevance/index.md | 157 ------------ .../availability-and-recovery/remote.md | 234 ------------------ .../cluster-manager-task-throttling.md | 107 -------- 7 files changed, 1065 deletions(-) delete mode 100644 _api-reference/cluster-api/cluster-allocation.md delete mode 100644 _api-reference/cluster-api/cluster-awareness.md delete mode 100644 _api-reference/cluster-api/cluster-decommission.md delete mode 100644 _search-plugins/neural-search.md delete mode 100644
_search-plugins/search-relevance/index.md delete mode 100644 _tuning-your-cluster/availability-and-recovery/remote.md delete mode 100644 _tuning-your-cluster/cluster-manager-task-throttling.md diff --git a/_api-reference/cluster-api/cluster-allocation.md b/_api-reference/cluster-api/cluster-allocation.md deleted file mode 100644 index 31b5553756..0000000000 --- a/_api-reference/cluster-api/cluster-allocation.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -layout: default -title: Cluster allocation explain -nav_order: 10 -parent: Cluster APIs -has_children: false ---- - -# Cluster allocation explain -Introduced 1.0 -{: .label .label-purple } - -The most basic cluster allocation explain request finds an unassigned shard and explains why it can't be allocated to a node. - -If you add some options, you can instead get information on a specific shard, including why OpenSearch assigned it to its current node. - - -## Example - -```json -GET _cluster/allocation/explain?include_yes_decisions=true -{ - "index": "movies", - "shard": 0, - "primary": true -} -``` -{% include copy-curl.html %} - -## Path and HTTP methods - -``` -GET _cluster/allocation/explain -POST _cluster/allocation/explain -``` - - -## URL parameters - -All cluster allocation explain parameters are optional. - -Parameter | Type | Description -:--- | :--- | :--- -include_yes_decisions | Boolean | OpenSearch makes a series of yes or no decisions when trying to allocate a shard to a node. If this parameter is true, OpenSearch includes the (generally more numerous) "yes" decisions in its response. Default is false. -include_disk_info | Boolean | Whether to include information about disk usage in the response. Default is false. - - -## Request body - -All cluster allocation explain fields are optional. - -Field | Type | Description -:--- | :--- | :--- -current_node | String | If you only want an explanation if the shard happens to be on a particular node, specify that node name here. 
-index | String | The name of the shard's index. -primary | Boolean | Whether to provide an explanation for the primary shard (true) or its first replica (false), which share the same shard ID. -shard | Integer | The shard ID that you want an explanation for. - - -## Response - -```json -{ - "index": "movies", - "shard": 0, - "primary": true, - "current_state": "started", - "current_node": { - "id": "d8jRZcW1QmCBeVFlgOJx5A", - "name": "opensearch-node1", - "transport_address": "172.24.0.4:9300", - "weight_ranking": 1 - }, - "can_remain_on_current_node": "yes", - "can_rebalance_cluster": "yes", - "can_rebalance_to_other_node": "no", - "rebalance_explanation": "cannot rebalance as no target node exists that can both allocate this shard and improve the cluster balance", - "node_allocation_decisions": [{ - "node_id": "vRxi4uPcRt2BtHlFoyCyTQ", - "node_name": "opensearch-node2", - "transport_address": "172.24.0.3:9300", - "node_decision": "no", - "weight_ranking": 1, - "deciders": [{ - "decider": "max_retry", - "decision": "YES", - "explanation": "shard has no previous failures" - }, - { - "decider": "replica_after_primary_active", - "decision": "YES", - "explanation": "shard is primary and can be allocated" - }, - { - "decider": "enable", - "decision": "YES", - "explanation": "all allocations are allowed" - }, - { - "decider": "node_version", - "decision": "YES", - "explanation": "can relocate primary shard from a node with version [1.0.0] to a node with equal-or-newer version [1.0.0]" - }, - { - "decider": "snapshot_in_progress", - "decision": "YES", - "explanation": "no snapshots are currently running" - }, - { - "decider": "restore_in_progress", - "decision": "YES", - "explanation": "ignored as shard is not being recovered from a snapshot" - }, - { - "decider": "filter", - "decision": "YES", - "explanation": "node passes include/exclude/require filters" - }, - { - "decider": "same_shard", - "decision": "NO", - "explanation": "a copy of this shard is already allocated 
to this node [[movies][0], node[vRxi4uPcRt2BtHlFoyCyTQ], [R], s[STARTED], a[id=x8w7QxWdQQa188HKGn0iMQ]]" - }, - { - "decider": "disk_threshold", - "decision": "YES", - "explanation": "enough disk for shard on node, free: [35.9gb], shard size: [15.1kb], free after allocating shard: [35.9gb]" - }, - { - "decider": "throttling", - "decision": "YES", - "explanation": "below shard recovery limit of outgoing: [0 < 2] incoming: [0 < 2]" - }, - { - "decider": "shards_limit", - "decision": "YES", - "explanation": "total shard limits are disabled: [index: -1, cluster: -1] <= 0" - }, - { - "decider": "awareness", - "decision": "YES", - "explanation": "allocation awareness is not enabled, set cluster setting [cluster.routing.allocation.awareness.attributes] to enable it" - } - ] - }] -} -``` diff --git a/_api-reference/cluster-api/cluster-awareness.md b/_api-reference/cluster-api/cluster-awareness.md deleted file mode 100644 index 0c0fd49dcd..0000000000 --- a/_api-reference/cluster-api/cluster-awareness.md +++ /dev/null @@ -1,130 +0,0 @@ ---- -layout: default -title: Cluster routing and awareness -nav_order: 20 -parent: Cluster APIs -has_children: false -redirect_from: - - /api-reference/cluster-awareness/ ---- - -# Cluster routing and awareness - -To control the distribution of search or HTTP traffic, you can use the weights per awareness attribute to control the distribution of search or HTTP traffic across zones. This is commonly used for zonal deployments, heterogeneous instances, and routing traffic away from zones during zonal failure. - -## Path and HTTP methods - -``` -PUT /_cluster/routing/awareness//weights -GET /_cluster/routing/awareness//weights?local -GET /_cluster/routing/awareness//weights -``` - -## Path parameters - -Parameter | Type | Description -:--- | :--- | :--- -attribute | String | The name of the awareness attribute, usually `zone`. The attribute name must match the values listed in the request body when assigning weights to zones. 
- -## Request body parameters - -Parameter | Type | Description -:--- | :--- | :--- -weights | JSON object | Assigns weights to attributes within the request body of the PUT request. Weights can be set in any ratio, for example, 2:3:5. In a 2:3:5 ratio with 3 zones, for every 100 requests sent to the cluster, each zone would receive either 20, 30, or 50 search requests in a random order. When assigned a weight of `0`, the zone does not receive any search traffic. -_version | String | Implements optimistic concurrency control (OCC) through versioning. The parameter uses simple versioning, such as `1`, and increments upward based on each subsequent modification. This allows any servers from which a request originates to validate whether or not a zone has been modified. - - -In the following example request body, `zone_1` and `zone_2` receive 50 requests each, whereas `zone_3` is prevented from receiving requests: - -``` -{ - "weights": - { - "zone_1": "5", - "zone_2": "5", - "zone_3": "0" - } - "_version" : 1 -} -``` - -## Example: Weighted round robin search - -The following example request creates a round robin shard allocation for search traffic by using an undefined ratio: - -### Request - -```json -PUT /_cluster/routing/awareness/zone/weights -{ - "weights": - { - "zone_1": "1", - "zone_2": "1", - "zone_3": "0" - } - "_version" : 1 -} -``` -{% include copy-curl.html %} - -### Response - -``` -{ - "acknowledged": true -} -``` - - -## Example: Getting weights for all zones - -The following example request gets weights for all zones. - -### Request - -```json -GET /_cluster/routing/awareness/zone/weights -``` -{% include copy-curl.html %} - -### Response - -OpenSearch responds with the weight of each zone: - -```json -{ - "weights": - { - - "zone_1": "1.0", - "zone_2": "1.0", - "zone_3": "0.0" - }, - "_version":1 -} -``` - -## Example: Deleting weights - -You can remove your weight ratio for each zone using the `DELETE` method. 
- -### Request - -```json -DELETE /_cluster/routing/awareness/zone/weights -``` -{% include copy-curl.html %} - -### Response - -```json -{ - "_version":1 -} -``` - -## Next steps - -- For more information about zone commissioning, see [Cluster decommission]({{site.url}}{{site.baseurl}}/api-reference/cluster-decommission/). -- For more information about allocation awareness, see [Cluster formation]({{site.url}}{{site.baseurl}}/opensearch/cluster/#advanced-step-6-configure-shard-allocation-awareness-or-forced-awareness). diff --git a/_api-reference/cluster-api/cluster-decommission.md b/_api-reference/cluster-api/cluster-decommission.md deleted file mode 100644 index e64e2675fe..0000000000 --- a/_api-reference/cluster-api/cluster-decommission.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -layout: default -title: Cluster decommission -nav_order: 30 -parent: Cluster APIs -has_children: false -redirect_from: - - /api-reference/cluster-decommission/ ---- - -# Cluster decommission - -The cluster decommission operation adds support decommissioning based on awareness. It greatly benefits multi-zone deployments, where awareness attributes, such as `zones`, can aid in applying new upgrades to a cluster in a controlled fashion. This is especially useful during outages, in which case, you can decommission the unhealthy zone to prevent replication requests from stalling and prevent your request backlog from becoming too large. - -For more information about allocation awareness, see [Shard allocation awareness]({{site.url}}{{site.baseurl}}//opensearch/cluster/#shard-allocation-awareness). 
- - -## HTTP and Path methods - -``` -PUT /_cluster/decommission/awareness/{awareness_attribute_name}/{awareness_attribute_value} -GET /_cluster/decommission/awareness/{awareness_attribute_name}/_status -DELETE /_cluster/decommission/awareness -``` - -## URL parameters - -Parameter | Type | Description -:--- | :--- | :--- -awareness_attribute_name | String | The name of the awareness attribute, usually `zone`. -awareness_attribute_value | String | The value of the awareness attribute. For example, if you have shards allocated in two different zones, you can give each zone a value of `zone-a` or `zone-b`. The cluster decommission operation decommissions the zone listed in the method. - - -## Example: Decommissioning and recommissioning a zone - -You can use the following example requests to decommission and recommission a zone: - -### Request - -The following example request decommissions `zone-a`: - -```json -PUT /_cluster/decommission/awareness/zone/zone-a -``` -{% include copy-curl.html %} - -If you want to recommission a decommissioned zone, you can use the `DELETE` method: - -```json -DELETE /_cluster/decommission/awareness -``` -{% include copy-curl.html %} - -### Response - - -```json -{ - "acknowledged": true -} -``` - -## Example: Getting zone decommission status - -The following example request returns the decommission status of all zones. - -### Request - -```json -GET /_cluster/decommission/awareness/zone/_status -``` -{% include copy-curl.html %} - -### Response - -```json -{ - "zone-1": "INIT | DRAINING | IN_PROGRESS | SUCCESSFUL | FAILED" -} -``` - - -## Next steps - -- For more information about zone awareness and weight, see [Cluster awareness]({{site.url}}{{site.baseurl}}/api-reference/cluster-awareness/). -- For more information about allocation awareness, see [Cluster formation]({{site.url}}{{site.baseurl}}/opensearch/cluster/#advanced-step-6-configure-shard-allocation-awareness-or-forced-awareness).
diff --git a/_search-plugins/neural-search.md b/_search-plugins/neural-search.md deleted file mode 100644 index 6cf199c52a..0000000000 --- a/_search-plugins/neural-search.md +++ /dev/null @@ -1,204 +0,0 @@ ---- -layout: default -title: Neural Search plugin -nav_order: 200 -has_children: false -has_toc: false -redirect_from: - - /neural-search-plugin/index/ ---- - -# Neural Search plugin - -The Neural Search plugin is an experimental feature. For updates on the progress of the Neural Search plugin, or if you want to leave feedback that could help improve the feature, join the discussion in the [Neural Search forum](https://forum.opensearch.org/t/feedback-neural-search-plugin-experimental-release/11501). -{: .warning} - -The OpenSearch Neural Search plugin enables the integration of machine learning (ML) language models into your search workloads. During ingestion and search, the Neural Search plugin transforms text into vectors. Then, Neural Search uses the transformed vectors in vector-based search. - -The Neural Search plugin comes bundled with OpenSearch. For more information, see [Managing plugins]({{site.url}}{{site.baseurl}}/opensearch/install/plugins#managing-plugins). - -## Ingest data with Neural Search - -In order to ingest vectorized documents, you need to create a Neural Search _pipeline_. A pipeline consists of a series of processors that manipulate documents during ingestion, allowing the documents to be vectorized. The following API operation creates a Neural Search pipeline: - -``` -PUT _ingest/pipeline/{pipeline_name} -``` - -In the pipeline request body, the `text_embedding` processor, the only processor supported by Neural Search, converts a document's text to vector embeddings. `text_embedding` uses `field_map`s to determine which fields to generate vector embeddings from and which field to store each embedding in. - -### Path parameter - -Use `pipeline_name` to create a name for your Neural Search pipeline.
- -### Request fields - -Field | Data type | Description -:--- | :--- | :--- -description | string | A description of the processor. -model_id | string | The ID of the model that will be used in the embedding interface. The model must be indexed in OpenSearch before it can be used in Neural Search. For more information, see [Model Serving Framework]({{site.url}}{{site.baseurl}}/ml-commons-plugin/model-serving-framework/). -input_field_name | string | The field name used to cache text for text embeddings. -output_field_name | string | The name of the field in which output text is stored. - -### Example request - -Use the following example request to create a pipeline: - -``` -PUT _ingest/pipeline/nlp-pipeline -{ - "description": "An example neural search pipeline", - "processors" : [ - { - "text_embedding": { - "model_id": "bxoDJ7IHGM14UqatWc_2j", - "field_map": { - "passage_text": "passage_embedding" - } - } - } - ] -} -``` - -### Example response - -OpenSearch responds with an acknowledgment of the pipeline's creation. - -```json -{ - "acknowledged" : true -} -``` - -## Create an index for ingestion - -In order to use the text embedding processor defined in your pipelines, create an index with mapping data that aligns with the maps specified in your pipeline. For example, the `output_fields` defined in the `field_map` field of your processor request must map to the k-NN vector fields with a dimension that matches the model. Similarly, the `text_fields` defined in your processor should map to the `text_fields` in your index. - -### Example request - -The following example request creates an index that attaches to a Neural Search pipeline. Because the index maps to k-NN vector fields, the index setting field `index.knn` is set to `true`. Furthermore, `mapping` settings use [k-NN method definitions]({{site.url}}{{site.baseurl}}/search-plugins/knn/knn-index/#method-definitions) to match the maps defined in the Neural Search pipeline.
- -```json -PUT /my-nlp-index-1 -{ - "settings": { - "index.knn": true, - "default_pipeline": "" - }, - "mappings": { - "properties": { - "passage_embedding": { - "type": "knn_vector", - "dimension": int, - "method": { - "name": "string", - "space_type": "string", - "engine": "string", - "parameters": json_object - } - }, - "passage_text": { - "type": "text" - }, - } - } -} -``` - -### Example response - -OpenSearch responds with information about your new index: - -```json -{ - "acknowledged" : true, - "shards_acknowledged" : true, - "index" : "my-nlp-index-1" -} -``` - -## Ingest documents into Neural Search - -Document ingestion is managed by OpenSearch's [Ingest API]({{site.url}}{{site.baseurl}}/api-reference/ingest-apis/index/), similarly to other OpenSearch indexes. For example, you can ingest a document that contains the `passage_text: "Hello world"` with a simple POST method: - -```json -POST /my-nlp-index-1/_doc -{ - "passage_text": "Hello world" -} -``` - -With the text_embedding processor in place through a Neural Search pipeline, the example indexes "Hello world" as a `text_field` and converts "Hello world" into an associated k-NN vector field. - -## Search a neural index - -If you want to use a language model to convert a text query into a k-NN vector query, use the `neural` query fields in your query. The neural query request fields can be used in both the [k-NN plugin API]({{site.url}}{{site.baseurl}}/search-plugins/knn/api/#search-model) and [Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl/index/). Furthermore, you can use a [k-NN search filter]({{site.url}}{{site.baseurl}}/search-plugins/knn/filter-search-knn/) to refine your neural search query. - - - -### Neural request fields - -Include the following request fields under the `neural` field in your query: - -Field | Data type | Description -:--- | :--- | :--- -vector_field | string | The vector field against which to run a search query. 
-query_text | string | The query text from which to produce queries. -model_id | string | The ID of the model that will be used in the embedding interface. The model must be indexed in OpenSearch before it can be used in Neural Search. -k | integer | The number of results the k-NN search returns. - - -### Example request - -The following example request uses a search query that returns vectors for the "Hello World" query text: - - -```json -GET my_index/_search -{ - "query": { - "bool" : { - "filter": { - "range": { - "distance": { "lte" : 20 } - } - }, - "should" : [ - { - "script_score": { - "query": { - "neural": { - "passage_vector": { - "query_text": "Hello world", - "model_id": "xzy76xswsd", - "k": 100 - } - } - }, - "script": { - "source": "_score * 1.5" - } - } - } - , - { - "script_score": { - "query": { - "match": { "passage_text": "Hello world" } - }, - "script": { - "source": "_score * 1.7" - } - } - } - ] - } - } -} -``` - - - - diff --git a/_search-plugins/search-relevance/index.md b/_search-plugins/search-relevance/index.md deleted file mode 100644 index 7c020952ea..0000000000 --- a/_search-plugins/search-relevance/index.md +++ /dev/null @@ -1,157 +0,0 @@ ---- -layout: default -title: Search relevance -nav_order: 55 -has_children: false -has_toc: false -redirect_from: - - /search-plugins/search-relevance/ ---- - -# Compare search results - -Compare Search Results is an experimental feature. For updates on the progress of Compare Search Results and other search relevance features, or if you want to leave feedback that could help improve the feature, join the [discussion on the OpenSearch forum](https://forum.opensearch.org/t/feedback-experimental-feature-compare-search-results/11331). -{: .warning} - -Compare Search Results is the first search relevance feature in OpenSearch. It lets you compare search results from two queries side by side to determine whether one query produces better results than the other. 
Using this tool, you can evaluate search quality by experimenting with queries. - -For example, you can see how results change when you apply one of the following query changes: - -- Weighting different fields differently -- Different stemming or lemmatization strategies -- Shingling - -## Prerequisites - -Before you get started, you must index data in OpenSearch. To learn how to create a new index, see [Index data]({{site.url}}{{site.baseurl}}/opensearch/index-data). - -Alternatively, you can add sample data in OpenSearch Dashboards using the following steps: - -1. On the top menu bar, go to **OpenSearch Dashboards > Overview**. -1. Select **View app directory**. -1. Select **Add sample data**. -1. Choose one of the built-in datasets and select **Add data**. - -## Using Compare Search Results in OpenSearch Dashboards - -To compare search results in OpenSearch Dashboards, perform the following steps. - -**Step 1:** On the top menu bar, go to **OpenSearch Plugins > Search Relevance**. - -**Step 2:** Enter the search text in the search bar. - -**Step 3:** Select an index for **Query 1** and enter a query (request body only) in [OpenSearch Query DSL]({{site.url}}{{site.baseurl}}/opensearch/query-dsl). The `GET` HTTP method and the `_search` endpoint are implicit. Use the `%SearchText%` variable to refer to the text in the search bar. - -The following is an example query: - -```json -{ - "query": { - "multi_match": { - "query": "%SearchText%", - "fields": [ "description", "item_name" ] - } - } -} -``` - -**Step 4:** Select an index for **Query 2** and enter a query (request body only). - -The following example query boosts the `title` field in search results: - -```json -{ - "query": { - "multi_match": { - "query": "%SearchText%", - "fields": [ "description", "item_name^3" ] - } - } -} -``` - -**Step 5:** Select **Search** and compare the results in **Result 1** and **Result 2**. 
- -The following example screen shows a search for the word "cup" in the `description` and `item_name` fields with and without boosting the `item_name`: - -Compare search results{: .img-fluid } - -If a result in Result 1 appears in Result 2, the `Up` and `Down` indicators below the result number signify how many places the result moved up or down compared to the same result in Result 2. In this example, the document with the ID 2 is `Up 1` place in Result 2 compared to Result 1 and `Down 1` place in Result 1 compared to Result 2. - -## Changing the number of results - -By default, OpenSearch returns the top 10 results. To change the number of returned results to a different value, specify the `size` parameter in the query: - -```json -{ - "size": 15, - "query": { - "multi_match": { - "query": "%SearchText%", - "fields": [ "title^3", "text" ] - } - } -} -``` - -Setting `size` to a high value (for example, larger than 250 documents) may degrade performance. -{: .note} - -You cannot save a given comparison for future use, so Compare Search Results is not suitable for systematic testing. -{: .note} - -## Comparing OpenSearch search results with re-ranked results - -One use case for Compare Search Results is to compare raw OpenSearch results with the same results processed by a re-ranking application. An example of such a re-ranker is **Kendra Intelligent Ranking for OpenSearch**, contributed by the Amazon Kendra team. This plugin takes search results from OpenSearch and applies Amazon Kendra’s semantic relevance rankings calculated using vector embeddings and other semantic search techniques. For many applications, this provides better result rankings. - -To try Kendra Intelligent Ranking, you must first set up the Amazon Kendra service. To get started, see [Amazon Kendra](https://aws.amazon.com/kendra/). 
For detailed information, including plugin setup instructions, see [Intelligently ranking OpenSearch (self managed) results using Amazon Kendra](https://docs.aws.amazon.com/kendra/latest/dg/opensearch-rerank.html). - -Once you've set up Kendra Intelligent Ranking, enter a query in **Query 1** and enter the same query using Kendra Intelligent Ranking in **Query 2**. Then compare the search results from OpenSearch and Amazon Kendra. - -### Example - -The following example searches for the text "snacking nuts" in the `abo` index. The documents in the index contain snack descriptions in the `bullet_point` array. - -OpenSearch Intelligent Ranking query{: .img-fluid } - -1. Enter `snacking nuts` in the search bar. -1. Enter the following query, which searches the `bullet_point` field for the search text "snacking nuts", in **Query 1**: - - ```json - { - "query": { - "match": { - "bullet_point": "%SearchText%" - } - }, - "size": 25 - } - ``` -1. Enter the same query with intelligent ranking in **Query 2**: - - ```json - { - "query" : { - "match" : { - "bullet_point": "%SearchText%" - } - }, - "size": 25, - "ext": { - "search_configuration":{ - "result_transformer" : { - "kendra_intelligent_ranking": { - "order": 1, - "properties": { - "title_field": "item_name", - "body_field": "bullet_point" - } - } - } - } - } - } - ``` - - In the preceding query, `body_field` refers to the body field of the documents in the index, which Kendra Intelligent Ranking uses to rank the results. The `body_field` is required, while the `title_field` is optional. -1. Select **Search** and compare the results in **Result 1** and **Result 2**. 
\ No newline at end of file diff --git a/_tuning-your-cluster/availability-and-recovery/remote.md b/_tuning-your-cluster/availability-and-recovery/remote.md deleted file mode 100644 index 99f5e787cf..0000000000 --- a/_tuning-your-cluster/availability-and-recovery/remote.md +++ /dev/null @@ -1,234 +0,0 @@ ---- -layout: default -title: Remote-backed storage -nav_order: 40 -parent: Availability and Recovery -redirect_from: - - /opensearch/remote/ ---- - -# Remote-backed storage - -Remote-backed storage is an experimental feature. Therefore, we do not recommend the use of remote-backed storage in a production environment. For updates on the progress of remote-backed storage, or if you want leave feedback that could help improve the feature, refer to the issue on [GitHub](https://github.com/opensearch-project/OpenSearch/issues/1968). -{: .warning} - -Remote-backed storage offers OpenSearch users a new way to protect against data loss by automatically creating backups of all index transactions and sending them to remote storage. In order to expose this feature, segment replication must also be enabled. See [Segment replication]({{site.url}}{{site.baseurl}}/opensearch/segment-replication/) for additional information. - -## Translog - -Any index changes, such as indexing or deleting documents, are written to disk during a Lucene commit. However, Lucene commits are expensive operations, so they cannot be performed after every change to the index. Instead, each shard records every indexing operation in a transaction log called *translog*. When a document is indexed, it is added to the memory buffer and recorded in the translog. Frequent refresh operations write the documents in the memory buffer to a segment and then clear the memory buffer. Periodically, a flush performs a Lucene commit, which includes writing the segments to disk using fsync, purging the old translog, and starting a new translog. Thus, a translog contains all operations that have not yet been flushed. 
- -## Segment replication and remote-backed storage - -When neither segment replication nor remote-backed storage is enabled, OpenSearch uses document replication. In document replication, when a write request lands on the primary shard, the request is indexed to Lucene and stored in the translog. After this, the request is sent to the replicas, where, in turn, it is indexed to Lucene and stored in the translog for durability. - -With segment replication, segments are created on the primary shard only and then copied to all replicas. The replicas do not index requests to Lucene, but they do create and maintain a translog. - -With remote-backed storage, when a write request lands on the primary shard, the request is indexed to Lucene on the primary shard only. The corresponding translog is then uploaded to remote store. OpenSearch does not send the write request to the replicas, but rather performs a primary term validation to confirm that the request originator shard is still the primary shard. Primary term validation ensures that the acting primary shard fails if it becomes isolated and is unaware of the cluster manager electing a new primary. - -## The `index.translog.durability` translog setting - -Without remote-backed storage, indexing operations are only persisted to disk when the translog is fsynced. Therefore, any data that has not been written to disk can potentially be lost. - -The `index.translog.durability` setting controls how frequently OpenSearch fsyncs the translog to disk: - -- By default, `index.translog.durability` is set to `request`. This means that fsync happens after every request, and all acknowledged write requests persist in case of failure. - -- If you set `index.translog.durability` to `async`, fsync happens periodically at the specified `sync_interval` (5 seconds by default). The fsync operation is asynchronous, so acknowledge is sent without waiting for fsync. 
Consequently, all acknowledged writes since the last commit are lost in case of failure. - -With remote-backed storage, the translog is uploaded to a remote store for durability. - -`index.translog.durability` is a dynamic setting. To update it, use the following query: - -```json -PUT my_index/_settings -{ - "index" : { - "translog.durability" : "request" - } -} -``` - -## Refresh-level and request-level durability - -The remote store feature supports two levels of durability: - -- Refresh-level durability: Segment files are uploaded to remote store after every refresh. Set the `remote_store` flag to `true` to achieve refresh-level durability. Commit-level durability is inherent, and uploads are asynchronous. - - If you need to refresh an index manually, you can use the `_refresh` API. For example, to refresh the `my_index` index, use the following request: - - ```json - POST my_index/_refresh - ``` - -- Request-level durability: Translogs are uploaded before acknowledging the request. Set the `translog` flag to `true` to achieve request-level durability. In this scenario, we recommend to batch as many requests as possible in a bulk request. Batching requests will improve indexing throughput and latency compared to sending individual write requests. - -## Enable the feature flag - -There are several methods for enabling remote store feature, depending on the install type. You will also need to enable `remote_store` property when creating the index. - -Segment replication must also be enabled to use remote-backed storage. -{: .note} - -### Enable on a node using a tarball install - -The flag is toggled using a new jvm parameter that is set either in `OPENSEARCH_JAVA_OPTS` or in config/jvm.options. 
- -#### Option 1: Modify jvm.options - -Add the following lines to `config/jvm.options` before starting the OpenSearch process to enable the feature and its dependency: - -``` --Dopensearch.experimental.feature.replication_type.enabled=true --Dopensearch.experimental.feature.remote_store.enabled=true -``` - -Run OpenSearch: - -```bash -./bin/opensearch -``` - -#### Option 2: Enable from an environment variable - -As an alternative to directly modifying `config/jvm.options`, you can define the properties by using an environment variable. This can be done in a single command when you start OpenSearch or by defining the variable with `export`. - -To add these flags in-line when starting OpenSearch: - -```bash -OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.replication_type.enabled=true -Dopensearch.experimental.feature.remote_store.enabled=true" ./opensearch-{{site.opensearch_version}}/bin/opensearch -``` - -If you want to define the environment variable separately, prior to running OpenSearch: - -```bash -export OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.replication_type.enabled=true -Dopensearch.experimental.feature.remote_store.enabled=true" -./bin/opensearch -``` - -### Enable with Docker containers - -If you're running Docker, add the following line to docker-compose.yml underneath the `opensearch-node` and `environment` section: - -````json -OPENSEARCH_JAVA_OPTS="-Dopensearch.experimental.feature.replication_type.enabled=true -Dopensearch.experimental.feature.remote_store.enabled=true" -```` - -### Enable for OpenSearch development - -To create new indexes with remote-backed storage enabled, you must first enable these features by adding the correct properties to `run.gradle` before building OpenSearch. See the [developer guide](https://github.com/opensearch-project/OpenSearch/blob/main/DEVELOPER_GUIDE.md) for information about how to use Gradle to build OpenSearch.
- -Add the following properties to `run.gradle` to enable the feature: - -```bash -testClusters { - runTask { - testDistribution = 'archive' - if (numZones > 1) numberOfZones = numZones - if (numNodes > 1) numberOfNodes = numNodes - systemProperty 'opensearch.experimental.feature.replication_type.enabled', 'true' - systemProperty 'opensearch.experimental.feature.remote_store.enabled', 'true' - } -} -``` - -## Register a remote repository - -Now that your deployment is running with the feature flags enabled, the next step is to register a remote repository where backups will be stored. See [Register repository]({{site.url}}{{site.baseurl}}/opensearch/snapshots/snapshot-restore#register-repository) for more information. - -## Create an index - -Remote-backed storage is enabled for an index when it is created. This feature cannot be enabled for indexes that already exist. - -For refresh-level durability, include the `remote_store` property to enable the feature and specify a segment repository: - -```bash -curl -X PUT "https://localhost:9200/my-index?pretty" -ku admin:admin -H 'Content-Type: application/json' -d' -{ - "settings": { - "index": { - "number_of_shards": 1, - "number_of_replicas": 0, - "replication": { - "type": "SEGMENT" - }, - "remote_store": { - "enabled": true, - "repository": "segment-repo" - } - } - } -} -' -``` - -For request-level durability, in addition to the `remote_store` and segment repository, include the `translog` property and specify a translog repository: - -```bash -curl -X PUT "https://localhost:9200/my-index?pretty" -ku admin:admin -H 'Content-Type: application/json' -d' -{ - "settings": { - "index": { - "number_of_shards": 1, - "number_of_replicas": 1, - "replication": { - "type": "SEGMENT" - }, - "remote_store": { - "enabled": true, - "repository": "segment-repo", - "translog": { - "enabled": true, - "repository": "translog-repo", - "buffer_interval": "300ms" - } - } - } - } -} -' -``` - -You can have the same repository serve as 
both the segment repository and translog repository. -{: .note} - -As data is added to the index, it also will be continuously uploaded to remote storage in the form of segment and translog files because of refreshes, flushes, and translog fsyncs to disk. Along with data, other metadata files will be uploaded. -The `buffer_interval` setting specifies the time interval during which translog operations are buffered. Instead of uploading individual translog files, OpenSearch creates a single translog file with all the write operations received during the configured interval. Bundling translog files leads to higher throughput but also increases latency. The default `buffer_interval` value is 100 ms. - -Setting `translog.enabled` to `true` is currently an irreversible operation. -{: .warning} - -### Restoring from a backup - -To restore an index from a remote backup, such as in the event of a node failure, you must first close the index: - -```bash -curl -X POST "https://localhost:9200/my-index/_close" -ku admin:admin -``` - -Restore the index from the backup stored on the remote repository: - -```bash -curl -X POST "https://localhost:9200/_remotestore/_restore" -ku admin:admin -H 'Content-Type: application/json' -d' -{ - "indices": ["my-index"] -} -' -``` - -If the security plugin is enabled, a user must have the `cluster:admin/remotestore/restore` permission. See [Access control](/security-plugin/access-control/index/) for information about configuring user permissions. -{: .note} - -## Potential use cases - -You can use remote-backed storage for the following purposes: - -- To restore red clusters or indexes -- To recover all data up to the last acknowledged write, regardless of replica count, if `index.translog.durability` is set to `request` - -## Known limitations - -The following are known limitations of the remote-backed storage feature: - -- Writing data to a remote store can be a high-latency operation when compared to writing data on the local file system. 
This may impact the indexing throughput and latency. For performance benchmarking results, see [issue #6376](https://github.com/opensearch-project/OpenSearch/issues/6376). - diff --git a/_tuning-your-cluster/cluster-manager-task-throttling.md b/_tuning-your-cluster/cluster-manager-task-throttling.md deleted file mode 100644 index ace4547d44..0000000000 --- a/_tuning-your-cluster/cluster-manager-task-throttling.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -layout: default -title: Cluster manager task throttling -nav_order: 10 -has_children: false ---- - -# Cluster manager task throttling - -For many cluster state updates, such as defining a mapping or creating an index, nodes submit tasks to the cluster manager. The cluster manager maintains a pending task queue for these tasks and runs them in a single-threaded environment. When nodes send tens of thousands of resource-intensive tasks, like `put-mapping` or snapshot tasks, these tasks can pile up in the queue and flood the cluster manager. This affects the cluster manager's performance and may in turn affect the availability of the whole cluster. - -The first line of defense is to implement mechanisms in the caller nodes to avoid task overload on the cluster manager. However, even with those mechanisms in place, the cluster manager needs a built-in way to protect itself: cluster manager task throttling. - -To turn on cluster manager task throttling, you need to set throttling limits. The cluster manager uses the throttling limits to determine whether to reject a task. - -The cluster manager rejects a task based on its type. For any incoming task, the cluster manager evaluates the total number of tasks of the same type in the pending task queue. If this number exceeds the threshold for this task type, the cluster manager rejects the incoming task. Rejecting a task does not affect tasks of a different type. 
For example, if the cluster manager rejects a `put-mapping` task, it can still accept a subsequent `create-index` task. - -When the cluster manager rejects a task, the node performs retries with exponential backoff to resubmit the task to the cluster manager. If retries are unsuccessful within the timeout period, OpenSearch returns a cluster timeout error. - -## Setting throttling limits - -You can set throttling limits by specifying them in the `cluster_manager.throttling.thresholds` object and updating the [OpenSearch cluster settings]({{site.url}}{{site.baseurl}}/api-reference/cluster-settings). The setting is dynamic, so you can change the behavior of this feature without restarting your cluster. - -By default, throttling is disabled for all task types. -{: .note} - -The request has the following format: - -```json -PUT _cluster/settings -{ - "persistent": { - "cluster_manager.throttling.thresholds" : { - "" : { - "value" : - } - } - } -} -``` - -The following table describes the `cluster_manager.throttling.thresholds` object. - -Field Name | Description -:--- | :--- -task-type | The task type. See [supported task types](#supported-task-types) for a list of valid values. -value | The maximum number of tasks of the `task-type` type in the cluster manager's pending task queue. Default is `-1` (no task throttling). 
- -## Supported task types - -The following task types are supported: - -- `create-index` -- `update-settings` -- `cluster-update-settings` -- `auto-create` -- `delete-index` -- `delete-dangling-index` -- `create-data-stream` -- `remove-data-stream` -- `rollover-index` -- `index-aliases` -- `put-mapping` -- `create-index-template` -- `remove-index-template` -- `create-component-template` -- `remove-component-template` -- `create-index-template-v2` -- `remove-index-template-v2` -- `put-pipeline` -- `delete-pipeline` -- `create-persistent-task` -- `finish-persistent-task` -- `remove-persistent-task` -- `update-task-state` -- `put-script` -- `delete-script` -- `put-repository` -- `delete-repository` -- `create-snapshot` -- `delete-snapshot` -- `update-snapshot-state` -- `restore-snapshot` -- `cluster-reroute-api` - -#### Example request - -The following request sets the throttling threshold for the `put-mapping` task type to 100: - -```json -PUT _cluster/settings -{ - "persistent": { - "cluster_manager.throttling.thresholds": { - "put-mapping": { - "value": 100 - } - } - } -} -``` - -Set the threshold to `-1` to disable throttling for a task type. -{: .note} - - -