From 7cf2153f82b74ae3174948d9c29784c353981c60 Mon Sep 17 00:00:00 2001
From: Vijayan Balasubramanian
Date: Mon, 20 Nov 2023 11:13:37 -0800
Subject: [PATCH 1/6] Add knnvector as a new workload

Create a new workload to benchmark performance of the knn_vector field type.
Added unit tests and a test procedure for the no-train case.

Signed-off-by: Vijayan Balasubramanian
---
 knnvector/README.md                      | 144 +++++++++++++++++++++++
 knnvector/__init__.py                    |   5 +
 knnvector/indices/nmslib-index.json      |  41 +++++++
 knnvector/operations/default.json        |  21 ++++
 knnvector/params/nmslib-sift-128-l2.json |  23 ++++
 knnvector/runners.py                     |  43 +++++++
 knnvector/test_procedures/default.json   |  62 ++++++++++
 knnvector/workload.json                  |  17 +++
 knnvector/workload.py                    |  11 ++
 9 files changed, 367 insertions(+)
 create mode 100644 knnvector/README.md
 create mode 100644 knnvector/__init__.py
 create mode 100644 knnvector/indices/nmslib-index.json
 create mode 100644 knnvector/operations/default.json
 create mode 100644 knnvector/params/nmslib-sift-128-l2.json
 create mode 100644 knnvector/runners.py
 create mode 100644 knnvector/test_procedures/default.json
 create mode 100644 knnvector/workload.json
 create mode 100644 knnvector/workload.py

diff --git a/knnvector/README.md b/knnvector/README.md
new file mode 100644
index 00000000..12726b40
--- /dev/null
+++ b/knnvector/README.md
@@ -0,0 +1,144 @@
+# KNN Vector Workload
+
+This workload is to benchmark performance of indexing and search of Vector Engine of Opensearch.
+
+## Datasets
+
+This workload currently supports datasets with either HDF5 format or Big-ann.
+You can download datasets from [here](http://corpus-texmex.irisa.fr/) to benchmark the quality of approximate k-NN algorithm from
+OpenSearch.
+
+## Current Procedures
+
+### No Train Test
+
+The No Train Test procedure is used to test `knn_vector` indices that do not use an algorithm that requires training.
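For orientation, the sketch below shows what creating such an index amounts to once the nmslib template in this patch is rendered. It is an illustrative sketch only: the host, the `m` value, and the client usage are assumptions, while the remaining values mirror the example param file.

```python
# Illustrative sketch, not part of the workload: create a knn_vector index
# equivalent to a rendered indices/nmslib-index.json. The host and the "m"
# value are assumptions; other values mirror params/nmslib-sift-128-l2.json.
from opensearchpy import OpenSearch

client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])

index_body = {
    "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}},
    "mappings": {
        "dynamic": "strict",
        "properties": {
            "target_field": {
                "type": "knn_vector",
                "dimension": 128,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {"ef_construction": 100, "m": 16},
                },
            }
        },
    },
}
client.indices.create(index="target_index", body=index_body)
```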
#### Parameters

This workload allows the following parameters to be specified using `--workload-params`:

| Name | Description |
|-----------------------------------------|--------------------------------------------------------------------------|
| target_index_name | Name of index to add vectors to |
| target_field_name | Name of field to add vectors to |
| target_index_body | Path to target index definition |
| target_index_primary_shards | Target index primary shards |
| target_index_replica_shards | Target index replica shards |
| target_index_dimension | Dimension of target index |
| target_index_space_type | Target index space type |
| target_index_bulk_size | Target index bulk size |
| target_index_bulk_index_data_set_format | Format of vector data set |
| target_index_bulk_index_data_set_path | Path to vector data set |
| target_index_bulk_index_clients | Clients to be used for bulk ingestion (must be divisor of data set size) |
| target_index_max_num_segments | Number of segments to merge target index down to before beginning search |
| target_index_force_merge_timeout | Timeout of force merge requests in seconds |
| hnsw_ef_search | HNSW ef search parameter |
| hnsw_ef_construction | HNSW ef construction parameter |
| hnsw_m | HNSW m parameter |
| query_k | The number of neighbors to return for the search |
| query_clients | Number of clients to use for running queries |
| query_data_set_format | Format of vector data set for queries |
| query_data_set_path | Path to vector data set for queries |
| query_count | Number of queries for search operation |



#### Metrics

The result metrics of this procedure will look like:
```
------------------------------------------------------
    _______             __   _____
   / ____(_)___  ____ _/ /  / ___/_________  ________
  / /_  / / __ \/ __ `/ /   \__ \/ ___/ __ \/ ___/ _ \
 / __/ / / / / / /_/ / /   ___/ / /__/ /_/ / /  /  __/
/_/   /_/_/ /_/\__,_/_/   /____/\___/\____/_/   \___/
------------------------------------------------------

| Metric | Task | Value | Unit |
|---------------------------------------------------------------:|-------------------:|------------:|-------:|
| Cumulative indexing time of primary shards | | 0.00946667 | min |
| Min cumulative indexing time across primary shards | | 0 | min |
| Median cumulative indexing time across primary shards | | 0.00298333 | min |
| Max cumulative indexing time across primary shards | | 0.00336667 | min |
| Cumulative indexing throttle time of primary shards | | 0 | min |
| Min cumulative indexing throttle time across primary shards | | 0 | min |
| Median cumulative indexing throttle time across primary shards | | 0 | min |
| Max cumulative indexing throttle time across primary shards | | 0 | min |
| Cumulative merge time of primary shards | | 0 | min |
| Cumulative merge count of primary shards | | 0 | |
| Min cumulative merge time across primary shards | | 0 | min |
| Median cumulative merge time across primary shards | | 0 | min |
| Max cumulative merge time across primary shards | | 0 | min |
| Cumulative merge throttle time of primary shards | | 0 | min |
| Min cumulative merge throttle time across primary shards | | 0 | min |
| Median cumulative merge throttle time across primary shards | | 0 | min |
| Max cumulative merge throttle time across primary shards | | 0 | min |
| Cumulative refresh time of primary shards | | 0.00861667 | min |
| Cumulative refresh count of primary shards | | 33 | |
| Min cumulative refresh time across primary shards | | 0 | min |
| Median cumulative refresh time across primary shards | | 0.00268333 | min |
| Max cumulative refresh time across primary shards | | 0.00291667 | min |
| Cumulative flush time of primary shards | | 0.000183333 | min |
| Cumulative flush count of primary shards | | 2 | |
| Min cumulative flush time across primary shards | | 0 | min |
| Median cumulative flush time across primary shards | | 0 | min |
| Max cumulative flush time across primary shards | | 0.000183333 | min |
| Total Young Gen GC time | | 0.075 | s |
| Total Young Gen GC count | | 17 | |
| Total Old Gen GC time | | 0 | s |
| Total Old Gen GC count | | 0 | |
| Store size | | 0.00869293 | GB |
| Translog size | | 2.56114e-07 | GB |
| Heap used for segments | | 0 | MB |
| Heap used for doc values | | 0 | MB |
| Heap used for terms | | 0 | MB |
| Heap used for norms | | 0 | MB |
| Heap used for points | | 0 | MB |
| Heap used for stored fields | | 0 | MB |
| Segment count | | 9 | |
| Min Throughput | custom-vector-bulk | 25527 | docs/s |
| Mean Throughput | custom-vector-bulk | 25527 | docs/s |
| Median Throughput | custom-vector-bulk | 25527 | docs/s |
| Max Throughput | custom-vector-bulk | 25527 | docs/s |
| 50th percentile latency | custom-vector-bulk | 36.3095 | ms |
| 90th percentile latency | custom-vector-bulk | 52.2662 | ms |
| 100th percentile latency | custom-vector-bulk | 68.6513 | ms |
| 50th percentile service time | custom-vector-bulk | 36.3095 | ms |
| 90th percentile service time | custom-vector-bulk | 52.2662 | ms |
| 100th percentile service time | custom-vector-bulk | 68.6513 | ms |
| error rate | custom-vector-bulk | 0 | % |
| Min Throughput | prod-queries | 211.26 | ops/s |
| Mean Throughput | prod-queries | 213.85 | ops/s |
| Median Throughput | prod-queries | 213.48 | ops/s |
| Max Throughput | prod-queries | 216.49 | ops/s |
| 50th percentile latency | prod-queries | 3.43393 | ms |
| 90th percentile latency | prod-queries | 4.01881 | ms |
| 99th percentile latency | prod-queries | 5.56238 | ms |
| 99.9th percentile latency | prod-queries | 9.95666 | ms |
| 99.99th percentile latency | prod-queries | 39.7922 | ms |
| 100th percentile latency | prod-queries | 62.415 | ms |
| 50th percentile service time | prod-queries | 3.43405 | ms |
| 90th percentile service time | prod-queries | 4.0191 | ms |
| 99th percentile service time | prod-queries | 5.56316 | ms |
| 99.9th percentile service time | prod-queries | 9.95666 | ms |
| 99.99th percentile service time | prod-queries | 39.7922 | ms |
| 100th percentile service time | prod-queries | 62.415 | ms |
| error rate | prod-queries | 0 | % |


---------------------------------
[INFO] SUCCESS (took 119 seconds)
---------------------------------

```


### Custom Runners

Custom runners are defined in [runners.py](runners.py).

| Syntax | Description | Parameters |
|--------------------|-----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------|
| warmup-knn-indices | Warm up knn indices with retry until success. | 1. index - name of index to warmup |
diff --git a/knnvector/__init__.py b/knnvector/__init__.py
new file mode 100644
index 00000000..ff4fd04d
--- /dev/null
+++ b/knnvector/__init__.py
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.

diff --git a/knnvector/indices/nmslib-index.json b/knnvector/indices/nmslib-index.json
new file mode 100644
index 00000000..d115e9f7
--- /dev/null
+++ b/knnvector/indices/nmslib-index.json
@@ -0,0 +1,41 @@
+{
+  "settings": {
+    "index": {
+      "knn": true
+      {%- if target_index_primary_shards is defined and target_index_primary_shards %}
+      ,"number_of_shards": {{ target_index_primary_shards }}
+      {%- endif %}
+      {%- if target_index_replica_shards is defined and target_index_replica_shards %}
+      ,"number_of_replicas": {{ target_index_replica_shards }}
+      {%- endif %}
+      {%- if hnsw_ef_search is defined and hnsw_ef_search %}
+      ,"knn.algo_param.ef_search": {{ hnsw_ef_search }}
+      {%- endif %}
+    }
+  },
+  "mappings": {
+    "dynamic": "strict",
+    "properties": {
+      "target_field": {
+        "type": "knn_vector",
+        "dimension": {{ target_index_dimension }},
+        "method": {
+          "name": "hnsw",
+          "space_type": "{{ target_index_space_type }}",
+          "engine": "nmslib",
+          "parameters": {
+            {%- if hnsw_ef_construction is defined and hnsw_ef_construction %}
+            "ef_construction": {{ hnsw_ef_construction }}
+            {%- endif %}
+            {%- if hnsw_m is defined and hnsw_m %}
+            {%- if hnsw_ef_construction is defined and hnsw_ef_construction %}
+            ,
+            {%- endif %}
+            "m": {{ hnsw_m }}
+            {%- endif %}
+          }
+        }
+      }
+    }
+  }
+}

diff --git a/knnvector/operations/default.json b/knnvector/operations/default.json
new file mode 100644
index 00000000..02265249
--- /dev/null
+++ b/knnvector/operations/default.json
@@ -0,0 +1,21 @@
+{
+  "name": "warmup-indices",
+  "operation-type": "warmup-knn-indices",
+  "index": "{{ target_index_name | default('target_index') }}",
+  "include-in-results_publishing": false
+},
+{
+  "name": "force-merge",
+  "operation-type": "force-merge",
+  "request-timeout": {{ target_index_force_merge_timeout | default(7200) }},
+  "index": "{{ target_index_name | default('target_index') }}",
+  "mode": "polling",
+  "max-num-segments": {{ target_index_max_num_segments | default(1) }},
+  "include-in-results_publishing": false
+},
+{
+  "name": "refresh-target-index",
+  "operation-type": "refresh",
+  "retries": 100,
+  "index": "{{ target_index_name | default('target_index') }}"
+}

diff --git a/knnvector/params/nmslib-sift-128-l2.json b/knnvector/params/nmslib-sift-128-l2.json
new file mode 100644
index 00000000..6c461eda
--- /dev/null
+++ b/knnvector/params/nmslib-sift-128-l2.json
@@ -0,0 +1,23 @@
+{
+  "target_index_name": "target_index",
+  "target_field_name": "target_field",
+  "target_index_body": "indices/nmslib-index.json",
+  "target_index_primary_shards": 1,
+  "target_index_dimension": 128,
+  "target_index_space_type": "l2",
+
+  "target_index_bulk_size": 100,
+  "target_index_bulk_index_data_set_format": "hdf5",
+  "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5",
+  "target_index_bulk_index_clients": 10,
+
+  "target_index_max_num_segments": 10,
+  "target_index_force_merge_timeout": 45.0,
+  "hnsw_ef_search": 100,
+  "hnsw_ef_construction": 100,
+  "query_k": 100,
+
+  "query_data_set_format": "hdf5",
+  "query_data_set_path": "/tmp/sift-128-euclidean.hdf5",
+  "query_count": 100
+}

diff --git a/knnvector/runners.py b/knnvector/runners.py
new file mode 100644
index 00000000..d0e5a96c
--- /dev/null
+++ b/knnvector/runners.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+
+import logging
+
+from opensearchpy.exceptions import ConnectionTimeout
+from osbenchmark.worker_coordinator.runner import Retry, Runner
+
+from osbenchmark.utils.parse import parse_int_parameter, parse_string_parameter
+
+
+def register(registry):
+    # The warmup API is idempotent, so we can safely retry until it completes. This is
+    # required so that searches can run without any initial load penalty.
+    registry.register_runner(
+        WarmupIndicesRunner.RUNNER_NAME, Retry(WarmupIndicesRunner(), retry_until_success=True), async_runner=True
+    )
+
+
+class WarmupIndicesRunner(Runner):
+    """
+    WarmupIndicesRunner loads all the native library files for all of the
+    shards (primaries and replicas) of all the indices.
+    """
+    RUNNER_NAME = "warmup-knn-indices"
+
+    async def __call__(self, opensearch, params):
+        index = parse_string_parameter("index", params)
+        method = "GET"
+        warmup_url = "/_plugins/_knn/warmup/{}".format(index)
+        result = {'success': False}
+        response = await opensearch.transport.perform_request(method, warmup_url)
+        if response is None or response['_shards'] is None:
+            return result
+        if response['_shards']['failed'] == 0:
+            result['success'] = True
+        return result
+
+    def __repr__(self, *args, **kwargs):
+        return self.RUNNER_NAME

diff --git a/knnvector/test_procedures/default.json b/knnvector/test_procedures/default.json
new file mode 100644
index 00000000..e0e7cfc0
--- /dev/null
+++ b/knnvector/test_procedures/default.json
@@ -0,0 +1,62 @@
+{
+  "name": "no-train-test",
+  "description": "Index vector search that do not use an algorithm that requires training.",
+  "default": true,
+  "schedule": [
+    {
+      "operation": {
+        "name": "delete-target-index",
+        "operation-type": "delete-index",
+        "only-if-exists": true,
+        "index": "{{ target_index_name | default('target_index') }}"
+      }
+    },
+    {
+      "operation": {
+        "name": "create-target-index",
+        "operation-type": "create-index",
+        "index": "{{ target_index_name | default('target_index') }}"
+      }
+    },
+    {
+      "operation": {
+        "name": "custom-vector-bulk",
+        "operation-type": "bulk-vector-data-set",
+        "index": "{{ target_index_name | default('target_index') }}",
+        "field": "{{ target_field_name | default('target_field') }}",
+        "bulk_size": {{ target_index_bulk_size | default(500)}},
+        "data_set_format": "{{ target_index_bulk_index_data_set_format | default('hdf5') }}",
+        "data_set_path": "{{ target_index_bulk_index_data_set_path }}",
+        "num_vectors": {{ target_index_num_vectors | default(-1) }},
+        "id-field-name": "{{ id_field_name }}"
+      },
+      "clients": {{ target_index_bulk_index_clients | default(1)}}
+    },
+    {
+      "name" : "refresh-target-index",
+      "operation" : "refresh-target-index"
+    },
+    {
+      "name" : "force-merge-segments",
+      "operation" : "force-merge"
+    },
+    {
+      "name" : "warmup-indices",
+      "operation" : "warmup-indices",
+      "index": "{{ target_index_name | default('target_index') }}"
+    },
+    {
+      "operation": {
+        "name": "prod-queries",
+        "operation-type": "vector-search",
+        "index": "{{ target_index_name | default('target_index') }}",
+        "k": {{ query_k | default(100) }},
+        "field" : "{{ target_field_name | default('target_field') }}",
+        "data_set_format" : "{{ query_data_set_format | default('hdf5') }}",
+        "data_set_path" : "{{ query_data_set_path | 
default('/tmp/vector-dataset.hdf5') }}", + "num_vectors" : {{ query_count | default(-1) }}, + "id-field-name": "{{ id_field_name }}" + } + } + ] +} diff --git a/knnvector/workload.json b/knnvector/workload.json new file mode 100644 index 00000000..c4f938cd --- /dev/null +++ b/knnvector/workload.json @@ -0,0 +1,17 @@ +{% import "benchmark.helpers" as benchmark with context %} +{ + "version": 2, + "description": "Benchmark for knn_vector field type", + "indices": [ + { + "name": "{{ target_index_name }}", + "body": "{{ target_index_body }}" + } + ], + "operations": [ + {{ benchmark.collect(parts="operations/*.json") }} + ], + "test_procedures": [ + {{ benchmark.collect(parts="test_procedures/*.json") }} + ] +} diff --git a/knnvector/workload.py b/knnvector/workload.py new file mode 100644 index 00000000..6f51e524 --- /dev/null +++ b/knnvector/workload.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. + +from .runners import register as register_runners + + +def register(registry): + register_runners(registry) From 91ac30ff827b24f26d94966a8bee9fe9a821815e Mon Sep 17 00:00:00 2001 From: Vijayan Balasubramanian Date: Tue, 2 Jan 2024 22:31:21 -0800 Subject: [PATCH 2/6] Update README Update readme to include how to execute this workload. Signed-off-by: Vijayan Balasubramanian --- knnvector/README.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/knnvector/README.md b/knnvector/README.md index 12726b40..05924114 100644 --- a/knnvector/README.md +++ b/knnvector/README.md @@ -8,6 +8,37 @@ This workload currently supports datasets with either HDF5 format or Big-ann. You can download datasets from [here](http://corpus-texmex.irisa.fr/) to benchmark the quality of approximate k-NN algorithm from OpenSearch. +### Running a benchmark + +Before running a benchmark, make sure you have the endpoint of your cluster and + the machine you are running the benchmarks from, can access it. + Additionally, ensure that all data has been pulled to the client. + +Currently, we support one test procedures for the k-NN workload: +no-train-test that does not have steps to train a model included in the +schedule. This test procedures will index a data set +of vectors into an OpenSearch index and then run a set of queries against them. + +To run test procedure, open up +[params/no-train-params.json](params/no-train-params.json) and +fill out the mandatory parameters. +Once the parameters are set, set the URL and PORT of your cluster and run the +command to run the test procedure. 
+
+```
+export URL=
+export PORT=
+export PARAMS_FILE=
+export PROCEDURE="no-train-test"
+
+opensearch-benchmark execute_test \
+    --target-hosts $URL:$PORT \
+    --workload-path ./workload.json \
+    --workload-params ${PARAMS_FILE} \
+    --test-procedure=${PROCEDURE} \
+    --pipeline benchmark-only
+```
+
 ## Current Procedures
 
 ### No Train Test

From a4fe7c9fb54c7b9db09ad8e3418a5b00705af74d Mon Sep 17 00:00:00 2001
From: Vijayan Balasubramanian
Date: Wed, 3 Jan 2024 13:55:52 -0800
Subject: [PATCH 3/6] Add new param file for faiss engine

Added a new param file to index and search vectors using faiss
as the engine type.

Signed-off-by: Vijayan Balasubramanian
---
 knnvector/indices/faiss-index.json      | 41 +++++++++++++++++++++++
 knnvector/params/faiss-sift-128-l2.json | 23 ++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 knnvector/indices/faiss-index.json
 create mode 100644 knnvector/params/faiss-sift-128-l2.json

diff --git a/knnvector/indices/faiss-index.json b/knnvector/indices/faiss-index.json
new file mode 100644
index 00000000..0f093d20
--- /dev/null
+++ b/knnvector/indices/faiss-index.json
@@ -0,0 +1,41 @@
+{
+  "settings": {
+    "index": {
+      "knn": true
+      {%- if target_index_primary_shards is defined and target_index_primary_shards %}
+      ,"number_of_shards": {{ target_index_primary_shards }}
+      {%- endif %}
+      {%- if target_index_replica_shards is defined and target_index_replica_shards %}
+      ,"number_of_replicas": {{ target_index_replica_shards }}
+      {%- endif %}
+      {%- if hnsw_ef_search is defined and hnsw_ef_search %}
+      ,"knn.algo_param.ef_search": {{ hnsw_ef_search }}
+      {%- endif %}
+    }
+  },
+  "mappings": {
+    "dynamic": "strict",
+    "properties": {
+      "target_field": {
+        "type": "knn_vector",
+        "dimension": {{ target_index_dimension }},
+        "method": {
+          "name": "hnsw",
+          "space_type": "{{ target_index_space_type }}",
+          "engine": "faiss",
+          "parameters": {
+            {%- if hnsw_ef_construction is defined and hnsw_ef_construction %}
+            "ef_construction": {{ hnsw_ef_construction }}
+            {%- endif %}
+            {%- if hnsw_m is defined and hnsw_m %}
+            {%- if hnsw_ef_construction is defined and hnsw_ef_construction %}
+            ,
+            {%- endif %}
+            "m": {{ hnsw_m }}
+            {%- endif %}
+          }
+        }
+      }
+    }
+  }
+}

diff --git a/knnvector/params/faiss-sift-128-l2.json b/knnvector/params/faiss-sift-128-l2.json
new file mode 100644
index 00000000..259994cb
--- /dev/null
+++ b/knnvector/params/faiss-sift-128-l2.json
@@ -0,0 +1,23 @@
+{
+  "target_index_name": "target_index",
+  "target_field_name": "target_field",
+  "target_index_body": "indices/faiss-index.json",
+  "target_index_primary_shards": 1,
+  "target_index_dimension": 128,
+  "target_index_space_type": "l2",
+
+  "target_index_bulk_size": 100,
+  "target_index_bulk_index_data_set_format": "hdf5",
+  "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5",
+  "target_index_bulk_index_clients": 10,
+
+  "target_index_max_num_segments": 10,
+  "target_index_force_merge_timeout": 45.0,
+  "hnsw_ef_search": 100,
+  "hnsw_ef_construction": 100,
+  "query_k": 100,
+
+  "query_data_set_format": "hdf5",
+  "query_data_set_path": "/tmp/sift-128-euclidean.hdf5",
+  "query_count": 100
+}

From 5320f3ff7d72b9883cac3c43777a62ef0c0b03a6 Mon Sep 17 00:00:00 2001
From: Vijayan Balasubramanian
Date: Mon, 8 Jan 2024 17:48:28 -0800
Subject: [PATCH 4/6] Rename knnvector to vectorsearch

Signed-off-by: Vijayan Balasubramanian
---
 {knnvector => vectorsearch}/README.md      | 27 ++++++++++---------
 {knnvector => vectorsearch}/__init__.py    |  0
 .../indices/faiss-index.json               |  0
 .../indices/nmslib-index.json              |  0
 .../operations/default.json                |  0
 .../params/faiss-sift-128-l2.json          |  0
 .../params/nmslib-sift-128-l2.json         |  0
 {knnvector => vectorsearch}/runners.py     |  0
 .../test_procedures/default.json           |  0
 {knnvector => vectorsearch}/workload.json  |  0
 {knnvector => vectorsearch}/workload.py    |  0
 11 files changed, 14 insertions(+), 13 deletions(-)
 rename {knnvector => vectorsearch}/README.md (93%)
 rename {knnvector => vectorsearch}/__init__.py (100%)
 rename {knnvector => vectorsearch}/indices/faiss-index.json (100%)
 rename {knnvector => vectorsearch}/indices/nmslib-index.json (100%)
 rename {knnvector => vectorsearch}/operations/default.json (100%)
 rename {knnvector => vectorsearch}/params/faiss-sift-128-l2.json (100%)
 rename {knnvector => vectorsearch}/params/nmslib-sift-128-l2.json (100%)
 rename {knnvector => vectorsearch}/runners.py (100%)
 rename {knnvector => vectorsearch}/test_procedures/default.json (100%)
 rename {knnvector => vectorsearch}/workload.json (100%)
 rename {knnvector => vectorsearch}/workload.py (100%)

diff --git a/knnvector/README.md b/vectorsearch/README.md
similarity index 93%
rename from knnvector/README.md
rename to vectorsearch/README.md
index 05924114..a22c8c19 100644
--- a/knnvector/README.md
+++ b/vectorsearch/README.md
@@ -1,4 +1,4 @@
-# KNN Vector Workload
+# Vector Search Workload
 
 This workload is to benchmark performance of indexing and search of Vector Engine of Opensearch.
 
@@ -14,28 +14,26 @@ Before running a benchmark, make sure you have the endpoint of your cluster and
  the machine you are running the benchmarks from, can access it.
  Additionally, ensure that all data has been pulled to the client.
 
-Currently, we support one test procedures for the k-NN workload:
+Currently, we support one test procedures for the vector search workload:
 no-train-test that does not have steps to train a model included in the
 schedule. This test procedures will index a data set
 of vectors into an OpenSearch index and then run a set of queries against them.
 
-To run test procedure, open up
-[params/no-train-params.json](params/no-train-params.json) and
-fill out the mandatory parameters.
-Once the parameters are set, set the URL and PORT of your cluster and run the
-command to run the test procedure.
+Due to the number of parameters this workload offers, it's recommended to create a parameters file and feed that
+into the command line. Users are welcome to use the example param files,
+`faiss-sift-128-l2.json`or `nmslib-sift-128-l2.json` in `/params`, as references.
+
+To run the workload, invoke the following command with the params file.
 
 ```
 export URL=
 export PORT=
 export PARAMS_FILE=
-export PROCEDURE="no-train-test"
 
-opensearch-benchmark execute_test \
-    --target-hosts $URL:$PORT \
-    --workload-path ./workload.json \
+opensearch-benchmark execute-test \
+    --target-hosts $URL:$PORT \
+    --workload vectorsearch \
     --workload-params ${PARAMS_FILE} \
-    --test-procedure=${PROCEDURE} \
     --pipeline benchmark-only
 ```
 
@@ -43,7 +41,10 @@ opensearch-benchmark execute_test \
 
 ### No Train Test
 
-The No Train Test procedure is used to test `knn_vector` indices that do not use an algorithm that requires training.
+The No Train Test procedure is used to test vector search indices which require no training.
+You can define the underlying configuration of the vector search algorithm, such as the specific engine, space type, etc., as a
+method definition. Check [vector search method definitions](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions)
+for more details.
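For context, the queries issued by the search phase of this procedure take the shape of an approximate k-NN query. The snippet below is a sketch only; the host, the index and field names, and the zero query vector are assumptions based on the example param files.

```python
# Illustrative sketch only: the shape of an approximate k-NN query against
# this workload's index. Host, index/field names, and the query vector are
# assumptions based on the example param files.
from opensearchpy import OpenSearch

client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])

body = {
    "size": 100,
    "query": {
        "knn": {
            "target_field": {
                "vector": [0.0] * 128,  # a 128-dimensional query vector
                "k": 100,
            }
        }
    },
}
response = client.search(index="target_index", body=body)
print([hit["_id"] for hit in response["hits"]["hits"]][:5])  # top neighbors
```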
#### Parameters diff --git a/knnvector/__init__.py b/vectorsearch/__init__.py similarity index 100% rename from knnvector/__init__.py rename to vectorsearch/__init__.py diff --git a/knnvector/indices/faiss-index.json b/vectorsearch/indices/faiss-index.json similarity index 100% rename from knnvector/indices/faiss-index.json rename to vectorsearch/indices/faiss-index.json diff --git a/knnvector/indices/nmslib-index.json b/vectorsearch/indices/nmslib-index.json similarity index 100% rename from knnvector/indices/nmslib-index.json rename to vectorsearch/indices/nmslib-index.json diff --git a/knnvector/operations/default.json b/vectorsearch/operations/default.json similarity index 100% rename from knnvector/operations/default.json rename to vectorsearch/operations/default.json diff --git a/knnvector/params/faiss-sift-128-l2.json b/vectorsearch/params/faiss-sift-128-l2.json similarity index 100% rename from knnvector/params/faiss-sift-128-l2.json rename to vectorsearch/params/faiss-sift-128-l2.json diff --git a/knnvector/params/nmslib-sift-128-l2.json b/vectorsearch/params/nmslib-sift-128-l2.json similarity index 100% rename from knnvector/params/nmslib-sift-128-l2.json rename to vectorsearch/params/nmslib-sift-128-l2.json diff --git a/knnvector/runners.py b/vectorsearch/runners.py similarity index 100% rename from knnvector/runners.py rename to vectorsearch/runners.py diff --git a/knnvector/test_procedures/default.json b/vectorsearch/test_procedures/default.json similarity index 100% rename from knnvector/test_procedures/default.json rename to vectorsearch/test_procedures/default.json diff --git a/knnvector/workload.json b/vectorsearch/workload.json similarity index 100% rename from knnvector/workload.json rename to vectorsearch/workload.json diff --git a/knnvector/workload.py b/vectorsearch/workload.py similarity index 100% rename from knnvector/workload.py rename to vectorsearch/workload.py From 48b20ffc2947d24e2386368a4712765cf62c82f8 Mon Sep 17 00:00:00 2001 From: Vijayan Balasubramanian Date: Tue, 9 Jan 2024 11:09:03 -0800 Subject: [PATCH 5/6] Add lucene engine Signed-off-by: Vijayan Balasubramanian --- vectorsearch/README.md | 2 +- vectorsearch/indices/lucene-index.json | 41 +++++++++++++++++++++ vectorsearch/params/lucene-sift-128-l2.json | 23 ++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 vectorsearch/indices/lucene-index.json create mode 100644 vectorsearch/params/lucene-sift-128-l2.json diff --git a/vectorsearch/README.md b/vectorsearch/README.md index a22c8c19..8d019933 100644 --- a/vectorsearch/README.md +++ b/vectorsearch/README.md @@ -21,7 +21,7 @@ of vectors into an OpenSearch index and then run a set of queries against them. Due to the number of parameters this workload offers, it's recommended to create a parameters file and feed that into the command line. Users are welcome to use the example param files, -`faiss-sift-128-l2.json`or `nmslib-sift-128-l2.json` in `/params`, as references. +`faiss-sift-128-l2.json`, `nmslib-sift-128-l2.json`, or `lucene-sift-128-l2.json` in `/params`, as references. To run the workload, invoke the following command with the params file. 
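Since the engine templates differ only in the `engine` value of their method definition, a rendered template can be sanity-checked outside of OSB. The sketch below does this with Jinja2 directly, which is an assumption made for illustration; OSB performs its own rendering of these templates with the workload params.

```python
# Sketch only: render one of the index templates with example param values
# and confirm the result is valid JSON. Direct Jinja2 use here is an
# assumption; OSB renders these templates itself during a test.
import json
from jinja2 import Template

with open("vectorsearch/indices/faiss-index.json") as f:
    template = Template(f.read())

rendered = template.render(
    target_index_primary_shards=1,
    target_index_dimension=128,
    target_index_space_type="l2",
    hnsw_ef_search=100,
    hnsw_ef_construction=100,
)
print(json.dumps(json.loads(rendered), indent=2))  # raises if not valid JSON
```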
diff --git a/vectorsearch/indices/lucene-index.json b/vectorsearch/indices/lucene-index.json
new file mode 100644
index 00000000..041cc416
--- /dev/null
+++ b/vectorsearch/indices/lucene-index.json
@@ -0,0 +1,41 @@
+{
+  "settings": {
+    "index": {
+      "knn": true
+      {%- if target_index_primary_shards is defined and target_index_primary_shards %}
+      ,"number_of_shards": {{ target_index_primary_shards }}
+      {%- endif %}
+      {%- if target_index_replica_shards is defined and target_index_replica_shards %}
+      ,"number_of_replicas": {{ target_index_replica_shards }}
+      {%- endif %}
+      {%- if hnsw_ef_search is defined and hnsw_ef_search %}
+      ,"knn.algo_param.ef_search": {{ hnsw_ef_search }}
+      {%- endif %}
+    }
+  },
+  "mappings": {
+    "dynamic": "strict",
+    "properties": {
+      "target_field": {
+        "type": "knn_vector",
+        "dimension": {{ target_index_dimension }},
+        "method": {
+          "name": "hnsw",
+          "space_type": "{{ target_index_space_type }}",
+          "engine": "lucene",
+          "parameters": {
+            {%- if hnsw_ef_construction is defined and hnsw_ef_construction %}
+            "ef_construction": {{ hnsw_ef_construction }}
+            {%- endif %}
+            {%- if hnsw_m is defined and hnsw_m %}
+            {%- if hnsw_ef_construction is defined and hnsw_ef_construction %}
+            ,
+            {%- endif %}
+            "m": {{ hnsw_m }}
+            {%- endif %}
+          }
+        }
+      }
+    }
+  }
+}

diff --git a/vectorsearch/params/lucene-sift-128-l2.json b/vectorsearch/params/lucene-sift-128-l2.json
new file mode 100644
index 00000000..a42ea9c3
--- /dev/null
+++ b/vectorsearch/params/lucene-sift-128-l2.json
@@ -0,0 +1,23 @@
+{
+  "target_index_name": "target_index",
+  "target_field_name": "target_field",
+  "target_index_body": "indices/lucene-index.json",
+  "target_index_primary_shards": 1,
+  "target_index_dimension": 128,
+  "target_index_space_type": "l2",
+
+  "target_index_bulk_size": 100,
+  "target_index_bulk_index_data_set_format": "hdf5",
+  "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5",
+  "target_index_bulk_index_clients": 10,
+
+  "target_index_max_num_segments": 10,
+  "target_index_force_merge_timeout": 45.0,
+  "hnsw_ef_search": 100,
+  "hnsw_ef_construction": 100,
+  "query_k": 100,
+
+  "query_data_set_format": "hdf5",
+  "query_data_set_path": "/tmp/sift-128-euclidean.hdf5",
+  "query_count": 100
+}

From 6835e959cb2748b0cd2e7f242a8081092afa2f26 Mon Sep 17 00:00:00 2001
From: Vijayan Balasubramanian
Date: Wed, 17 Jan 2024 15:44:19 -0800
Subject: [PATCH 6/6] Fix code review comments

Signed-off-by: Vijayan Balasubramanian
---
 vectorsearch/README.md                      | 40 +++++++++++----------
 vectorsearch/params/faiss-sift-128-l2.json  |  2 +-
 vectorsearch/params/lucene-sift-128-l2.json |  2 +-
 vectorsearch/params/nmslib-sift-128-l2.json |  2 +-
 vectorsearch/runners.py                     |  4 +--
 vectorsearch/test_procedures/default.json   |  4 +--
 vectorsearch/workload.json                  |  2 +-
 7 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/vectorsearch/README.md b/vectorsearch/README.md
index 8d019933..d460a472 100644
--- a/vectorsearch/README.md
+++ b/vectorsearch/README.md
@@ -1,40 +1,40 @@
 # Vector Search Workload
 
-This workload is to benchmark performance of indexing and search of Vector Engine of Opensearch.
+This workload benchmarks the indexing and search performance of the Vector Engine of OpenSearch.
 
 ## Datasets
 
-This workload currently supports datasets with either HDF5 format or Big-ann.
+This workload currently supports datasets with either HDF5 format or Big-ANN.
 You can download datasets from [here](http://corpus-texmex.irisa.fr/) to benchmark the quality of approximate k-NN algorithm from
 OpenSearch.
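A downloaded dataset can be inspected before a run. The sketch below assumes the ann-benchmarks style HDF5 layout that the SIFT link above provides, at the path used in the example param files.

```python
# Sketch only: inspect an HDF5 vector dataset before a benchmark run.
# The path and the train/test group layout are assumptions based on the
# SIFT dataset referenced above and the example param files.
import h5py

with h5py.File("/tmp/sift-128-euclidean.hdf5", "r") as f:
    print(list(f.keys()))    # typically ['distances', 'neighbors', 'test', 'train']
    print(f["train"].shape)  # base vectors that get bulk indexed
    print(f["test"].shape)   # query vectors used by the search phase
```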
 ### Running a benchmark
 
-Before running a benchmark, make sure you have the endpoint of your cluster and
- the machine you are running the benchmarks from, can access it.
- Additionally, ensure that all data has been pulled to the client.
+Before running a benchmark, ensure that the load generation host is able to access your cluster endpoint and that the
+appropriate dataset is available on the host.
 
-Currently, we support one test procedures for the vector search workload:
-no-train-test that does not have steps to train a model included in the
-schedule. This test procedures will index a data set
-of vectors into an OpenSearch index and then run a set of queries against them.
+Currently, we support only one test procedure for the vector search workload. This is named no-train-test and does not include the steps required to train the model being used.
+This test procedure will index a data set of vectors into an OpenSearch cluster and then run a set of queries against the generated index.
 
-Due to the number of parameters this workload offers, it's recommended to create a parameters file and feed that
-into the command line. Users are welcome to use the example param files,
-`faiss-sift-128-l2.json`, `nmslib-sift-128-l2.json`, or `lucene-sift-128-l2.json` in `/params`, as references.
+Due to the number of parameters this workload offers, it's recommended to create a parameter file that specifies the desired workload
+parameters instead of listing them all on the OSB command line. Users are welcome to use the example param files,
+`faiss-sift-128-l2.json`, `nmslib-sift-128-l2.json`, or `lucene-sift-128-l2.json` in `/params`, as references. Here, we named
+the parameter file using the format `<engine>-<dataset>-<dimension>-<space-type>.json`.
 
 To run the workload, invoke the following command with the params file.
 
 ```
-export URL=
-export PORT=
+# OpenSearch cluster endpoint URL with hostname and port
+export ENDPOINT=
+# Absolute file path of the workload param file
 export PARAMS_FILE=
 
 opensearch-benchmark execute-test \
-    --target-hosts $URL:$PORT \
+    --target-hosts $ENDPOINT \
     --workload vectorsearch \
     --workload-params ${PARAMS_FILE} \
-    --pipeline benchmark-only
+    --pipeline benchmark-only \
+    --kill-running-processes
 ```
 
 ## Current Procedures
 
@@ -76,9 +76,11 @@ This workload allows the following parameters to be specified using `--workload-params`:
 
 
 
-#### Metrics
+#### Sample Output
+
+The output of a sample test run is provided below. Metrics are captured in the results data store as usual, and this can be configured to be
+either in-memory, or an external OpenSearch cluster.
 
-The result metrics of this procedure will look like:
 ```
 ------------------------------------------------------
     _______             __   _____
@@ -169,7 +171,7 @@ The result metrics of this procedure will look like:
 
 ### Custom Runners
 
-Custom runners are defined in [runners.py](runners.py).
+Currently, there is only one custom runner defined in [runners.py](runners.py).
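For reference, the warmup request that this runner issues can also be reproduced standalone, as sketched below; the host and index name are assumptions.

```python
# Sketch only: the k-NN warmup call that the warmup-knn-indices runner
# performs, issued directly with opensearch-py. Host and index name are
# assumptions; the endpoint itself comes from the k-NN plugin.
from opensearchpy import OpenSearch

client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])
response = client.transport.perform_request("GET", "/_plugins/_knn/warmup/target_index")
print(response["_shards"])  # the runner treats zero failed shards as success
```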
| Syntax | Description | Parameters | |--------------------|-----------------------------------------------------|:-------------------------------------------------------------------------------------------------------------| diff --git a/vectorsearch/params/faiss-sift-128-l2.json b/vectorsearch/params/faiss-sift-128-l2.json index 259994cb..84b8f66f 100644 --- a/vectorsearch/params/faiss-sift-128-l2.json +++ b/vectorsearch/params/faiss-sift-128-l2.json @@ -9,7 +9,7 @@ "target_index_bulk_size": 100, "target_index_bulk_index_data_set_format": "hdf5", "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5", - "target_index_bulk_index_clients": 10, + "target_index_bulk_indexing_clients": 10, "target_index_max_num_segments": 10, "target_index_force_merge_timeout": 45.0, diff --git a/vectorsearch/params/lucene-sift-128-l2.json b/vectorsearch/params/lucene-sift-128-l2.json index a42ea9c3..c962f1e9 100644 --- a/vectorsearch/params/lucene-sift-128-l2.json +++ b/vectorsearch/params/lucene-sift-128-l2.json @@ -9,7 +9,7 @@ "target_index_bulk_size": 100, "target_index_bulk_index_data_set_format": "hdf5", "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5", - "target_index_bulk_index_clients": 10, + "target_index_bulk_indexing_clients": 10, "target_index_max_num_segments": 10, "target_index_force_merge_timeout": 45.0, diff --git a/vectorsearch/params/nmslib-sift-128-l2.json b/vectorsearch/params/nmslib-sift-128-l2.json index 6c461eda..81576a91 100644 --- a/vectorsearch/params/nmslib-sift-128-l2.json +++ b/vectorsearch/params/nmslib-sift-128-l2.json @@ -9,7 +9,7 @@ "target_index_bulk_size": 100, "target_index_bulk_index_data_set_format": "hdf5", "target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5", - "target_index_bulk_index_clients": 10, + "target_index_bulk_indexing_clients": 10, "target_index_max_num_segments": 10, "target_index_force_merge_timeout": 45.0, diff --git a/vectorsearch/runners.py b/vectorsearch/runners.py index d0e5a96c..ccf2571d 100644 --- a/vectorsearch/runners.py +++ b/vectorsearch/runners.py @@ -35,8 +35,8 @@ async def __call__(self, opensearch, params): response = await opensearch.transport.perform_request(method, warmup_url) if response is None or response['_shards'] is None: return result - if response['_shards']['failed'] == 0: - result['success'] = True + status = response['_shards']['failed'] == 0 + result['success'] = status return result def __repr__(self, *args, **kwargs): diff --git a/vectorsearch/test_procedures/default.json b/vectorsearch/test_procedures/default.json index e0e7cfc0..b11bb5d4 100644 --- a/vectorsearch/test_procedures/default.json +++ b/vectorsearch/test_procedures/default.json @@ -1,6 +1,6 @@ { "name": "no-train-test", - "description": "Index vector search that do not use an algorithm that requires training.", + "description": "Index vector search which does not use an algorithm that requires training.", "default": true, "schedule": [ { @@ -30,7 +30,7 @@ "num_vectors": {{ target_index_num_vectors | default(-1) }}, "id-field-name": "{{ id_field_name }}" }, - "clients": {{ target_index_bulk_index_clients | default(1)}} + "clients": {{ target_index_bulk_indexing_clients | default(1)}} }, { "name" : "refresh-target-index", diff --git a/vectorsearch/workload.json b/vectorsearch/workload.json index c4f938cd..0a7345ad 100644 --- a/vectorsearch/workload.json +++ b/vectorsearch/workload.json @@ -1,7 +1,7 @@ {% import "benchmark.helpers" as benchmark with context %} { "version": 2, - "description": "Benchmark for 
knn_vector field type", + "description": "Benchmark vector search engine performance for different engine types like faiss, lucene and nmslib", "indices": [ { "name": "{{ target_index_name }}",