Add knnvector as new workload
Create new workload to benchmark performance of the knn_vector
field type.
Added unit tests and the no-train procedure.

Signed-off-by: Vijayan Balasubramanian <balasvij@amazon.com>
VijayanB committed Nov 23, 2023

1 parent f749caa commit 116f071
Showing 16 changed files with 1,578 additions and 0 deletions.
169 changes: 169 additions & 0 deletions knnvector/README.md
@@ -0,0 +1,169 @@
# KNN Vector Workload

This workload benchmarks the indexing and search performance of the `knn_vector` field type.

## Datasets

This workload currently supports datasets in either HDF5 format or Big-ann format.
You can download datasets from [here](http://corpus-texmex.irisa.fr/) to benchmark the quality of OpenSearch's approximate k-NN algorithms.
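
If you are working with an HDF5 dataset (for example `sift-128-euclidean.hdf5` from ann-benchmarks), the sketch below is a quick way to inspect it. The `train`/`test`/`neighbors` layout shown is the ann-benchmarks convention and may differ for files from other sources:

```python
# Sketch: inspect an ann-benchmarks-style HDF5 data set before benchmarking.
import h5py

with h5py.File("/tmp/sift-128-euclidean.hdf5", "r") as f:
    print(f["train"].shape)      # vectors to be indexed, e.g. (1000000, 128)
    print(f["test"].shape)       # query vectors, e.g. (10000, 128)
    print(f["neighbors"].shape)  # ground-truth neighbor ids per query
```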

## Current Procedures

### No Train Test

The No Train Test procedure is used to test `knn_vector` indices that do not use an algorithm that requires training.

#### Parameters

This workload allows the following parameters to be specified using `--workload-params` (an example invocation follows the table):

| Name | Description |
|-----------------------------------------|--------------------------------------------------------------------------|
| target_index_name | Name of index to add vectors to |
| target_field_name | Name of field to add vectors to |
| target_index_body | Path to target index definition |
| target_index_primary_shards | Target index primary shards |
| target_index_replica_shards | Target index replica shards |
| target_index_dimension | Dimension of target index |
| target_index_space_type | Target index space type |
| target_index_bulk_size | Target index bulk size |
| target_index_bulk_index_data_set_format | Format of vector data set |
| target_index_bulk_index_data_set_path | Path to vector data set |
| target_index_bulk_index_clients         | Clients to be used for bulk ingestion (must be a divisor of the data set size) |
| target_index_max_num_segments | Number of segments to merge target index down to before beginning search |
| target_index_force_merge_timeout        | Timeout for force merge requests in seconds                              |
| hnsw_ef_search | HNSW ef search parameter |
| hnsw_ef_construction | HNSW ef construction parameter |
| hnsw_m | HNSW m parameter |
| query_k | The number of neighbors to return for the search |
| query_clients | Number of clients to use for running queries |
| query_data_set_format | Format of vector data set for queries |
| query_data_set_path | Path to vector data set for queries |
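
For example, an invocation might look like the following. This is a sketch: the paths and target host are placeholders, and it assumes OpenSearch Benchmark accepts either a JSON params file or inline `key:value` pairs for `--workload-params`:

```commandline
opensearch-benchmark execute-test \
    --workload-path=/path/to/knnvector \
    --workload-params=/path/to/knnvector/params/nmslib-sift-128-l2.json \
    --pipeline=benchmark-only \
    --target-hosts=localhost:9200
```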



#### Metrics

The result metrics of this procedure will look like:
```
------------------------------------------------------
_______ __ _____
/ ____(_)___ ____ _/ / / ___/_________ ________
/ /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \
/ __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/
/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/
------------------------------------------------------
| Metric | Task | Value | Unit |
|---------------------------------------------------------------:|-------------------:|------------:|-------:|
| Cumulative indexing time of primary shards | | 0.0206667 | min |
| Min cumulative indexing time across primary shards | | 0 | min |
| Median cumulative indexing time across primary shards | | 0.00329167 | min |
| Max cumulative indexing time across primary shards | | 0.0140833 | min |
| Cumulative indexing throttle time of primary shards | | 0 | min |
| Min cumulative indexing throttle time across primary shards | | 0 | min |
| Median cumulative indexing throttle time across primary shards | | 0 | min |
| Max cumulative indexing throttle time across primary shards | | 0 | min |
| Cumulative merge time of primary shards | | 0 | min |
| Cumulative merge count of primary shards | | 0 | |
| Min cumulative merge time across primary shards | | 0 | min |
| Median cumulative merge time across primary shards | | 0 | min |
| Max cumulative merge time across primary shards | | 0 | min |
| Cumulative merge throttle time of primary shards | | 0 | min |
| Min cumulative merge throttle time across primary shards | | 0 | min |
| Median cumulative merge throttle time across primary shards | | 0 | min |
| Max cumulative merge throttle time across primary shards | | 0 | min |
| Cumulative refresh time of primary shards | | 0.0328667 | min |
| Cumulative refresh count of primary shards | | 22 | |
| Min cumulative refresh time across primary shards | | 0 | min |
| Median cumulative refresh time across primary shards | | 0.001825 | min |
| Max cumulative refresh time across primary shards | | 0.0292167 | min |
| Cumulative flush time of primary shards | | 0.00025 | min |
| Cumulative flush count of primary shards | | 3 | |
| Min cumulative flush time across primary shards | | 0 | min |
| Median cumulative flush time across primary shards | | 0 | min |
| Max cumulative flush time across primary shards | | 0.00025 | min |
| Total Young Gen GC time | | 0.058 | s |
| Total Young Gen GC count | | 6 | |
| Total Old Gen GC time | | 0 | s |
| Total Old Gen GC count | | 0 | |
| Store size | | 0.0148059 | GB |
| Translog size | | 2.04891e-07 | GB |
| Heap used for segments | | 0 | MB |
| Heap used for doc values | | 0 | MB |
| Heap used for terms | | 0 | MB |
| Heap used for norms | | 0 | MB |
| Heap used for points | | 0 | MB |
| Heap used for stored fields | | 0 | MB |
| Segment count | | 8 | |
| Min Throughput | custom-vector-bulk | 9961.8 | docs/s |
| Mean Throughput | custom-vector-bulk | 9961.8 | docs/s |
| Median Throughput | custom-vector-bulk | 9961.8 | docs/s |
| Max Throughput | custom-vector-bulk | 9961.8 | docs/s |
| 50th percentile latency | custom-vector-bulk | 26.8407 | ms |
| 90th percentile latency | custom-vector-bulk | 34.4197 | ms |
| 100th percentile latency | custom-vector-bulk | 48.418 | ms |
| 50th percentile service time | custom-vector-bulk | 26.8407 | ms |
| 90th percentile service time | custom-vector-bulk | 34.4197 | ms |
| 100th percentile service time | custom-vector-bulk | 48.418 | ms |
| error rate | custom-vector-bulk | 0 | % |
| Min Throughput | prod-queries | 1572.08 | ops/s |
| Mean Throughput | prod-queries | 1749.01 | ops/s |
| Median Throughput | prod-queries | 1742.79 | ops/s |
| Max Throughput | prod-queries | 1890.22 | ops/s |
| 50th percentile latency | prod-queries | 4.26518 | ms |
| 90th percentile latency | prod-queries | 6.81978 | ms |
| 99th percentile latency | prod-queries | 12.2256 | ms |
| 99.9th percentile latency | prod-queries | 26.3252 | ms |
| 99.99th percentile latency | prod-queries | 37.1685 | ms |
| 100th percentile latency | prod-queries | 38.4632 | ms |
| 50th percentile service time | prod-queries | 4.26518 | ms |
| 90th percentile service time | prod-queries | 6.81978 | ms |
| 99th percentile service time | prod-queries | 12.2256 | ms |
| 99.9th percentile service time | prod-queries | 26.3252 | ms |
| 99.99th percentile service time | prod-queries | 37.1685 | ms |
| 100th percentile service time | prod-queries | 38.4632 | ms |
| error rate | prod-queries | 0 | % |
--------------------------------
[INFO] SUCCESS (took 60 seconds)
--------------------------------
```


### Custom Parameter Sources

Custom parameter sources are defined in [params_sources.py](params_sources.py); a simplified sketch follows the table.

| Name | Description | Parameters |
|-------------------------|------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| bulk-from-data-set | Provides bulk payloads containing vectors from a data set for indexing | 1. data_set_format - (hdf5, bigann)<br/>2. data_set_path - path to data set<br/>3. index - name of index for bulk ingestion<br/> 4. field - field to place vector in <br/> 5. bulk_size - vectors per bulk request<br/> 6. num_vectors - number of vectors to use from the data set. Defaults to the whole data set. |
| knn-query-from-data-set | Provides a query generated from a data set | 1. data_set_format - (hdf5, bigann)<br/>2. data_set_path - path to data set<br/>3. index - name of index to query against<br/>4. field - field to query against<br/>5. k - number of results to return<br/>6. dimension - size of vectors to produce<br/> 7. num_vectors - number of vectors to use from the data set. Defaults to the whole data set. |
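
As a rough illustration of what such a source produces, the sketch below builds an OpenSearch k-NN query body. It is illustrative only: the class name is hypothetical, a random vector stands in for reading from the data set, and the real implementations live in [params_sources.py](params_sources.py):

```python
import random

class ExampleKnnQueryParamSource:
    """Illustrative sketch of a query parameter source; not the real implementation."""

    def __init__(self, workload, params, **kwargs):
        self._index = params["index"]
        self._field = params["field"]
        self._k = params["k"]
        self._dimension = params["dimension"]

    def partition(self, partition_index, total_partitions):
        # Each query client receives its own slice of the data set's vectors.
        return self

    def params(self):
        # The real source reads the next test vector from the data set;
        # a random vector stands in for it here.
        vector = [random.random() for _ in range(self._dimension)]
        return {
            "index": self._index,
            "body": {
                "size": self._k,
                "query": {"knn": {self._field: {"vector": vector, "k": self._k}}},
            },
        }
```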


### Custom Runners

Custom runners are defined in [runners.py](runners.py); a simplified sketch follows the table.

| Name               | Description                                          | Parameters                                                                                   |
|--------------------|------------------------------------------------------|:----------------------------------------------------------------------------------------------|
| custom-vector-bulk | Bulk index a set of vectors in an OpenSearch index.  | Consumes the parameters produced by the `bulk-from-data-set` parameter source                |
| custom-refresh | Run refresh with retry capabilities. | 1. index - name of index to refresh<br/> 2. retries - number of times to retry the operation |
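
The sketch below shows the general shape of a refresh-with-retries runner under OpenSearch Benchmark's async runner convention; it is illustrative only, and see [runners.py](runners.py) for the actual implementation:

```python
import asyncio

class ExampleCustomRefresh:
    """Illustrative sketch of a refresh runner with retries; not the real implementation."""

    async def __call__(self, opensearch, params):
        retries = params.get("retries", 0)
        for attempt in range(retries + 1):
            try:
                await opensearch.indices.refresh(index=params["index"])
                return
            except Exception:
                if attempt == retries:
                    raise
                await asyncio.sleep(1)  # brief pause before retrying

    def __repr__(self):
        return "custom-refresh"
```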

### Testing

We have a set of unit tests for our extensions in
[tests](tests). To run all the tests, run the following
command:

```commandline
python -m unittest discover ./tests
```

To run an individual test:
```commandline
python -m unittest tests.test_param_sources.VectorsFromDataSetParamSourceTestCase.test_partition_hdf5
```
5 changes: 5 additions & 0 deletions knnvector/__init__.py
@@ -0,0 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
41 changes: 41 additions & 0 deletions knnvector/indices/nmslib-index.json
@@ -0,0 +1,41 @@
{
"settings": {
"index": {
"knn": true
{%- if target_index_primary_shards is defined and target_index_primary_shards %}
,"number_of_shards": {{ target_index_primary_shards }}
{%- endif %}
{%- if target_index_replica_shards is defined and target_index_replica_shards %}
,"number_of_replicas": {{ target_index_replica_shards }}
{%- endif %}
{%- if hnsw_ef_search is defined and hnsw_ef_search %}
,"knn.algo_param.ef_search": {{ hnsw_ef_search }}
{%- endif %}
}
},
"mappings": {
"dynamic": "strict",
"properties": {
"target_field": {
"type": "knn_vector",
"dimension": {{ target_index_dimension }},
"method": {
"name": "hnsw",
"space_type": "{{ target_index_space_type }}",
"engine": "nmslib",
"parameters": {
{%- if hnsw_ef_construction is defined and hnsw_ef_construction %}
"ef_construction": {{ hnsw_ef_construction }}
{%- endif %}
{%- if hnsw_m is defined and hnsw_m %}
{%- if hnsw_ef_construction is defined and hnsw_ef_construction %}
,
{%- endif %}
"m": {{ hnsw_m }}
{%- endif %}
}
}
}
}
}
}
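
For reference, rendering this template with the sample parameters in `params/nmslib-sift-128-l2.json` (later in this commit) yields roughly the following index body; `hnsw_m` is unset there, so `m` is omitted:

```json
{
  "settings": {
    "index": {
      "knn": true,
      "number_of_shards": 3,
      "knn.algo_param.ef_search": 100
    }
  },
  "mappings": {
    "dynamic": "strict",
    "properties": {
      "target_field": {
        "type": "knn_vector",
        "dimension": 128,
        "method": {
          "name": "hnsw",
          "space_type": "l2",
          "engine": "nmslib",
          "parameters": {
            "ef_construction": 100
          }
        }
      }
    }
  }
}
```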
23 changes: 23 additions & 0 deletions knnvector/operations/default.json
@@ -0,0 +1,23 @@
{
"name": "warmup-queries",
"operation-type": "search",
"index": "{{ target_index_name | default('target_index') }}",
"param-source": "random-knn-query-param-source",
"k": {{ query_k | default(100) }},
"dimension": {{ target_index_dimension | default(512) }},
"field": "{{ target_field_name | default('target_field') }}",
"min_value": {{ query_min_value | default(-10.0) }},
"max_value": {{ query_max_value | default(10.0) }},
"include-in-results_publishing": false
},
{
"name": "force-merge",
"operation-type": "force-merge",
"request-timeout": {{ target_index_force_merge_timeout | default(7200) }},
"index": "{{ target_index_name | default('target_index') }}",
"mode": "polling",
{%- if target_index_max_num_segments is defined %}
"max-num-segments": {{ target_index_max_num_segments }},
{%- endif %}
"include-in-results_publishing": false
}
29 changes: 29 additions & 0 deletions knnvector/params/nmslib-sift-128-l2.json
@@ -0,0 +1,29 @@
{
"target_index_name": "target_index",
"target_field_name": "target_field",
"target_index_body": "indices/nmslib-index.json",
"target_index_primary_shards": 3,
"target_index_dimension": 128,
"target_index_space_type": "l2",

"target_index_bulk_size": 200,
"target_index_bulk_index_data_set_format": "hdf5",
"target_index_bulk_index_data_set_path": "/tmp/sift-128-euclidean.hdf5",
"target_index_bulk_index_clients": 10,

"target_index_max_num_segments": 10,
"target_index_force_merge_timeout": 45.0,

"warmup_queries": 100,
"query_min_value": -10.0,
"query_max_value": 10.0,

"hnsw_ef_search": 100,
"hnsw_ef_construction": 100,
"target_index_num_vectors": 1000000,
"query_k": 100,
"query_clients": 10,
"query_data_set_format": "hdf5",
"query_data_set_path": "/tmp/sift-128-euclidean.hdf5"

}