Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding recall testing to OpenAI track #702

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 34 additions & 39 deletions openai_vector/challenges/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,61 +42,56 @@
"retry-until-success": true,
"include-in-reporting": false
}
},
}
{# serverless-post-ingest-sleep-marker-start #}{%- if post_ingest_sleep|default(false) -%}
{
"name": "post-ingest-sleep",
"operation": {
"operation-type": "sleep",
"duration": {{ post_ingest_sleep_duration|default(30) }}
}
},
}
{%- endif -%}{# serverless-post-ingest-sleep-marker-end #}
{%- for i in range(p_search_ops|length) %},
{
"name": "standalone-search-knn-10-100-single-client",
"operation": "knn-search-10-100",
"warmup-iterations": 100,
"iterations": {{ standalone_search_iterations | default(10000) | int }}
},
{
"name": "standalone-knn-search-100-1000-single-client",
"operation": "knn-search-100-1000",
"warmup-iterations": 100,
{%- if p_search_ops[i][2] > 0 -%}
"name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}-single-client",
"operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
{%- else -%}
"name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-single-client",
"operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
{%- endif -%},
"warmup-iterations": 1000,
"iterations": {{ standalone_search_iterations | default(10000) | int }}
},
{
"name": "standalone-search-knn-10-100-multiple-clients",
"operation": "knn-search-10-100",
"warmup-iterations": 100,
{%- if p_search_ops[i][2] > 0 -%}
"name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}-multiple-clients",
"operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
{%- else -%}
"name": "standalone-search-knn-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-multiple-clients",
"operation": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
{%- endif -%},
"warmup-iterations": 1000,
"clients": {{ standalone_search_clients | default(8) | int }},
"iterations": {{ standalone_search_iterations | default(10000) | int }}
},
}
{%- endfor %},
{
"name": "standalone-search-knn-100-1000-multiple-clients",
"operation": "knn-search-100-1000",
"warmup-iterations": 100,
"clients": {{ standalone_search_clients | default(8) | int }},
"iterations": {{ standalone_search_iterations | default(10000) | int }}
},
"name": "parallel-documents-indexing-bulk",
"operation": "parallel-documents-indexing",
"warmup-time-period": 60,
"clients": {{ parallel_indexing_bulk_clients | default(1) | int }},
"target-throughput": {{ parallel_indexing_bulk_target_throughput | default(1) | int }}
}
{%- for i in range(p_search_ops|length) %},
{
"parallel": {
"tasks": [
{
"name": "parallel-documents-indexing-bulk",
"operation": "parallel-documents-indexing",
"clients": {{ parallel_indexing_bulk_clients | default(1) | int }},
"time-period": {{ parallel_indexing_time_period | default(1800) | int }},
"target-throughput": {{ parallel_indexing_bulk_target_throughput | default(1) | int }}
},
{
"name": "parallel-documents-indexing-search-knn-10-100",
"operation": "knn-search-10-100",
"clients": {{ parallel_indexing_search_clients | default(3) | int }},
"time-period": {{ parallel_indexing_time_period | default(1800) | int }},
"target-throughput": {{ parallel_indexing_search_target_throughput | default(100) | int }}
}
]
}
{%- if p_search_ops[i][2] > 0 -%}
"operation": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
{%- else -%}
"operation": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
{%- endif -%}
}
{%- endfor %}
]
}
4 changes: 2 additions & 2 deletions openai_vector/index-vectors-only-mapping.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"settings": {
{# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
{% if preload_pagecache %}
"index.store.preload": [ "vec", "vex", "vem"],
"index.store.preload": [ "vec", "vex", "vem", "veq", "veqm", "veb", "vebm"],
{% endif %}
"index.number_of_shards": {{number_of_shards | default(1)}},
"index.number_of_replicas": {{number_of_replicas | default(0)}}
Expand All @@ -21,7 +21,7 @@
"index": true,
"similarity": "dot_product",
"index_options": {
"type": {{ vector_index_type | default("hnsw") | tojson }}
"type": {{ vector_index_type | default("int8_hnsw") | tojson }}
}
}
}
Expand Down
29 changes: 29 additions & 0 deletions openai_vector/index-vectors-only-with-docid-mapping.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"settings": {
{# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
{% if preload_pagecache %}
"index.store.preload": [ "vec", "vex", "vem", "veq", "veqm", "veb", "vebm"],
{% endif %}
"index.number_of_shards": {{number_of_shards | default(1)}},
"index.number_of_replicas": {{number_of_replicas | default(0)}}
{%- endif -%}{# non-serverless-index-settings-marker-end #}
},
"mappings": {
"dynamic": false,
"properties": {
"docid": {
"type": "keyword"
},
"emb": {
"type": "dense_vector",
"element_type": "float",
"dims": 1536,
"index": true,
"similarity": "dot_product",
"index_options": {
"type": {{ vector_index_type | default("int8_hnsw") | tojson }}
}
}
}
}
}
4 changes: 2 additions & 2 deletions openai_vector/index-vectors-with-text-mapping.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"settings": {
{# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
{% if preload_pagecache %}
"index.store.preload": [ "vec", "vex", "vem"],
"index.store.preload": [ "vec", "vex", "vem", "veq", "veqm", "veb", "vebm"],
{% endif %}
"index.number_of_shards": {{number_of_shards | default(1)}},
"index.number_of_replicas": {{number_of_replicas | default(0)}}
Expand All @@ -26,7 +26,7 @@
"index": true,
"similarity": "dot_product",
"index_options": {
"type": {{ vector_index_type | default("hnsw") | tojson }}
"type": {{ vector_index_type | default("int8_hnsw") | tojson }}
}
}
}
Expand Down
Binary file added openai_vector/open_ai_true_top_1000.json.bz2
Binary file not shown.
31 changes: 22 additions & 9 deletions openai_vector/operations/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,31 @@
"corpora": "openai-parallel-indexing",
"bulk-size": {{parallel_indexing_bulk_size | default(500)}},
"ingest-percentage": {{parallel_indexing_ingest_percentage | default(100)}}
},
}
{%- set p_search_ops = (search_ops | default([(10, 20, 0), (10, 20, 1), (10, 20, 2), (10, 50, 1), (10, 50, 2), (10, 100, 1), (100, 120, 1), (100, 120, 2), (100, 200, 1), (100, 200, 2), (100, 500, 1), (100, 500, 2)]))%}
{%- for i in range(p_search_ops|length) %},
{
"name": "knn-search-10-100",
{%- if p_search_ops[i][2] > 0 -%}
"name": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
{%- else -%}
"name": "knn-search-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
{%- endif -%},
"operation-type": "search",
"param-source": "knn-param-source",
"k": 10,
"num-candidates": 100
"k": {{p_search_ops[i][0]}},
"num-candidates": {{p_search_ops[i][1]}},
"num-rescore": {{p_search_ops[i][2]}}
},
{
"name": "knn-search-100-1000",
"operation-type": "search",
"param-source": "knn-param-source",
"k": 100,
"num-candidates": 1000
{%- if p_search_ops[i][2] > 0 -%}
"name": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
{%- else -%}
"name": "knn-recall-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
{%- endif -%},
"operation-type": "knn-recall",
"param-source": "knn-recall-param-source",
"k": {{p_search_ops[i][0]}},
"num-candidates": {{p_search_ops[i][1]}},
"num-rescore": {{p_search_ops[i][2]}}
}
{%- endfor %}
2 changes: 1 addition & 1 deletion openai_vector/track.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"indices": [
{
"name": "openai",
"body": "index-{{ mapping_type | default("vectors-only") }}-mapping.json"
"body": "index-{{ mapping_type | default("vectors-only-with-docid") }}-mapping.json"
}
],
"corpora": [
Expand Down
146 changes: 137 additions & 9 deletions openai_vector/track.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,22 @@
import bz2
import json
import logging
import os
import statistics
from typing import Any, List

logger = logging.getLogger(__name__)
QUERIES_FILENAME: str = "queries.json.bz2"
TRUE_KNN_FILENAME: str = "open_ai_true_top_1000.json.bz2"


def compute_percentile(data: List[Any], percentile):
    """Return the nearest-rank percentile of *data*, or ``None`` for empty input.

    Uses the nearest-rank method: the rank is ``round(percentile * n / 100)``,
    1-based, clamped into the valid range of the sorted data.

    :param data: values to rank; not modified (a sorted copy is taken).
    :param percentile: percentile to compute, e.g. ``50`` for the median rank.
    :return: the element at the requested rank, or ``None`` if *data* is empty.
    """
    if not data:
        return None
    ordered = sorted(data)
    # 1-based nearest rank converted to a 0-based index.
    rank = int(round(percentile * len(ordered) / 100)) - 1
    # Clamp so out-of-range percentiles still map to the first/last element.
    clamped = min(max(rank, 0), len(ordered) - 1)
    return ordered[clamped]


class KnnParamSource:
Expand Down Expand Up @@ -32,24 +46,138 @@ def partition(self, partition_index, total_partitions):

def params(self):
    """Build one kNN search request, cycling through the loaded query vectors.

    Returns a dict with the target index, request-cache flag, page size and a
    search body containing a ``knn`` query. Optional ``filter`` and
    ``rescore_vector`` clauses are added when configured in the track params.

    NOTE(review): this span was a garbled diff mixing pre- and post-change
    lines; reconstructed here as the coherent post-change version.
    """
    result = {"index": self._index_name, "cache": self._params.get("cache", False), "size": self._params.get("k", 10)}
    num_candidates = self._params.get("num-candidates", 50)
    num_rescore = self._params.get("num-rescore", 0)
    query_vec = self._queries[self._iters]
    knn_query = {
        "knn": {
            "field": "emb",
            "query_vector": query_vec,
            "k": result["size"],
            "num_candidates": num_candidates,
        }
    }
    if "filter" in self._params:
        knn_query["knn"]["filter"] = self._params["filter"]
    if num_rescore > 0:
        # rescore_vector re-scores num_candidates_factor * k candidates with full-precision vectors.
        knn_query["knn"]["rescore_vector"] = {"num_candidates_factor": num_rescore}
    result["body"] = {"query": knn_query, "_source": False}
    # Wrap around so the source can be iterated indefinitely (self.infinite).
    self._iters += 1
    if self._iters >= self._maxIters:
        self._iters = 0
    return result


class KnnVectorStore:
    """In-memory store of the benchmark query vectors and their true nearest neighbors.

    Both corpora are loaded eagerly from bz2-compressed, line-delimited JSON
    files that live next to this module.
    """

    def __init__(self):
        base_dir = os.path.dirname(__file__)
        with bz2.open(os.path.join(base_dir, TRUE_KNN_FILENAME), "r") as fh:
            # One JSON array of docids per line, aligned with the query file by position.
            self._query_nearest_neighbor_docids = [json.loads(line) for line in fh]
        with bz2.open(os.path.join(base_dir, QUERIES_FILENAME), "r") as fh:
            # One JSON query vector per line.
            self._queries = [json.loads(line) for line in fh]

    def get_query_vectors(self) -> List[List[float]]:
        """Return all query vectors, in file order."""
        return self._queries

    def get_neighbors_for_query(self, query_id: int, size: int) -> List[str]:
        """Return the first *size* true nearest-neighbor docids for *query_id*.

        :raises ValueError: if *query_id* is out of range or *size* exceeds
            the number of stored neighbors (or is negative).
        """
        neighbors = self._query_nearest_neighbor_docids
        if not 0 <= query_id < len(neighbors):
            raise ValueError(f"Unknown query with id: '{query_id}' provided")
        if not 0 <= size <= len(neighbors[query_id]):
            raise ValueError(f"Invalid size: '{size}' provided for query with id: '{query_id}'")
        return neighbors[query_id][:size]


class KnnRecallParamSource:
    """Rally parameter source feeding the ``knn-recall`` runner.

    Fix over the original: ``params()`` used to construct a brand-new
    ``KnnVectorStore()`` on every call, re-reading and re-decompressing two
    bz2 corpora per iteration. The store is now created lazily on first use
    and reused for all subsequent calls. Also drops the unused ``cwd`` local
    and the unused ``self._cache`` attribute.
    """

    def __init__(self, track, params, **kwargs):
        if len(track.indices) == 1:
            default_index = track.indices[0].name
        else:
            default_index = "_all"

        self._index_name = params.get("index", default_index)
        self._params = params
        self.infinite = True
        # Loaded lazily so instantiation stays cheap; shared across params() calls.
        self._knn_vector_store = None

    def partition(self, partition_index, total_partitions):
        # A single shared source is sufficient; no per-client state to split.
        return self

    def params(self):
        """Return the parameter dict consumed by KnnRecallRunner.__call__."""
        if self._knn_vector_store is None:
            self._knn_vector_store = KnnVectorStore()
        return {
            "index": self._index_name,
            "cache": self._params.get("cache", False),
            "size": self._params.get("k", 10),
            "num_candidates": self._params.get("num-candidates", 50),
            "num_rescore": self._params.get("num-rescore", 0),
            "knn_vector_store": self._knn_vector_store,
        }


# Used in tandem with the KnnRecallParamSource.
# Reads the queries, executes knn search and compares the results with the true nearest neighbors.
class KnnRecallRunner:
    """Rally runner computing kNN recall against ground-truth neighbor docids.

    Fixes over the original: removed the redundant re-assignment of
    ``_source`` (get_knn_query already sets it), guarded the average-recall
    division against an empty query set, switched ``logger.info`` to lazy
    %-style arguments, and avoided building a throwaway set from the true
    neighbors (set.intersection accepts any iterable).
    """

    def get_knn_query(self, query_vec, k, num_candidates, num_rescore):
        """Build the search body for one kNN query; _source is disabled."""
        knn_query = {
            "knn": {
                "field": "emb",
                "query_vector": query_vec,
                "k": k,
                "num_candidates": num_candidates,
            }
        }
        if num_rescore > 0:
            # rescore_vector re-scores num_candidates_factor * k candidates with full-precision vectors.
            knn_query["knn"]["rescore_vector"] = {"num_candidates_factor": num_rescore}
        return {"query": knn_query, "_source": False}

    async def __call__(self, es, params):
        """Run every stored query, compare hits to ground truth and report recall.

        :param es: async Elasticsearch client provided by Rally.
        :param params: dict produced by KnnRecallParamSource.params().
        :return: dict with avg/min/max recall plus the search settings used.
        """
        k = params["size"]
        num_candidates = params["num_candidates"]
        index = params["index"]
        request_cache = params["cache"]
        recall_total = 0
        exact_total = 0
        min_recall = k
        max_recall = 0

        knn_vector_store: KnnVectorStore = params["knn_vector_store"]
        for query_id, query_vector in enumerate(knn_vector_store.get_query_vectors()):
            knn_body = self.get_knn_query(query_vector, k, num_candidates, params["num_rescore"])
            # Fetch docids via doc values instead of _source for cheap comparison.
            knn_body["docvalue_fields"] = ["docid"]
            knn_result = await es.search(
                body=knn_body,
                index=index,
                request_cache=request_cache,
                size=k,
            )
            knn_hits = [hit["fields"]["docid"][0] for hit in knn_result["hits"]["hits"]]
            true_neighbors = knn_vector_store.get_neighbors_for_query(query_id, k)[:k]
            current_recall = len(set(knn_hits).intersection(true_neighbors))
            recall_total += current_recall
            exact_total += len(true_neighbors)
            min_recall = min(min_recall, current_recall)
            max_recall = max(max_recall, current_recall)
        to_return = {
            # Guard against an empty query set to avoid ZeroDivisionError.
            "avg_recall": recall_total / exact_total if exact_total else 0.0,
            "min_recall": min_recall,
            "max_recall": max_recall,
            "k": k,
            "num_candidates": num_candidates,
            "num_rescore": params["num_rescore"],
        }
        logger.info("Recall results: %s", to_return)
        return to_return

    def __repr__(self, *args, **kwargs):
        return "knn-recall"


def register(registry):
    """Entry point called by Rally to register this track's custom components."""
    for source_name, source_cls in (
        ("knn-param-source", KnnParamSource),
        ("knn-recall-param-source", KnnRecallParamSource),
    ):
        registry.register_param_source(source_name, source_cls)
    registry.register_runner("knn-recall", KnnRecallRunner(), async_runner=True)
Loading