Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add extra hybrid search RRF duplicate tests #250

Merged
merged 21 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 52 additions & 44 deletions tests/cloud_test_logic/cloud_test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,19 @@ class CloudTestIndex(str, Enum):
Please try to keep names short to avoid hitting name-length limits

We create 3 unstructured indexes and 2 structured indexes to test:
1) unstructured_text: a basic text-only index with default settings.
2) unstructured_image: an image-compatible index with GPU inference pod and performance storage class.
3) unstructured_text_custom_prepro: a text-only index with custom model and text preprocessing, with 1 replica.
4) structured_image_prepro: a structured index with image-compatible models with image preprocessing
5) structured_image_custom: a structured index with custom image-compatible models using 2 inference pods
6) structured_text: a text-only index with balanced storage class and 2 shards.
1) unstructured_text: Text-only index using hf/e5-base-v2, 2 shards, 1 replica, CPU, balanced storage, for hybrid duplicates testing.
2) unstructured_image: Image-compatible index using open_clip/ViT-B-32/laion2b_s34b_b79k, 1 shard, no replicas, CPU, basic storage.
3) unstructured_no_model: 512-dimension custom vectors, 1 shard, no replicas, CPU, basic storage.
4) structured_text: Structured text index with hf/e5-base-v2, lexical search, 2 shards, 1 replica, CPU, balanced storage.
5) structured_image: Structured image-text index with open_clip/ViT-B-32, 2 shards, 1 replica, CPU, balanced storage, with image preprocessing.
For more information on the settings of each index, please refer to index_name_to_settings_mappings.

FOR CLOUD REPLICAS AND SHARDS:
- Use unstructured_text, structured_text, or structured_image for 1 replica & 2 shards
- Use all other indexes for 0 replicas & 1 shard

We design these indexes to maximize the coverage of different settings and features. For each test method,
we will have to manually specify which index to use.

For example,
1) You want to test text fields without text preprocessing
-> use 1) unstructured_text or 6) structured_text
2) You want to test image fields without image preprocessing
-> use 2) unstructured_image or 5) structured_image_custom
3) You want to test text fields with text preprocessing
-> 3) use unstructured_text_custom_prepro
4) You want to test image fields with image preprocessing
-> 4) use structured_image_prepro
"""

unstructured_text = "pymarqo_unstr_txt"
Expand All @@ -48,21 +41,60 @@ class CloudTestIndex(str, Enum):
"model": "hf/e5-base-v2",

"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.basic",
"storageClass": "marqo.balanced",
"numberOfShards": 2,
"numberOfReplicas": 1, # For hybrid duplicates test
},
CloudTestIndex.unstructured_image: {
"type": "unstructured",
"treatUrlsAndPointersAsImages": True,
"model": "open_clip/ViT-B-32/laion2b_s34b_b79k",

"inferenceType": "marqo.GPU",
"storageClass": "marqo.performance",
"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.basic",
"numberOfShards": 1,
"numberOfReplicas": 0,
},
CloudTestIndex.unstructured_no_model: {
"type": "unstructured",
"treatUrlsAndPointersAsImages": False,

"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.basic",
"numberOfShards": 1,
"numberOfReplicas": 0,

"model": "no_model",
"modelProperties": {
"type": "no_model",
"dimensions": 512
},
},
CloudTestIndex.structured_text: {
"type": "structured",
"model": "hf/e5-base-v2",
"allFields": [
{"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_2", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_3", "type": "text", "features": ["lexical_search"]},
{"name": "int_field_1", "type": "int", "features": ["score_modifier"]},
{"name": "int_filter_field_1", "type": "int", "features": ["filter", "score_modifier"]}],
"tensorFields": ["text_field_1", "text_field_2", "text_field_3"],

"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.balanced",
"numberOfShards": 2,
"numberOfReplicas": 1, # For hybrid duplicates test
},
CloudTestIndex.structured_image: {
"type": "structured",
"model": "open_clip/ViT-B-32/laion2b_s34b_b79k",

"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.basic",
"storageClass": "marqo.balanced",
"numberOfShards": 2,
"numberOfReplicas": 1, # For hybrid duplicates test

"allFields": [
{"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_2", "type": "text", "features": ["lexical_search", "filter"]},
Expand All @@ -78,29 +110,5 @@ class CloudTestIndex(str, Enum):
"imagePreprocessing": {
"patchMethod": "simple",
}
},
CloudTestIndex.structured_text: {
"type": "structured",
"model": "hf/e5-base-v2",
"allFields": [
{"name": "text_field_1", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_2", "type": "text", "features": ["lexical_search", "filter"]},
{"name": "text_field_3", "type": "text", "features": ["lexical_search"]},
{"name": "int_field_1", "type": "int", "features": ["score_modifier"]},
{"name": "int_filter_field_1", "type": "int", "features": ["filter", "score_modifier"]}],
"tensorFields": ["text_field_1", "text_field_2", "text_field_3"],
"storageClass": "marqo.balanced",
"numberOfShards": 2,
},
CloudTestIndex.unstructured_no_model: {
"type": "unstructured",
"treatUrlsAndPointersAsImages": False,
"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.basic",
"model": "no_model",
"modelProperties": {
"type": "no_model",
"dimensions": 512
},
}
}
10 changes: 5 additions & 5 deletions tests/cloud_test_logic/populate_indices_for_cloud_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,16 @@ def populate_indices():
raise Exception("Some cloud index name exceeds 32 characters limit")

for index_name, index_settings_dicts in index_name_to_settings_mappings.items():
print(f"Creating {index_name} with config: {index_settings_dicts}")
print(f"Creating {index_name} with config: {index_settings_dicts}", flush=True)
try:
print(mq.create_index(
index_name=index_name + INDEX_NAME_SEPARATOR + test_uniqueness_id,
wait_for_readiness=False,
settings_dict=index_settings_dicts
)
), flush=True
)
except MarqoWebError as e:
print(f"Attempting to create index {index_name} resulting in error {e}")
print(f"Attempting to create index {index_name} resulting in error {e}", flush=True)
raise e


Expand All @@ -57,8 +57,8 @@ def populate_indices():
mq.config.instance_mapping._refresh_urls()
time.sleep(10)
print(f"Waiting for indexes to be created. Current Mappings: "
f"{mq.config.instance_mapping._urls_mapping}")
f"{mq.config.instance_mapping._urls_mapping}", flush=True)
attempt += 1
if attempt > max_retries:
raise Exception("Timed out waiting for indexes to be created")
print(f"Populating indices took {time.time() - populate_indices_start_time} seconds")
print(f"Populating indices took {time.time() - populate_indices_start_time} seconds", flush=True)
2 changes: 2 additions & 0 deletions tests/marqo_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,8 @@ def get_test_index_name(
index_name_to_return = f"{cloud_test_index_to_use.value}_{self.index_suffix}"
self.prepare_cloud_index_for_test(index_name_to_return, delete_index_documents_before_test)
else:
if open_source_test_index_name is None:
raise ValueError("open_source_test_index_name must be specified for non-cloud tests")
index_name_to_return = open_source_test_index_name
return index_name_to_return

Expand Down
37 changes: 37 additions & 0 deletions tests/v2_tests/test_hybrid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,3 +231,40 @@ def test_hybrid_search_with_filter(self):

self.assertEqual(len(hybrid_res["hits"]), 1)
self.assertEqual(hybrid_res["hits"][0]["_id"], "doc8")

def test_hybrid_search_rrf_with_replicas_has_no_duplicates(self):
    """
    Check that HYBRID search never returns the same document twice on replicated cloud indexes.

    For each of the structured text and unstructured text cloud indexes (both are
    configured with 2 shards and 1 replica), the shared test documents are added and
    the same HYBRID query is executed 100 times. Every response must contain only
    unique document IDs.

    Only relevant for cloud tests; skipped for non-Marqo Cloud instances.
    """
    if not self.client.config.is_marqo_cloud:
        self.skipTest("Test is not relevant for non-Marqo Cloud instances")

    number_of_searches = 100
    text_tensor_fields = ["text_field_1", "text_field_2", "text_field_3"]
    index_test_cases = [CloudTestIndex.structured_text, CloudTestIndex.unstructured_text]

    for cloud_test_index_to_use in index_test_cases:
        # subTest labels a failure with the index that produced it and lets the
        # remaining index still be exercised.
        with self.subTest(cloud_test_index=cloud_test_index_to_use):
            test_index_name = self.get_test_index_name(
                cloud_test_index_to_use=cloud_test_index_to_use,
                open_source_test_index_name=None,
            )
            index = self.client.index(test_index_name)

            # Unstructured indexes declare tensor fields at add time; structured
            # indexes already define them in their settings. Decide from the enum
            # member rather than from a substring of the generated index name,
            # which also contains an arbitrary suffix.
            is_unstructured = cloud_test_index_to_use.name.startswith("unstructured")
            index.add_documents(
                self.docs_list,
                tensor_fields=text_tensor_fields if is_unstructured else None,
            )

            # NOTE(review): the query is repeated presumably so that requests are
            # served by different replicas/shards - confirm against the cloud setup.
            for _ in range(number_of_searches):
                hybrid_res = index.search(
                    "dogs",
                    search_method="HYBRID",
                    limit=10,
                )

                # check for duplicates
                hit_ids = [hit["_id"] for hit in hybrid_res["hits"]]
                unique_hit_ids = set(hit_ids)
                self.assertEqual(
                    len(hit_ids),
                    len(unique_hit_ids),
                    f"Duplicates found in results. Only {len(unique_hit_ids)} unique results out of "
                    f"{len(hit_ids)}",
                )


8 changes: 4 additions & 4 deletions tests/v2_tests/test_image_chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,13 @@ def test_image_simple_chunking(self):
},
}

if not self.client.config.is_marqo_cloud:
self.client.create_index(self.generic_test_index_name, settings_dict=settings)

test_index_name = self.get_test_index_name(
cloud_test_index_to_use=CloudTestIndex.structured_image,
open_source_test_index_name=None
open_source_test_index_name=self.generic_test_index_name
)
if not self.client.config.is_marqo_cloud:
self.client.create_index(self.generic_test_index_name, settings_dict=settings)
test_index_name = self.generic_test_index_name
temp_file_name = 'https://avatars.githubusercontent.com/u/13092433?v=4'

img = Image.open(requests.get(temp_file_name, stream=True).raw)
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ deps =
{[testenv]deps}
pytest-html
commands =
python tests/cloud_test_logic/run_cloud_tests.py {posargs}
python -u tests/cloud_test_logic/run_cloud_tests.py {posargs}
Loading