Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patch image download with invalid _id #860

Merged
merged 5 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions src/marqo/tensor_search/add_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
import concurrent
import copy
import math
import random
from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager
from typing import ContextManager
import threading

import PIL
from PIL.ImageFile import ImageFile
Expand Down Expand Up @@ -49,11 +49,8 @@ def threaded_download_and_preprocess_images(allocated_docs: List[dict], image_re
None

"""
# TODO - We may not be handling errors in threads properly. Test introducing errors (e.g., call a method
# that doesn't exist) in this code and verify
# Generate pseudo-unique ID for thread metrics.
_id = hash("".join([d.get("_id", str(random.getrandbits(64))) for d in allocated_docs])) % 1000
_id = f"image_download.{_id}"
_id = f'image_download.{threading.get_ident()}'
TIMEOUT_SECONDS = 3
if metric_obj is None: # Occurs predominately in testing.
metric_obj = RequestMetricsStore.for_request()
Expand Down
51 changes: 50 additions & 1 deletion tests/tensor_search/integ_tests/test_add_documents_combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,4 +389,53 @@ def test_download_images_non_tensor_field(self):
self.assertIsInstance(image_repo[k], expected_repo_structure[k])

# Images should not be closed as they are Tensor instead of ImageType
mock_close.assert_not_called()
mock_close.assert_not_called()

def test_idErrorWhenImageDownloading(self):
"""A test ensure image download is not raising 500 error when there is an invalid _id.

Image download use the document _id to generate a unique thread id.
However, the image download happens before validate the document _id.
This test ensures that the image download does not raise a 500 error when the document _id is invalid.
"""
test_docs = [
{
"image_field_1": "https://raw.githubusercontent.com/marqo-ai/marqo/mainline"
"/examples/ImageSearchGuide/data/image1.jpg",
"text_field_1": "this is a valid image",
"_id": "1"
},
{
"image_field_1": "https://raw.githubusercontent.com/marqo-ai/marqo/mainline"
"/examples/ImageSearchGuide/data/image2.jpg",
"text_field_1": "this is a invalid image due to int id",
"_id": 2
},
{
"image_field_1": "https://raw.githubusercontent.com/marqo-ai/marqo/mainline"
"/examples/ImageSearchGuide/data/image3.jpg",
"text_field_1": "this is a invalid image due to None",
"_id": None
},
{
"image_field_1": "https://raw.githubusercontent.com/marqo-ai/marqo/mainline"
"/examples/ImageSearchGuide/data/image4.jpg",
"text_field_1": "this is a invalid image due to ",
"_id": []
}
]

for index_name in [self.unstructured_marqo_index_name, self.structured_marqo_index_name]:
tensor_fields = ["image_field_1", "text_field_1"] if index_name == self.unstructured_marqo_index_name \
else None
with self.subTest(index_name):
r = tensor_search.add_documents(config=self.config,
add_docs_params=AddDocsParams(index_name=index_name,
docs=test_docs,
tensor_fields=tensor_fields))
self.assertEqual(True, r["errors"])
self.assertEqual(4, len(r["items"]))
self.assertEqual(200, r["items"][0]["status"])
for i in range(1, 4):
self.assertEqual(400, r["items"][i]["status"])
self.assertIn("Document _id must be a string", r["items"][i]["error"])
Loading