marqo-ai · farshidz · Jan 22, 2024 · Jan 21, 2024 · Jan 21, 2024 · Jan 21, 2024
diff --git a/.gitignore b/.gitignore
@@ -144,4 +144,6 @@ src/marqo/tensor_search/test_throttle_timing.csv
 dump.rdb
 
 # VSCode
-.vscode/
+.vscode/
+
+.DS_Store
diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py
@@ -164,15 +164,12 @@ def _add_documents_unstructured(config: Config, add_docs_params: AddDocsParams,
             ids = [doc["_id"] for doc in add_docs_params.docs if "_id" in doc]
             existing_docs_dict: Dict[str, dict] = {}
             if len(ids) > 0:
-                existing_docs = get_documents_by_ids(config, marqo_index.name, ids, show_vectors=True,
-                                                     ignore_invalid_ids=True)['results']
+                existing_docs = _get_marqo_documents_by_ids(config, marqo_index.name, ids, ignore_invalid_ids=True)
                 for doc in existing_docs:
                     id = doc["_id"]
                     if id in existing_docs_dict:
                         raise errors.InternalError(f"Received duplicate documents for ID {id} from Vespa")
-                    if doc[TensorField.found]:
-                        del doc[TensorField.found]
-                        existing_docs_dict[id] = doc
+                    existing_docs_dict[id] = doc
 
                 logger.debug(f"Found {len(existing_docs_dict)} existing docs")
 
@@ -379,14 +376,15 @@ def _add_documents_unstructured(config: Config, add_docs_params: AddDocsParams,
                     ):
                         existing_doc = existing_docs_dict[doc_id]
                         current_field_contents = utils.extract_multimodal_content(existing_doc, multimodal_params)
-                        current_multimodal_params = existing_doc[unstructured_common.MARQO_DOC_MULTIMODAL_PARAMS][
-                            field_name]
                         if (
                                 field_content == current_field_contents and
-                                current_multimodal_params == multimodal_params and
+                                unstructured_common.MARQO_DOC_MULTIMODAL_PARAMS in existing_doc and
+                                field_name in existing_doc[unstructured_common.MARQO_DOC_MULTIMODAL_PARAMS] and
+                                existing_doc[unstructured_common.MARQO_DOC_MULTIMODAL_PARAMS][
+                                    field_name] == multimodal_params and
                                 field_name in existing_doc[constants.MARQO_DOC_TENSORS]
                         ):
-                            combo_chunk = f"{field_name}::{existing_doc[constants.MARQO_DOC_TENSORS][field_name][constants.MARQO_DOC_CHUNKS]}"
+                            combo_chunk = f"{field_name}::{existing_doc[constants.MARQO_DOC_TENSORS][field_name][constants.MARQO_DOC_CHUNKS][0]}"
                             combo_embeddings = existing_doc[constants.MARQO_DOC_TENSORS][field_name][
                                 constants.MARQO_DOC_EMBEDDINGS]
 
@@ -573,21 +571,15 @@ def _add_documents_structured(config: Config, add_docs_params: AddDocsParams, ma
         if add_docs_params.use_existing_tensors:
             existing_docs_dict: Dict[str, dict] = {}
             if len(doc_ids) > 0:
-                existing_docs = get_documents_by_ids(config,
-                                                     marqo_index.name,
-                                                     doc_ids,
-                                                     show_vectors=True,
-                                                     ignore_invalid_ids=True)['results']
+                existing_docs = _get_marqo_documents_by_ids(config, marqo_index.name, doc_ids, ignore_invalid_ids=True)
                 for doc in existing_docs:
                     if not isinstance(doc, dict):
                         continue
 
                     id = doc["_id"]
                     if id in existing_docs_dict:
                         raise api_exceptions.InternalError(f"Received duplicate documents for ID {id} from Vespa")
-                    if doc[TensorField.found]:
-                        del doc[TensorField.found]
-                        existing_docs_dict[id] = doc
+                    existing_docs_dict[id] = doc
 
                 logger.debug(f"Found {len(existing_docs_dict)} existing docs")
 
@@ -951,11 +943,7 @@ def translate_add_doc_response(responses: Optional[FeedBatchResponse], time_diff
         return translate_add_doc_response(index_responses, time_diff=t1 - t0)
 
 
-def get_document_by_id(
-        config: Config, index_name: str, document_id: str, show_vectors: bool = False):
-    """returns document by its ID"""
-    validation.validate_id(document_id)
-
+def _get_marqo_document_by_id(config: Config, index_name: str, document_id: str):
     marqo_index = index_meta_cache.get_index(config=config, index_name=index_name)
 
     try:
@@ -970,6 +958,16 @@ def get_document_by_id(
     vespa_index = vespa_index_factory(marqo_index)
     marqo_document = vespa_index.to_marqo_document(res.document.dict())
 
+    return marqo_document
+
+
+def get_document_by_id(
+        config: Config, index_name: str, document_id: str, show_vectors: bool = False):
+    """returns document by its ID"""
+    validation.validate_id(document_id)
+
+    marqo_document = _get_marqo_document_by_id(config, index_name, document_id)
+
     if show_vectors:
         if constants.MARQO_DOC_TENSORS in marqo_document:
             marqo_document[TensorField.tensor_facets] = _get_tensor_facets(marqo_document[constants.MARQO_DOC_TENSORS])
@@ -986,6 +984,29 @@ def get_document_by_id(
     return marqo_document
 
 
+def _get_marqo_documents_by_ids(  # Batch-fetch documents by ID; each hit is returned exactly as to_marqo_document() produced it
+        config: Config, index_name: str, document_ids, ignore_invalid_ids: bool = False
+):
+    validated_ids = []  # Holds the value returned by validation.validate_id for every accepted ID
+    for doc_id in document_ids:
+        try:
+            validated_ids.append(validation.validate_id(doc_id))
+        except api_exceptions.InvalidDocumentIdError as e:
+            if not ignore_invalid_ids:  # Strict mode: the first invalid ID aborts the whole call
+                raise e
+            logger.debug(f'Invalid document ID {doc_id} ignored')
+
+    if len(validated_ids) == 0:  # document_ids was empty, or every ID was invalid and ignore_invalid_ids is True
+        return []  # Returned before the index lookup and the Vespa batch request below
+
+    marqo_index = index_meta_cache.get_index(config=config, index_name=index_name)
+    batch_get = config.vespa_client.get_batch(validated_ids, marqo_index.schema_name)
+    vespa_index = vespa_index_factory(marqo_index)
+
+    return [vespa_index.to_marqo_document(response.document.dict()) for response in batch_get.responses
+            if response.status == 200]  # Responses with any other status are left out; no placeholder entry is added
+
+
 def get_documents_by_ids(
         config: Config, index_name: str, document_ids: typing.Collection[str],
         show_vectors: bool = False, ignore_invalid_ids: bool = False
@@ -1033,9 +1054,18 @@ def get_documents_by_ids(
             marqo_document = vespa_index.to_marqo_document(response.document.dict())
 
             if show_vectors:
-                marqo_document[TensorField.tensor_facets] = _get_tensor_facets(
-                    marqo_document[constants.MARQO_DOC_TENSORS])
-            del marqo_document[constants.MARQO_DOC_TENSORS]
+                if constants.MARQO_DOC_TENSORS in marqo_document:
+                    marqo_document[TensorField.tensor_facets] = _get_tensor_facets(
+                        marqo_document[constants.MARQO_DOC_TENSORS])
+                else:
+                    marqo_document[TensorField.tensor_facets] = []
+
+            if not show_vectors:
+                if unstructured_common.MARQO_DOC_MULTIMODAL_PARAMS in marqo_document:
+                    del marqo_document[unstructured_common.MARQO_DOC_MULTIMODAL_PARAMS]
+
+            if constants.MARQO_DOC_TENSORS in marqo_document:
+                del marqo_document[constants.MARQO_DOC_TENSORS]
 
             to_return['results'].append(
                 {

diff --git a/src/marqo/vespa/vespa_client.py b/src/marqo/vespa/vespa_client.py
@@ -37,7 +37,7 @@ def __init__(self, current_generation: int, wanted_generation: int, converged: b
             self.converged = converged
 
     def __init__(self, config_url: str, document_url: str, query_url: str,
-                 content_cluster_name: str, pool_size: int = 10,):
+                 content_cluster_name: str, pool_size: int = 10):
         """
         Create a VespaClient object.
         Args:
@@ -60,11 +60,12 @@ def close(self):
         """
         self.http_client.close()
 
-    def deploy_application(self, application: str):
+    def deploy_application(self, application: str, timeout: int = 60) -> None:
         """
         Deploy a Vespa application.
         Args:
             application: Path to the Vespa application root directory
+            timeout: Timeout in seconds
         """
         endpoint = f'{self.config_url}/application/v2/tenant/default/prepareandactivate'
 
@@ -73,7 +74,8 @@ def deploy_application(self, application: str):
         response = self.http_client.post(
             endpoint,
             headers={'Content-Type': 'application/x-gzip'},
-            data=gzip_stream.read()
+            data=gzip_stream.read(),
+            timeout=timeout
         )
 
         self._raise_for_status(response)

diff --git a/tests/marqo_test.py b/tests/marqo_test.py
@@ -166,7 +166,7 @@ def structured_marqo_index_request(
     def unstructured_marqo_index_request(
             cls,
             name: Optional[str] = None,
-            model: Model = Model(name='hf/all_datasets_v4_MiniLM-L6'),
+            model: Model = Model(name='random/small'),
             normalize_embeddings: bool = True,
             text_preprocessing: TextPreProcessing = TextPreProcessing(
                 split_length=2,

diff --git a/tests/tensor_search/integ_tests/test_search_unstructured.py b/tests/tensor_search/integ_tests/test_search_unstructured.py
@@ -24,7 +24,9 @@ class TestSearchUnstructured(MarqoTestCase):
     def setUpClass(cls) -> None:
         super().setUpClass()
 
-        default_text_index = cls.unstructured_marqo_index_request()
+        default_text_index = cls.unstructured_marqo_index_request(
+            model=Model(name='hf/all_datasets_v4_MiniLM-L6')
+        )
         default_text_index_encoded_name = cls.unstructured_marqo_index_request(
             name='a-b_' + str(uuid.uuid4()).replace('-', '')
         )
@@ -1132,7 +1134,7 @@ def test_lexical_search_no_highlights_format(self):
             add_docs_params=AddDocsParams(
                 index_name=self.default_text_index,
                 docs=docs,
-                tensor_fields = []
+                tensor_fields=[]
             )
         )
         lexical_search_result = tensor_search.search(
@@ -1171,5 +1173,5 @@ def test_tensor_search_highlights_format(self):
         for hit in tensor_search_result['hits']:
             self.assertIn("_highlights", hit)
             self.assertTrue(isinstance(hit["_highlights"], list))
-            self.assertEqual(1, len(hit["_highlights"])) # We only have 1 highlight now
-            self.assertTrue(isinstance(hit["_highlights"][0], dict))
+            self.assertEqual(1, len(hit["_highlights"]))  # We only have 1 highlight now
+            self.assertTrue(isinstance(hit["_highlights"][0], dict))