From da7a581250d655b73f4073194dd5422149a06e3d Mon Sep 17 00:00:00 2001 From: Pandu Oliver Kerr Date: Wed, 9 Nov 2022 19:04:47 +1100 Subject: [PATCH 1/2] passing non_tensor_fields to _batch --- src/marqo/tensor_search/tensor_search.py | 6 +-- tests/tensor_search/test_add_documents.py | 65 +++++++++++++++++++++++ 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index 63586d7cc..5c2d7f990 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -254,12 +254,12 @@ def add_documents_orchestrator( raise errors.InvalidArgError("Batch size can't be less than 1!") logger.info(f"batch_size={batch_size} and processes={processes} - batching using a single process") return _batch_request(config=config, index_name=index_name, dataset=docs, device=device, - batch_size=batch_size, verbose=False) + batch_size=batch_size, verbose=False, non_tensor_fields=non_tensor_fields) def _batch_request(config: Config, index_name: str, dataset: List[dict], batch_size: int = 100, verbose: bool = True, device=None, - update_mode: str = 'replace') -> List[Dict[str, Any]]: + update_mode: str = 'replace', non_tensor_fields = []) -> List[Dict[str, Any]]: """Batch by the number of documents""" logger.info(f"starting batch ingestion in sizes of {batch_size}") @@ -280,7 +280,7 @@ def verbosely_add_docs(i, docs): res = add_documents( config=config, index_name=index_name, docs=docs, auto_refresh=False, device=device, - update_mode=update_mode + update_mode=update_mode, non_tensor_fields=non_tensor_fields ) total_batch_time = datetime.datetime.now() - t0 num_docs = len(docs) diff --git a/tests/tensor_search/test_add_documents.py b/tests/tensor_search/test_add_documents.py index 976cbe3c8..f8554e288 100644 --- a/tests/tensor_search/test_add_documents.py +++ b/tests/tensor_search/test_add_documents.py @@ -955,3 +955,68 @@ def run(): assert items[0]['result'] in ['created', 'updated'] return True assert run() + + def test_no_tensor_field_replace(self): + # test replace and update workflows + tensor_search.add_documents( + self.config, + docs=[{"_id": "123", "myfield": "mydata", "myfield2": "mydata2"}], + auto_refresh=True, index_name=self.index_name_1 + ) + tensor_search.add_documents( + self.config, + docs=[{"_id": "123", "myfield": "mydata"}], + auto_refresh=True, index_name=self.index_name_1, + non_tensor_fields=["myfield"] + ) + doc_w_facets = tensor_search.get_document_by_id( + self.config, index_name=self.index_name_1, document_id='123', show_vectors=True) + assert doc_w_facets[TensorField.tensor_facets] == [] + assert 'myfield2' not in doc_w_facets + + def test_no_tensor_field_update(self): + # test replace and update workflows + tensor_search.add_documents( + self.config, + docs=[{"_id": "123", "myfield": "mydata", "myfield2": "mydata2"}], + auto_refresh=True, index_name=self.index_name_1 + ) + tensor_search.add_documents( + self.config, + docs=[{"_id": "123", "myfield": "mydata"}], + auto_refresh=True, index_name=self.index_name_1, + non_tensor_fields=["myfield"], update_mode='update' + ) + doc_w_facets = tensor_search.get_document_by_id( + self.config, index_name=self.index_name_1, document_id='123', show_vectors=True) + assert len(doc_w_facets[TensorField.tensor_facets]) == 1 + assert 'myfield2' in doc_w_facets[TensorField.tensor_facets][0] + assert 'myfield' in doc_w_facets + assert 'myfield2' in doc_w_facets + + def test_no_tensor_field_on_empty_ix(self): + tensor_search.add_documents( + self.config, + docs=[{"_id": "123", "myfield": "mydata"}], + auto_refresh=True, index_name=self.index_name_1, + non_tensor_fields=["myfield"] + ) + doc_w_facets = tensor_search.get_document_by_id( + self.config, index_name=self.index_name_1, document_id='123', show_vectors=True) + assert doc_w_facets[TensorField.tensor_facets] == [] + assert 'myfield' in doc_w_facets + + def test_no_tensor_field_on_empty_ix_other_field(self): + tensor_search.add_documents( + self.config, + docs=[{"_id": "123", "myfield": "mydata", "myfield2": "mydata"}], + auto_refresh=True, index_name=self.index_name_1, + non_tensor_fields=["myfield"] + ) + doc_w_facets = tensor_search.get_document_by_id( + self.config, index_name=self.index_name_1, document_id='123', show_vectors=True) + assert len(doc_w_facets[TensorField.tensor_facets]) == 1 + assert 'myfield2' in doc_w_facets[TensorField.tensor_facets][0] + assert 'myfield' not in doc_w_facets[TensorField.tensor_facets][0] + assert 'myfield' in doc_w_facets + assert 'myfield2' in doc_w_facets From f97a6ee89fc2870a42a116ec9d6ac367311f2844 Mon Sep 17 00:00:00 2001 From: Pandu Oliver Kerr Date: Wed, 9 Nov 2022 19:25:11 +1100 Subject: [PATCH 2/2] minor changes --- src/marqo/tensor_search/tensor_search.py | 2 +- tests/tensor_search/test_add_documents.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index 5c2d7f990..f93fc7be6 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -259,7 +259,7 @@ def add_documents_orchestrator( def _batch_request(config: Config, index_name: str, dataset: List[dict], batch_size: int = 100, verbose: bool = True, device=None, - update_mode: str = 'replace', non_tensor_fields = []) -> List[Dict[str, Any]]: + update_mode: str = 'replace', non_tensor_fields: List[str] = []) -> List[Dict[str, Any]]: """Batch by the number of documents""" logger.info(f"starting batch ingestion in sizes of {batch_size}") diff --git a/tests/tensor_search/test_add_documents.py b/tests/tensor_search/test_add_documents.py index f8554e288..230f88248 100644 --- a/tests/tensor_search/test_add_documents.py +++ b/tests/tensor_search/test_add_documents.py @@ -11,6 +11,7 @@ from marqo.errors import IndexNotFoundError, InvalidArgError, BadRequestError from marqo.tensor_search import tensor_search, index_meta_cache, backend from tests.marqo_test import MarqoTestCase +import time class TestAddDocuments(MarqoTestCase): @@ -24,6 +25,14 @@ def setUp(self) -> None: except IndexNotFoundError as s: pass + def tearDown(self) -> None: + self.index_name_1 = "my-test-index-1" + try: + tensor_search.delete_index(config=self.config, index_name=self.index_name_1) + except IndexNotFoundError as s: + pass + + def _match_all(self, index_name, verbose=True): """Helper function""" res = requests.get( @@ -876,7 +885,7 @@ def test_put_documents_orchestrator(self): {"_id": "789", "Temp": 12.5}, ], auto_refresh=True, update_mode='update', processes=4, batch_size=1) - + time.sleep(3) updated_doc = tensor_search.get_document_by_id( config=self.config, index_name=self.index_name_1, document_id='789' )