From a0084a86d5cf797616a1f8e185eba87417edbc15 Mon Sep 17 00:00:00 2001
From: Raynor Chavez <raynorkirksonchavez@gmail.com>
Date: Wed, 18 Sep 2024 14:19:51 +0800
Subject: [PATCH] Multimodal patches (#971)

- Fix a bug where enabling treatUrlsAndPointersAsMedia while leaving treatUrlsAndPointersAsImages unset caused Marqo to return an error stating that treatUrlsAndPointersAsImages cannot be False when treatUrlsAndPointersAsMedia is True
- Add new video-audio model LanguageBind/Video_V1.5_FT_Audio_FT to the model registry.
- Move languagebind tests from CPU to CUDA tests
- Change the default audioPreprocessing splitLength (audio chunk length) from 20 to 10
---
 .github/workflows/unit_test_200gb_CI.yml      |  1 -
 RELEASE.md                                    |  7 +++++++
 requirements.dev.txt                          |  1 +
 requirements.txt                              |  3 ++-
 .../s2_inference/model_downloading/from_hf.py |  2 +-
 src/marqo/s2_inference/model_registry.py      | 20 ++++++++++++++-----
 .../s2_inference/multimodal_model_load.py     |  5 +++++
 src/marqo/s2_inference/s2_inference.py        |  3 ++-
 .../tensor_search/models/index_settings.py    | 14 ++++++-------
 src/marqo/version.py                          |  2 +-
 .../index_management/test_get_settings.py     | 10 ++++------
 .../model_downloading/test_from_hf.py         |  2 +-
 .../test_add_documents_combined.py            |  6 ++++--
 tests/tensor_search/integ_tests/test_embed.py |  3 ++-
 .../integ_tests/test_search_combined.py       |  6 ++++--
 tests/test_documentation.py                   |  1 -
 16 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/unit_test_200gb_CI.yml b/.github/workflows/unit_test_200gb_CI.yml
index 2a3857697..3b5958fb6 100644
--- a/.github/workflows/unit_test_200gb_CI.yml
+++ b/.github/workflows/unit_test_200gb_CI.yml
@@ -136,7 +136,6 @@ jobs:
           export VESPA_CONFIG_URL=http://localhost:19071
           export VESPA_DOCUMENT_URL=http://localhost:8080
           export VESPA_QUERY_URL=http://localhost:8080
-          export MARQO_MAX_CPU_MODEL_MEMORY=15
           
           cd marqo
           export PYTHONPATH="./tests:./src:."
diff --git a/RELEASE.md b/RELEASE.md
index 345c17930..be468ee18 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,10 @@
+# Release 2.12.1
+
+## Bug fixes and minor changes
+- Fix a bug where enabling `treatUrlsAndPointersAsMedia` while leaving `treatUrlsAndPointersAsImages` unset caused Marqo to return an error stating that `treatUrlsAndPointersAsImages` cannot be `False` when `treatUrlsAndPointersAsMedia` is `True`.
+- Add new video-audio model `LanguageBind/Video_V1.5_FT_Audio_FT` to the model registry.
+
+
 # Release 2.12.0
 
 ## New features
diff --git a/requirements.dev.txt b/requirements.dev.txt
index 0b8f2512e..039473af2 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -17,6 +17,7 @@ kazoo==2.10.0
 pycurl==7.45.3
 certifi==2019.11.28
 transformers==4.41.2
+huggingface-hub==0.25.0
 
 # s2_inference:
 more_itertools
diff --git a/requirements.txt b/requirements.txt
index f8e9cf0eb..12ae5a8cd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,5 @@ cachetools==5.3.1
 pynvml==11.5.0 # For cuda utilization
 readerwriterlock==1.0.9
 kazoo==2.10.0
-pycurl==7.45.3
\ No newline at end of file
+pycurl==7.45.3
+huggingface-hub==0.25.0
\ No newline at end of file
diff --git a/src/marqo/s2_inference/model_downloading/from_hf.py b/src/marqo/s2_inference/model_downloading/from_hf.py
index 97adcd2b2..1f68ec746 100644
--- a/src/marqo/s2_inference/model_downloading/from_hf.py
+++ b/src/marqo/s2_inference/model_downloading/from_hf.py
@@ -2,7 +2,7 @@
 from typing import Optional
 from huggingface_hub import hf_hub_download
 from marqo.s2_inference.logger import get_logger
-from huggingface_hub.utils._errors import RepositoryNotFoundError
+from huggingface_hub.errors import RepositoryNotFoundError
 from marqo.s2_inference.errors import ModelDownloadError
 
 logger = get_logger(__name__)
diff --git a/src/marqo/s2_inference/model_registry.py b/src/marqo/s2_inference/model_registry.py
index 2715e6413..cce092b7c 100644
--- a/src/marqo/s2_inference/model_registry.py
+++ b/src/marqo/s2_inference/model_registry.py
@@ -1995,7 +1995,17 @@ def _get_languagebind_properties() -> Dict:
             "model_size": 8,
             "supported_modalities": ["video", "audio", "language", "image"],
             "video_chunk_length": 20,
-            "audio_chunk_length": 20,
+            "audio_chunk_length": 10,
+        },
+        'LanguageBind/Video_V1.5_FT_Audio_FT': {
+            "name": "LanguageBind/Video_V1.5_FT_Audio_FT",
+            "dimensions": 768,
+            "type": "languagebind",
+            "loader": "languagebind",
+            "model_size": 5,
+            "supported_modalities": ["video", "audio", "language"],
+            "video_chunk_length": 20,
+            "audio_chunk_length": 10,
         },
         'LanguageBind/Video_V1.5_FT_Image': {
             "name": "LanguageBind/Video_V1.5_FT_Image",
@@ -2005,7 +2015,7 @@ def _get_languagebind_properties() -> Dict:
             "model_size": 5,
             "supported_modalities": ["video", "language", "image"],
             "video_chunk_length": 20,
-            "audio_chunk_length": 20,
+            "audio_chunk_length": 10,
         },
         'LanguageBind/Audio_FT_Image': {
             "name": "LanguageBind/Audio_FT_Image",
@@ -2015,7 +2025,7 @@ def _get_languagebind_properties() -> Dict:
             "model_size": 5,
             "supported_modalities": ["audio", "language", "image"],
             "video_chunk_length": 20,
-            "audio_chunk_length": 20,
+            "audio_chunk_length": 10,
         },
         'LanguageBind/Audio_FT': {
             "name": "LanguageBind/Audio_FT",
@@ -2025,7 +2035,7 @@ def _get_languagebind_properties() -> Dict:
             "model_size": 2,
             "supported_modalities": ["video", "language"],
             "video_chunk_length": 20,
-            "audio_chunk_length": 20,
+            "audio_chunk_length": 10,
         },
         'LanguageBind/Video_V1.5_FT': {
             "name": "LanguageBind/Video_V1.5_FT",
@@ -2035,7 +2045,7 @@ def _get_languagebind_properties() -> Dict:
             "model_size": 2,
             "supported_modalities": ["video", "language"],
             "video_chunk_length": 20,
-            "audio_chunk_length": 20,
+            "audio_chunk_length": 10,
         },
 
     }
diff --git a/src/marqo/s2_inference/multimodal_model_load.py b/src/marqo/s2_inference/multimodal_model_load.py
index 91da39fec..5fc9d03a4 100644
--- a/src/marqo/s2_inference/multimodal_model_load.py
+++ b/src/marqo/s2_inference/multimodal_model_load.py
@@ -75,6 +75,11 @@ def _load_languagebind_model(self):
                 'audio': 'LanguageBind_Audio_FT',
                 'image': 'LanguageBind_Image',
             }
+        elif self.model_name == "LanguageBind/Video_V1.5_FT_Audio_FT":
+            self.clip_type = {
+                'video': 'LanguageBind_Video_V1.5_FT',
+                'audio': 'LanguageBind_Audio_FT',
+            }
         elif self.model_name == "LanguageBind/Video_V1.5_FT_Image":
             self.clip_type = {
                 'video': 'LanguageBind_Video_V1.5_FT',
diff --git a/src/marqo/s2_inference/s2_inference.py b/src/marqo/s2_inference/s2_inference.py
index 12ea3a89b..02d347535 100644
--- a/src/marqo/s2_inference/s2_inference.py
+++ b/src/marqo/s2_inference/s2_inference.py
@@ -482,7 +482,8 @@ def _check_memory_threshold_for_model(device: str, model_size: Union[float, int]
         raise ModelCacheManagementError(
             f"You are trying to load a model with size = `{model_size}` into device = `{device}`, which is larger than the device threshold = `{threshold}`. "
             f"Marqo CANNOT find enough space for the model. Please change the threshold by adjusting the environment variables.\n"
-            f"You can find more detailed information at `https://docs.marqo.ai/0.0.21/Advanced-Usage/configuration/`.")
+            f"Please modify the threshold by setting the environment variable `MARQO_MAX_CUDA_MODEL_MEMORY` or `MARQO_MAX_CPU_MODEL_MEMORY`."
+            f"You can find more detailed information at `https://docs.marqo.ai/latest/other-resources/guides/advanced-usage/configuration/`.")
     return (used_memory + model_size) < threshold
 
 
diff --git a/src/marqo/tensor_search/models/index_settings.py b/src/marqo/tensor_search/models/index_settings.py
index bfd4b824b..cd020bbb3 100644
--- a/src/marqo/tensor_search/models/index_settings.py
+++ b/src/marqo/tensor_search/models/index_settings.py
@@ -41,7 +41,7 @@ class IndexSettings(StrictBaseModel):
         splitOverlap=3,
     )
     audioPreprocessing: core.AudioPreProcessing = core.AudioPreProcessing(
-        splitLength=20,
+        splitLength=10,
         splitOverlap=3,
     )
     vectorNumericType: core.VectorNumericType = core.VectorNumericType.Float
@@ -58,17 +58,12 @@ def validate_url_pointer_treatment(cls, values):
         treat_as_images = values.get('treatUrlsAndPointersAsImages')
         treat_as_media = values.get('treatUrlsAndPointersAsMedia')
 
-        if treat_as_images is None:
-            treat_as_images = False
-        if treat_as_media is None:
-            treat_as_media = False
-
         if treat_as_images and not treat_as_media:
             # Deprecation warning
             import warnings
             warnings.warn("'treatUrlsAndPointersAsImages' is deprecated. Use 'treatUrlsAndPointersAsMedia' instead.", DeprecationWarning)
 
-        if not treat_as_images and treat_as_media:
+        if treat_as_images == False and treat_as_media:
             raise api_exceptions.InvalidArgError(
                 "Invalid combination: 'treatUrlsAndPointersAsImages' cannot be False when 'treatUrlsAndPointersAsMedia' is True."
             )
@@ -161,7 +156,10 @@ def to_marqo_index_request(self, index_name: str) -> MarqoIndexRequest:
             if self.treatUrlsAndPointersAsImages is None:
                 # Default value for treat_urls_and_pointers_as_images is False, but we can't set it in the model
                 # as it is not a valid parameter for structured indexes
-                self.treatUrlsAndPointersAsImages = False
+                if self.treatUrlsAndPointersAsMedia is True:
+                    self.treatUrlsAndPointersAsImages = True
+                else:
+                    self.treatUrlsAndPointersAsImages = False
             
             if self.treatUrlsAndPointersAsMedia is None:
                 # Default value for treat_urls_and_pointers_as_media is False, but we can't set it in the model
diff --git a/src/marqo/version.py b/src/marqo/version.py
index 0daf82c48..1b87bdac2 100644
--- a/src/marqo/version.py
+++ b/src/marqo/version.py
@@ -1,4 +1,4 @@
-__version__ = "2.12.0"
+__version__ = "2.12.1"
 
 def get_version() -> str:
     return f"{__version__}"
diff --git a/tests/core/index_management/test_get_settings.py b/tests/core/index_management/test_get_settings.py
index a07496526..30f31ad5d 100644
--- a/tests/core/index_management/test_get_settings.py
+++ b/tests/core/index_management/test_get_settings.py
@@ -95,7 +95,7 @@ def test_default_settings(self):
                     'textPreprocessing': {'splitLength': 2,
                                         'splitMethod': TextSplitMethod.Sentence,
                                         'splitOverlap': 0},
-                    'audioPreprocessing': {'splitLength': 20, 'splitOverlap': 3},
+                    'audioPreprocessing': {'splitLength': 10, 'splitOverlap': 3},
                     'videoPreprocessing': {'splitLength': 20, 'splitOverlap': 3},
                     'treatUrlsAndPointersAsImages': False,
                     'treatUrlsAndPointersAsMedia': False,
@@ -136,7 +136,7 @@ def test_default_settings(self):
                         'splitMethod': TextSplitMethod.Sentence,
                         'splitOverlap': 0
                     },
-                    'audioPreprocessing': {'splitLength': 20, 'splitOverlap': 3},
+                    'audioPreprocessing': {'splitLength': 10, 'splitOverlap': 3},
                     'videoPreprocessing': {'splitLength': 20, 'splitOverlap': 3},
                     'type': IndexType.Structured,
                     'vectorNumericType': VectorNumericType.Float
@@ -165,7 +165,7 @@ def test_custom_settings(self):
                     'textPreprocessing': {'splitLength': 3,
                                         'splitMethod': TextSplitMethod.Word,
                                         'splitOverlap': 1},
-                    'audioPreprocessing': {'splitLength': 20, 'splitOverlap': 3},
+                    'audioPreprocessing': {'splitLength': 10, 'splitOverlap': 3},
                     'videoPreprocessing': {'splitLength': 20, 'splitOverlap': 3},
                     'treatUrlsAndPointersAsImages': False,
                     'treatUrlsAndPointersAsMedia': False,
@@ -175,7 +175,6 @@ def test_custom_settings(self):
             # Get unstructured custom settings
             retrieved_index = self.config.index_management.get_index(self.unstructured_custom_index.name)
             retrieved_settings = IndexSettings.from_marqo_index(retrieved_index).dict(exclude_none=True, by_alias=True)
-            print(f"retrieved_settings: {retrieved_settings}")
             self.assertEqual(retrieved_settings, expected_unstructured_custom_settings)
         
         with self.subTest("Structured index custom settings"):
@@ -206,7 +205,7 @@ def test_custom_settings(self):
                         'splitMethod': TextSplitMethod.Word,
                         'splitOverlap': 1
                     },
-                    'audioPreprocessing': {'splitLength': 20, 'splitOverlap': 3},
+                    'audioPreprocessing': {'splitLength': 10, 'splitOverlap': 3},
                     'videoPreprocessing': {'splitLength': 20, 'splitOverlap': 3},
                     'type': IndexType.Structured,
                     'vectorNumericType': VectorNumericType.Float
@@ -214,6 +213,5 @@ def test_custom_settings(self):
             # Get unstructured default settings
             retrieved_index = self.config.index_management.get_index(self.structured_custom_index.name)
             retrieved_settings = IndexSettings.from_marqo_index(retrieved_index).dict(exclude_none=True, by_alias=True)
-            print(f"retrieved_settings: {retrieved_settings}")
             self.assertEqual(retrieved_settings, expected_structured_custom_settings)
             
\ No newline at end of file
diff --git a/tests/s2_inference/model_downloading/test_from_hf.py b/tests/s2_inference/model_downloading/test_from_hf.py
index d799e6819..02e02cda5 100644
--- a/tests/s2_inference/model_downloading/test_from_hf.py
+++ b/tests/s2_inference/model_downloading/test_from_hf.py
@@ -3,7 +3,7 @@
 from marqo.s2_inference.errors import ModelDownloadError
 from marqo.tensor_search.models.external_apis.hf import HfAuth, HfModelLocation
 from marqo.s2_inference.model_downloading.from_hf import download_model_from_hf
-from huggingface_hub.utils._errors import RepositoryNotFoundError
+from huggingface_hub.errors import RepositoryNotFoundError
 from marqo.s2_inference.configs import ModelCache
 
 
diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py
index d5113a5fa..9d80e0fb1 100644
--- a/tests/tensor_search/integ_tests/test_add_documents_combined.py
+++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py
@@ -179,7 +179,8 @@ def test_add_document_callVectoriseWithoutPassingEnableCache(self):
                                                                "vectorise for add_documents")
                 mock_vectorise.reset_mock()
 
-    @pytest.mark.skipif(torch.cuda.is_available() is True, reason="GPU testing device needs to be investigated")
+    @pytest.mark.largemodel
+    @pytest.mark.skipif(torch.cuda.is_available() is False, reason="We skip the large model test if we don't have cuda support")
     def test_add_multimodal_single_documents(self):
         """ """
         documents = [
@@ -240,7 +241,8 @@ def test_add_multimodal_single_documents(self):
                         self.assertNotIn(embedding, embeddings, f"Duplicate embedding found in document {i}")
                         embeddings.append(embedding)
 
-    @pytest.mark.skipif(torch.cuda.is_available() is True, reason="GPU testing device needs to be investigated")
+    @pytest.mark.largemodel
+    @pytest.mark.skipif(torch.cuda.is_available() is False, reason="We skip the large model test if we don't have cuda support")
     def test_add_multimodal_field_document(self):
         multimodal_document = {
             "_id": "1_multimodal",
diff --git a/tests/tensor_search/integ_tests/test_embed.py b/tests/tensor_search/integ_tests/test_embed.py
index 5f94c1652..3ab73abe1 100644
--- a/tests/tensor_search/integ_tests/test_embed.py
+++ b/tests/tensor_search/integ_tests/test_embed.py
@@ -272,7 +272,8 @@ def test_embed_image_url_as_image_not_text(self):
                                         msg=f"Mismatch at index {i} for {index.type}")
                     
 
-    @pytest.mark.skipif(torch.cuda.is_available() is True, reason="Skip this test if we have cuda support.")
+    @pytest.mark.largemodel
+    @pytest.mark.skipif(torch.cuda.is_available() is False, reason="We skip the large model test if we don't have cuda support")
     def test_embed_languagebind(self):
         content = [
             #TestImageUrls.HIPPO_REALISTIC.value, # image
diff --git a/tests/tensor_search/integ_tests/test_search_combined.py b/tests/tensor_search/integ_tests/test_search_combined.py
index 046bdea5d..4e2839ec0 100644
--- a/tests/tensor_search/integ_tests/test_search_combined.py
+++ b/tests/tensor_search/integ_tests/test_search_combined.py
@@ -198,7 +198,8 @@ def tearDown(self) -> None:
         super().tearDown()
         self.device_patcher.stop()
 
-    @pytest.mark.skipif(torch.cuda.is_available() is True, reason="We skip this test if we have cuda support. This model is 5gb and is very slow on g4dn.xlarge and may crash it")
+    @pytest.mark.largemodel
+    @pytest.mark.skipif(torch.cuda.is_available() is False, reason="We skip the large model test if we don't have cuda support")
     def test_search_video(self):
         documents = [
             {"video_field_1": "https://marqo-k400-video-test-dataset.s3.amazonaws.com/videos/---QUuC4vJs_000084_000094.mp4", "_id": "1"},
@@ -232,7 +233,8 @@ def test_search_video(self):
                 self.assertEqual(results['hits'][0]['_id'], "1")  # The video document should be the top result
                 self.assertGreater(results['hits'][0]['_score'], results['hits'][1]['_score'])  # Video should have higher score
 
-    @pytest.mark.skipif(torch.cuda.is_available() is True, reason="We skip this test if we have cuda support. This model is 5gb and is very slow on g4dn.xlarge and may crash it")
+    @pytest.mark.largemodel
+    @pytest.mark.skipif(torch.cuda.is_available() is False, reason="We skip the large model test if we don't have cuda support")
     def test_search_audio(self):
         documents = [
             {"video_field_1": "https://marqo-k400-video-test-dataset.s3.amazonaws.com/videos/---QUuC4vJs_000084_000094.mp4", "_id": "1"},
diff --git a/tests/test_documentation.py b/tests/test_documentation.py
index 9767a82b7..615094205 100644
--- a/tests/test_documentation.py
+++ b/tests/test_documentation.py
@@ -5,7 +5,6 @@
 
 from marqo import marqo_docs
 
-@pytest.mark.skip(reason="skipping since we have a CI pipeline for this")
 class TestDocumentation(unittest.TestCase):
     def test_urls(self):
         # Retrieve all public functions in the module