From deaecf1b0ee14e3663e7acad046e8428b7b94aae Mon Sep 17 00:00:00 2001 From: Li Wan Date: Wed, 23 Oct 2024 21:43:41 +1100 Subject: [PATCH 01/29] Finish add documents --- src/marqo/api/models/add_docs_objects.py | 20 ++++ src/marqo/core/models/add_docs_params.py | 4 +- .../s2_inference/multimodal_model_load.py | 12 +- src/marqo/tensor_search/add_docs.py | 108 ++++++++++-------- src/marqo/tensor_search/web/api_utils.py | 16 +-- 5 files changed, 100 insertions(+), 60 deletions(-) diff --git a/src/marqo/api/models/add_docs_objects.py b/src/marqo/api/models/add_docs_objects.py index 5d7a33695..2174753e6 100644 --- a/src/marqo/api/models/add_docs_objects.py +++ b/src/marqo/api/models/add_docs_objects.py @@ -22,6 +22,7 @@ class Config: tensorFields: Optional[List] = None useExistingTensors: bool = False imageDownloadHeaders: dict = Field(default_factory=dict) + mediaDownloadHeaders: Optional[dict] = None modelAuth: Optional[ModelAuth] = None mappings: Optional[dict] = None documents: Union[Sequence[Union[dict, Any]], np.ndarray] @@ -38,3 +39,22 @@ def validate_thread_counts(cls, values): if media_count is not None and image_count != read_env_vars_and_defaults_ints(EnvVars.MARQO_IMAGE_DOWNLOAD_THREAD_COUNT_PER_REQUEST): raise ValueError("Cannot set both imageDownloadThreadCount and mediaDownloadThreadCount") return values + + @root_validator(skip_on_failure=True) + def _validate_image_download_headers_and_media_download_headers(cls, values): + """Validate imageDownloadHeaders and mediaDownloadHeaders. Raise an error if both are set. + + If imageDownloadHeaders is set, set mediaDownloadHeaders to it and use mediaDownloadHeaders in the + rest of the code. + + imageDownloadHeaders is deprecated and will be removed in the future. + """ + image_download_headers = values.get('imageDownloadHeaders') + media_download_headers = values.get('mediaDownloadHeaders') + if image_download_headers and media_download_headers: + raise ValueError("Cannot set both imageDownloadHeaders and mediaDownloadHeaders. " + "The imageDownloadHeaders is deprecated and will be removed in the future. 
" + "Use mediaDownloadHeaders instead.") + if image_download_headers: + values['mediaDownloadHeaders'] = image_download_headers + return values diff --git a/src/marqo/core/models/add_docs_params.py b/src/marqo/core/models/add_docs_params.py index 557bf166b..66cf12185 100644 --- a/src/marqo/core/models/add_docs_params.py +++ b/src/marqo/core/models/add_docs_params.py @@ -31,7 +31,7 @@ class AddDocsParams(BaseModel): device: Device used to carry out the document update, if `None` is given, it will be determined by EnvVars.MARQO_BEST_AVAILABLE_DEVICE image_download_thread_count: number of threads used to concurrently download images - image_download_headers: headers to authenticate image download + media_download_headers: headers to authenticate media download requests mappings: a dictionary used to handle all the object field content in the doc, e.g., multimodal_combination field model_auth: an object used to authorise downloading an object from a datastore @@ -53,7 +53,7 @@ class Config: image_download_thread_count: int = Field(default_factory=lambda: read_env_vars_and_defaults_ints( EnvVars.MARQO_IMAGE_DOWNLOAD_THREAD_COUNT_PER_REQUEST)) media_download_thread_count: Optional[int] - image_download_headers: dict = Field(default_factory=dict) + media_download_headers: Optional[dict] = None use_existing_tensors: bool = False mappings: Optional[dict] = None model_auth: Optional[ModelAuth] = None diff --git a/src/marqo/s2_inference/multimodal_model_load.py b/src/marqo/s2_inference/multimodal_model_load.py index 173630c22..0c7ed9431 100644 --- a/src/marqo/s2_inference/multimodal_model_load.py +++ b/src/marqo/s2_inference/multimodal_model_load.py @@ -11,10 +11,11 @@ from pydantic import BaseModel from enum import Enum from abc import ABC, abstractmethod -from typing import List, Dict, Any, Union +from typing import List, Dict, Any, Union, Optional from PIL.Image import Image import torch from urllib.parse import quote +from marqo.core.inference.image_download import DEFAULT_HEADERS from marqo.s2_inference.multimodal_model_load import * @@ -130,8 +131,9 @@ def encode(self, content, modality, **kwargs): @contextmanager -def fetch_content_sample(url, sample_size=10240): # 10 KB - response = requests.get(url, stream=True) +def fetch_content_sample(url, media_download_headers: Optional[dict] = None, sample_size=10240): # 10 KB + # It's ok to pass None to requests.get() for headers and it won't change the default headers + response = requests.get(url, stream=True, headers=media_download_headers) buffer = io.BytesIO() try: for chunk in response.iter_content(chunk_size=min(sample_size, 8192)): @@ -145,7 +147,7 @@ def fetch_content_sample(url, sample_size=10240): # 10 KB response.close() -def infer_modality(content: Union[str, List[str], bytes]) -> Modality: +def infer_modality(content: Union[str, List[str], bytes], media_download_headers: Optional[dict] = None) -> Modality: """ Infer the modality of the content. Video, audio, image or text. 
""" @@ -167,7 +169,7 @@ def infer_modality(content: Union[str, List[str], bytes]) -> Modality: if validate_url(encoded_url): # Use context manager to handle content sample try: - with fetch_content_sample(encoded_url) as sample: + with fetch_content_sample(encoded_url, media_download_headers) as sample: mime = magic.from_buffer(sample.read(), mime=True) if mime.startswith('image/'): return Modality.IMAGE diff --git a/src/marqo/tensor_search/add_docs.py b/src/marqo/tensor_search/add_docs.py index a8dbded3d..c87a7c78f 100644 --- a/src/marqo/tensor_search/add_docs.py +++ b/src/marqo/tensor_search/add_docs.py @@ -42,7 +42,7 @@ def threaded_download_and_preprocess_content(allocated_docs: List[dict], image_download_headers: dict, device: str = None, media_field_types_mapping: Optional[Dict[str, FieldType]] = None, - download_headers: Optional[Dict] = None, # Optional for now + media_download_headers: Optional[Dict] = None, metric_obj: Optional[RequestMetrics] = None, preprocessors: Optional[Dict[str, Compose]] = None, marqo_index_type: Optional[IndexType] = None, @@ -59,7 +59,7 @@ def threaded_download_and_preprocess_content(allocated_docs: List[dict], image_repo: dictionary that will be mutated by this thread. It will add PIL images as values and the URLs as keys tensor_fields: A tuple of tensor_fields. Images will be downloaded for these fields only. - image_download_headers: A dict of headers for image download. Can be used + media_download_headers: A dict of headers for image download. Can be used to authenticate image downloads force_download: If True, skip the _is_image check and download the fields as images. Side Effects: @@ -93,7 +93,7 @@ def threaded_download_and_preprocess_content(allocated_docs: List[dict], continue if isinstance(doc[field], str) or force_download: try: - inferred_modality = infer_modality(doc[field]) + inferred_modality = infer_modality(doc[field], media_download_headers) except MediaDownloadError as e: if is_structured_index and media_field_types_mapping[field] == FieldType.ImagePointer: # Continue processing for structured indexes with image fields @@ -222,24 +222,24 @@ def download_and_preprocess_multimedia_content( ) -> ContextManager[dict]: thread_count = _determine_thread_count(marqo_index, add_docs_params) - media_repo = process_batch(docs=docs, - thread_count=thread_count, - tensor_fields=list(media_field_types_mapping.keys()), - media_field_types_mapping=media_field_types_mapping, - image_download_headers=add_docs_params.image_download_headers, - download_headers=None, # TODO verify if this is used - marqo_index_type=marqo_index.type, - device=add_docs_params.device, - marqo_index_model=marqo_index.model, - model_name=marqo_index.model.name, - model_properties=marqo_index.model.properties, - normalize_embeddings=marqo_index.normalize_embeddings, - model_auth=add_docs_params.model_auth, - patch_method_exists=marqo_index.image_preprocessing.patch_method is not None, - audio_preprocessing=marqo_index.audio_preprocessing, - video_preprocessing=marqo_index.video_preprocessing, - force_download=False, # TODO verify if this is used - ) + media_repo = process_batch( + docs=docs, + thread_count=thread_count, + tensor_fields=list(media_field_types_mapping.keys()), + media_field_types_mapping=media_field_types_mapping, + media_download_headers=add_docs_params.media_download_headers, + marqo_index_type=marqo_index.type, + device=add_docs_params.device, + marqo_index_model=marqo_index.model, + model_name=marqo_index.model.name, + 
model_properties=marqo_index.model.properties, + normalize_embeddings=marqo_index.normalize_embeddings, + model_auth=add_docs_params.model_auth, + patch_method_exists=marqo_index.image_preprocessing.patch_method is not None, + audio_preprocessing=marqo_index.audio_preprocessing, + video_preprocessing=marqo_index.video_preprocessing, + force_download=False, # TODO verify if this is used + ) try: yield media_repo @@ -293,7 +293,7 @@ def download_and_preprocess_content(docs: List[dict], thread_count: int, tensor_ model_name: str, normalize_embeddings: bool, media_field_types_mapping: Optional[Dict[str, FieldType]], - download_headers: Optional[Dict] = None, # Optional for now + media_download_headers: Optional[Dict] = None, # Optional for now model_properties: Optional[Dict] = None, model_auth: Optional[ModelAuth] = None, device: Optional[str] = None, @@ -305,11 +305,25 @@ def download_and_preprocess_content(docs: List[dict], thread_count: int, tensor_ force_download: bool = False ) -> ContextManager[dict]: media_repo = {} # for image/video/audio - media_repo = process_batch(docs, thread_count, tensor_fields, image_download_headers, - model_name, normalize_embeddings, force_download, - media_field_types_mapping, download_headers, model_properties, model_auth, - device, patch_method_exists, marqo_index_type, marqo_index_model, - audio_preprocessing, video_preprocessing) + media_repo = process_batch( + docs = docs, + thread_count = thread_count, + tensor_fields = tensor_fields, + image_download_headers = image_download_headers, + model_name = model_name, + normalize_embeddings = normalize_embeddings, + force_download = force_download, + media_field_types_mapping = media_field_types_mapping, + media_download_headers = media_download_headers, + model_properties = model_properties, + model_auth = model_auth, + device = device, + patch_method_exists = patch_method_exists, + marqo_index_type = marqo_index_type, + marqo_index_model = marqo_index_model, + audio_preprocessing = audio_preprocessing, + video_preprocessing = video_preprocessing + ) try: yield media_repo @@ -325,11 +339,13 @@ def download_and_preprocess_content(docs: List[dict], thread_count: int, tensor_ def process_batch(docs: List[dict], thread_count: int, tensor_fields: List[str], image_download_headers: dict, model_name: str, normalize_embeddings: bool, force_download: bool, media_field_types_mapping: Optional[Dict[str, FieldType]], - download_headers: Optional[Dict], model_properties: Optional[Dict], + model_properties: Optional[Dict], model_auth: Optional[ModelAuth], device: Optional[str], patch_method_exists: bool, marqo_index_type: Optional[IndexType], marqo_index_model: Optional[Model], + media_download_headers: Optional[Dict] = None, audio_preprocessing: Optional[AudioPreProcessing] = None, video_preprocessing: Optional[VideoPreProcessing] = None) -> dict: + docs_per_thread = math.ceil(len(docs) / thread_count) copied = copy.deepcopy(docs) @@ -349,25 +365,27 @@ def process_batch(docs: List[dict], thread_count: int, tensor_fields: List[str], # Consider replacing below with: # thread_allocated_docs = [copied[i: i + docs_per_thread] for i in range(0, len(copied), docs_per_thread)] thread_allocated_docs = [copied[i: i + docs_per_thread] for i in range(len(copied))[::docs_per_thread]] - download_headers = download_headers if download_headers else {} with ThreadPoolExecutor(max_workers=len(thread_allocated_docs)) as executor: - futures = [executor.submit(threaded_download_and_preprocess_content, - allocation, - media_repo, - 
tensor_fields, - image_download_headers, - device, - media_field_types_mapping, - download_headers, - m[i], - preprocessors, - marqo_index_type, - marqo_index_model, - audio_preprocessing, - video_preprocessing, - force_download) - for i, allocation in enumerate(thread_allocated_docs)] + futures = [ + executor.submit( + threaded_download_and_preprocess_content, + allocation, + media_repo, + tensor_fields, + image_download_headers, + device, + media_field_types_mapping, + media_download_headers, + m[i], + preprocessors, + marqo_index_type, + marqo_index_model, + audio_preprocessing, + video_preprocessing, + force_download) + for i, allocation in enumerate(thread_allocated_docs) + ] # Unhandled exceptions will be raised here. # We only raise the first exception if there are multiple exceptions diff --git a/src/marqo/tensor_search/web/api_utils.py b/src/marqo/tensor_search/web/api_utils.py index 0c2ab4d4e..1db7cea68 100644 --- a/src/marqo/tensor_search/web/api_utils.py +++ b/src/marqo/tensor_search/web/api_utils.py @@ -50,27 +50,27 @@ def translate_api_device(device: Optional[str]) -> Optional[str]: f"Acceptable device types: {acceptable_devices}") -def decode_image_download_headers(image_download_headers: Optional[str] = None) -> dict: +def decode_media_download_headers(media_download_headers: Optional[str] = None) -> dict: """Decodes an image download header string into a Python dict Args: - image_download_headers: JSON-serialised, URL encoded header dictionary + media_download_headers: JSON-serialised, URL encoded header dictionary Returns: - image_download_headers as a dict + media_download_headers as a dict Raises: InvalidArgError if there is trouble parsing the dictionary """ - if not image_download_headers: + if not media_download_headers: return dict() else: try: - as_str = urllib.parse.unquote_plus(image_download_headers) + as_str = urllib.parse.unquote_plus(media_download_headers) as_dict = json.loads(as_str) return as_dict except json.JSONDecodeError as e: - raise InvalidArgError(f"Error parsing image_download_headers. Message: {e}") + raise InvalidArgError(f"Error parsing media_download_headers. 
Message: {e}") def decode_query_string_model_auth(model_auth: Optional[str] = None) -> Optional[ModelAuth]: @@ -130,14 +130,14 @@ def add_docs_params_orchestrator(index_name: str, body: Union[AddDocsBodyParams, tensor_fields = body.tensorFields use_existing_tensors = body.useExistingTensors model_auth = body.modelAuth - image_download_headers = body.imageDownloadHeaders + media_download_headers = body.mediaDownloadHeaders image_download_thread_count = body.imageDownloadThreadCount text_chunk_prefix = body.textChunkPrefix return AddDocsParams( index_name=index_name, docs=docs, device=device, tensor_fields=tensor_fields, - use_existing_tensors=use_existing_tensors, image_download_headers=image_download_headers, + use_existing_tensors=use_existing_tensors, media_download_headers=media_download_headers, image_download_thread_count=image_download_thread_count, mappings=mappings, model_auth=model_auth, text_chunk_prefix=text_chunk_prefix, batch_vectorisation_mode=body.batchVectorisationMode, From 8e1bf32a99a1bef8572b638f3b4cea84c44b0738 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 01:28:44 +1100 Subject: [PATCH 02/29] Finish search --- src/marqo/s2_inference/s2_inference.py | 84 +++++++--- src/marqo/tensor_search/api.py | 2 +- src/marqo/tensor_search/models/api_models.py | 25 ++- src/marqo/tensor_search/models/search.py | 43 ++++- src/marqo/tensor_search/tensor_search.py | 157 ++++++++++--------- 5 files changed, 206 insertions(+), 105 deletions(-) diff --git a/src/marqo/s2_inference/s2_inference.py b/src/marqo/s2_inference/s2_inference.py index fc97d5300..874565efa 100644 --- a/src/marqo/s2_inference/s2_inference.py +++ b/src/marqo/s2_inference/s2_inference.py @@ -47,8 +47,28 @@ def vectorise(model_name: str, content: Union[str, List[str], List[Image], List[bytes]], model_properties: dict = None, - device: str = None, normalize_embeddings: bool = get_default_normalization(), - model_auth: ModelAuth = None, enable_cache: bool = False, modality: Modality = Modality.TEXT, **kwargs,) -> List[List[float]]: + device: str = None, + normalize_embeddings: bool = get_default_normalization(), + model_auth: ModelAuth = None, + enable_cache: bool = False, + modality: Modality = Modality.TEXT, + media_download_headers: Optional[Dict] = None, + infer: bool = False + ) -> List[List[float]]: + """Vectorise the given content using the given model. + + Args: + model_name: The name of the model to use. + content: The content to vectorise. + model_properties: The properties of the model to use. + device: The device to use. + normalize_embeddings: Whether to normalize the embeddings. + model_auth: The model authorisation details. + enable_cache: Whether to enable the inference cache. + modality: The modality of the content. + media_download_headers: The media download headers. + infer: Whether to infer the modality. Deprecated and should be replaced by modality. 
+ """ if not device: raise InternalError(message=f"vectorise (internal function) cannot be called without setting device!") @@ -63,25 +83,37 @@ def vectorise(model_name: str, content: Union[str, List[str], List[Image], List[ model = _available_models[model_cache_key][AvailableModelsKey.model] if _marqo_inference_cache.is_enabled() and enable_cache: - return _vectorise_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs) + return _vectorise_with_cache( + model, model_cache_key, content, normalize_embeddings, modality, + media_download_headers, infer + ) else: - return _vectorise_without_cache(model_cache_key, content, normalize_embeddings, modality, **kwargs) + return _vectorise_without_cache( + model_cache_key, content, normalize_embeddings, modality, + media_download_headers, infer + ) -def _vectorise_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs): +def _vectorise_with_cache(model, model_cache_key: str, content, normalize_embeddings: bool, modality: Modality, + media_download_headers: Optional[Dict], infer: bool): if isinstance(content, str): vectorised = _marqo_inference_cache.get(model_cache_key, content) if vectorised is None: - vectorised = _encode_without_cache(model_cache_key, content, normalize_embeddings, modality, **kwargs) + vectorised = _encode_without_cache( + model_cache_key, content, normalize_embeddings, modality, media_download_headers, infer + ) _marqo_inference_cache.set(model_cache_key, content, vectorised[0]) else: vectorised = _convert_cached_embeddings_to_output(vectorised) return vectorised elif isinstance(content, list): - return _vectorise_list_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs) + return _vectorise_list_with_cache( + model, model_cache_key, content, normalize_embeddings, modality, media_download_headers, infer + ) else: raise TypeError(f"Unsupported content type: {type(content).__name__}") -def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs): +def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embeddings, modality, + media_download_headers, infer): contents_to_vectorise = [] cached_output = [] @@ -97,7 +129,8 @@ def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embedd contents_to_vectorise.append(content_item) if contents_to_vectorise: - vectorised_outputs = _encode_without_cache(model_cache_key, contents_to_vectorise, normalize_embeddings, modality, **kwargs) + vectorised_outputs = _encode_without_cache( + model_cache_key, contents_to_vectorise, normalize_embeddings, modality, media_download_headers, infer) # Cache the vectorised outputs for content_item, vectorised_output in zip(contents_to_vectorise, vectorised_outputs): if isinstance(content_item, str): @@ -110,20 +143,32 @@ def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embedd return vectorised_outputs -def _vectorise_without_cache(model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], - normalize_embeddings: bool, modality: Modality, **kwargs) -> List[List[float]]: - return _encode_without_cache(model_cache_key, content, normalize_embeddings, modality, **kwargs) -def _encode_without_cache(model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], - normalize_embeddings: bool, modality: Modality, **kwargs) -> List[List[float]]: +def _vectorise_without_cache( + model_cache_key: str, content: Union[str, List[str], 
List[Image], List[bytes]], + normalize_embeddings: bool, modality: Modality, + media_download_headers: Optional[Dict], infer: bool +) -> List[List[float]]: + return _encode_without_cache(model_cache_key, content, normalize_embeddings, modality, media_download_headers, infer) + +def _encode_without_cache( + model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], + normalize_embeddings: bool, modality: Modality, media_download_headers: Optional[Dict], infer: bool) \ + -> List[List[float]]: try: model = _available_models[model_cache_key][AvailableModelsKey.model] encoder = get_encoder(model) if isinstance(content, str): - vectorised = model.encode(content, normalize=normalize_embeddings, modality=modality, **kwargs) + vectorised = model.encode( + content, normalize=normalize_embeddings, modality=modality, + media_download_headers=media_download_headers, infer=infer + ) elif isinstance(content, (torch.Tensor, torch.FloatTensor)): - vectorised = model.encode(content, normalize=normalize_embeddings, modality=modality, **kwargs) + vectorised = model.encode( + content, normalize=normalize_embeddings, modality=modality, + media_download_headers=media_download_headers, infer=infer + ) else: vector_batches = [] batch_size = _get_max_vectorise_batch_size() @@ -133,9 +178,10 @@ def _encode_without_cache(model_cache_key: str, content: Union[str, List[str], L modality = infer_modality(batch[0] if isinstance(batch[0], (str, bytes)) else batch) # TODO maybe the infer parameter can be replaced by modality - infer = kwargs.pop('infer', False if modality == Modality.TEXT else True) - encoded_batch = encoder.encode(batch, modality=modality, normalize=normalize_embeddings, - infer=infer, **kwargs) + encoded_batch = encoder.encode( + batch, modality=modality, normalize=normalize_embeddings, + infer=infer, media_download_headers=media_download_headers + ) vector_batches.append(_convert_tensor_to_numpy(encoded_batch)) diff --git a/src/marqo/tensor_search/api.py b/src/marqo/tensor_search/api.py index 556ae4ed6..18d61776e 100644 --- a/src/marqo/tensor_search/api.py +++ b/src/marqo/tensor_search/api.py @@ -277,7 +277,7 @@ def search(search_query: SearchQuery, index_name: str, device: str = Depends(api reranker=search_query.reRanker, filter=search_query.filter, device=device, attributes_to_retrieve=search_query.attributesToRetrieve, boost=search_query.boost, - image_download_headers=search_query.image_download_headers, + media_download_headers = search_query.mediaDownloadHeaders, context=search_query.context, score_modifiers=search_query.scoreModifiers, model_auth=search_query.modelAuth, diff --git a/src/marqo/tensor_search/models/api_models.py b/src/marqo/tensor_search/models/api_models.py index 0c853e557..d688e55f0 100644 --- a/src/marqo/tensor_search/models/api_models.py +++ b/src/marqo/tensor_search/models/api_models.py @@ -7,7 +7,7 @@ from typing import Union, List, Dict, Optional import pydantic -from pydantic import BaseModel, root_validator, validator +from pydantic import BaseModel, root_validator, validator, Field from marqo.base_model import ImmutableStrictBaseModel from marqo.core.models.hybrid_parameters import HybridParameters @@ -47,7 +47,8 @@ class SearchQuery(BaseMarqoModel): filter: str = None attributesToRetrieve: Union[None, List[str]] = None boost: Optional[Dict] = None - image_download_headers: Optional[Dict] = None + imageDownloadHeaders: Optional[Dict] = Field(default_factory=None, alias="image_download_headers") + mediaDownloadHeaders: Optional[Dict] = None context: 
Optional[SearchContext] = None scoreModifiers: Optional[ScoreModifierLists] = None modelAuth: Optional[ModelAuth] = None @@ -68,6 +69,26 @@ def _preprocess_search_method(cls, value): else: return value + @root_validator(skip_on_failure=True) + def _validate_image_download_headers_and_media_download_headers(cls, values): + """Validate imageDownloadHeaders and mediaDownloadHeaders. Raise an error if both are set. + + If imageDownloadHeaders is set, set mediaDownloadHeaders to it and use mediaDownloadHeaders in the + rest of the code. + + imageDownloadHeaders is deprecated and will be removed in the future. + """ + image_download_headers = values.get('imageDownloadHeaders') + media_download_headers = values.get('mediaDownloadHeaders') + if image_download_headers and media_download_headers: + raise ValueError("Cannot set both imageDownloadHeaders(image_download_headers) and mediaDownloadHeaders. " + "The imageDownloadHeaders(image_download_headers) is deprecated and will be removed in the future. " + "Use mediaDownloadHeaders instead.") + if image_download_headers: + values['mediaDownloadHeaders'] = image_download_headers + return values + + @root_validator(pre=False, skip_on_failure=True) def validate_query_and_context(cls, values): """Validate that one of query and context are present for tensor/hybrid search, or just the query for lexical search. diff --git a/src/marqo/tensor_search/models/search.py b/src/marqo/tensor_search/models/search.py index 4d8c5a74c..0a7101eb8 100644 --- a/src/marqo/tensor_search/models/search.py +++ b/src/marqo/tensor_search/models/search.py @@ -4,7 +4,9 @@ from typing import Any, Union, List, Dict, Optional, NewType, Literal from marqo.api.exceptions import InvalidArgError +from marqo.core.models import MarqoQuery from marqo.tensor_search.models.private_models import ModelAuth +from marqo.s2_inference.multimodal_model_load import Modality Qidx = NewType('Qidx', int) # Indicates the position of a search query in a bulk search request JHash = NewType('JHash', int) # hash of a VectoriseJob. 
Used for quick access of VectorisedJobs @@ -26,25 +28,25 @@ class VectorisedJobs(BaseModel): content: List[Union[str, List[str]]] device: str normalize_embeddings: bool - image_download_headers: Optional[Dict] - content_type: Literal['text', 'media'] + media_download_headers: Optional[Dict] model_auth: Optional[ModelAuth] + modality: Modality def __hash__(self): return self.groupby_key() + hash(json.dumps(self.content, sort_keys=True)) def groupby_key(self) -> JHash: return VectorisedJobs.get_groupby_key(self.model_name, self.model_properties, self.device, - self.normalize_embeddings, self.content_type, - self.image_download_headers) + self.normalize_embeddings, self.modality, + self.media_download_headers) @staticmethod def get_groupby_key(model_name: str, model_properties: Dict[str, Any], device: str, - normalize_embeddings: bool, content_type: str, image_download_headers: Optional[Dict]) -> JHash: + normalize_embeddings: bool, modality: str, media_download_headers: Optional[Dict]) -> JHash: return JHash(hash(model_name) + hash(json.dumps(model_properties, sort_keys=True)) + hash(device) + hash(normalize_embeddings) - + hash(content_type) - + hash(json.dumps(image_download_headers, sort_keys=True)) + + hash(modality) + + hash(json.dumps(media_download_headers, sort_keys=True)) ) def add_content(self, content: List[Union[str, List[str]]]) -> VectorisedJobPointer: @@ -75,4 +77,29 @@ def __init__(self, **data): def check_vector_length(cls, v): if not (1 <= len(v) <= 64): raise InvalidArgError('The number of tensors must be between 1 and 64') - return v \ No newline at end of file + return v + + +class QueryContent(BaseModel): + content: str + modality: Modality + + +class QueryContentCollector(BaseModel): + queries: List[QueryContent] + @property + def text_queries(self) -> List[QueryContent]: + return [q for q in self.queries if q.modality == Modality.TEXT] + + @property + def image_queries(self) -> List[QueryContent]: + return [q for q in self.queries if q.modality == Modality.IMAGE] + + @property + def video_queries(self) -> List[QueryContent]: + return [q for q in self.queries if q.modality == Modality.VIDEO] + + @property + def audio_queries(self) -> List[QueryContent]: + return [q for q in self.queries if q.modality == Modality.AUDIO] + \ No newline at end of file diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index a15646254..3addc0e62 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -90,7 +90,7 @@ from marqo.tensor_search.models.delete_docs_objects import MqDeleteDocsRequest from marqo.tensor_search.models.private_models import ModelAuth from marqo.tensor_search.models.search import Qidx, JHash, SearchContext, VectorisedJobs, VectorisedJobPointer, \ - SearchContextTensor + SearchContextTensor, QueryContentCollector, QueryContent from marqo.tensor_search.telemetry import RequestMetricsStore from marqo.tensor_search.tensor_search_logging import get_logger from marqo.vespa.exceptions import VespaStatusError @@ -1465,7 +1465,7 @@ def search(config: Config, index_name: str, text: Optional[Union[str, dict, Cust reranker: Union[str, Dict] = None, filter: Optional[str] = None, attributes_to_retrieve: Optional[List[str]] = None, device: str = None, boost: Optional[Dict] = None, - image_download_headers: Optional[Dict] = None, + media_download_headers: Optional[Dict] = None, context: Optional[SearchContext] = None, score_modifiers: Optional[ScoreModifierLists] = None, model_auth: 
Optional[ModelAuth] = None, @@ -1493,7 +1493,7 @@ def search(config: Config, index_name: str, text: Optional[Union[str, dict, Cust device: May be none, we calculate default device here num_highlights: number of highlights to return for each doc boost: boosters to re-weight the scores of individual fields - image_download_headers: headers for downloading images + media_download_headers: headers to use when downloading media context: a dictionary to allow custom vectors in search, for tensor search only score_modifiers: a dictionary to modify the score based on field values, for tensor search only model_auth: Authorisation details for downloading a model (if required) @@ -1583,7 +1583,7 @@ def search(config: Config, index_name: str, text: Optional[Union[str, dict, Cust ef_search=ef_search, approximate=approximate, searchable_attributes=searchable_attributes, filter_string=filter, device=selected_device, attributes_to_retrieve=attributes_to_retrieve, boost=boost, - image_download_headers=image_download_headers, context=context, score_modifiers=score_modifiers, + media_download_headers=media_download_headers, context=context, score_modifiers=score_modifiers, model_auth=model_auth, highlights=highlights, text_query_prefix=text_query_prefix ) elif search_method.upper() == SearchMethod.HYBRID: @@ -1594,7 +1594,7 @@ def search(config: Config, index_name: str, text: Optional[Union[str, dict, Cust ef_search=ef_search, approximate=approximate, searchable_attributes=searchable_attributes, filter_string=filter, device=selected_device, attributes_to_retrieve=attributes_to_retrieve, boost=boost, - image_download_headers=image_download_headers, context=context, score_modifiers=score_modifiers, + media_download_headers=media_download_headers, context=context, score_modifiers=score_modifiers, model_auth=model_auth, highlights=highlights, text_query_prefix=text_query_prefix, hybrid_parameters=hybrid_parameters ) @@ -1735,37 +1735,39 @@ def _lexical_search( return gathered_docs -def construct_vector_input_batches(query: Optional[Union[str, Dict]], index_info: MarqoIndex) \ - -> Tuple[List[str], List[str]]: +def construct_vector_input_batches(query: Optional[Union[str, Dict]], media_download_headers: Optional[Dict] = None) \ + -> QueryContentCollector: """Splits images from text in a single query (either a query string, or dict of weighted strings). Args: query: a string query, or a dict of weighted strings. - index_info: used to determine whether URLs should be treated as images + media_download_headers: headers to use when downloading media Returns: - A tuple of string batches. The first is text content the second is image content. + A SearchQueryCollector object with the text and media content separated. 
""" # TODO - infer this from model - treat_urls_as_media = True - + query_content_list = [] if isinstance(query, str): - if treat_urls_as_media and validate_url(query): - return [], [query, ] - else: - return [query, ], [] + query_content_list.append( + QueryContent( + content=query, + modality=infer_modality(query, media_download_headers=media_download_headers) + ) + ) elif isinstance(query, dict): # is dict: - ordered_queries = list(query.items()) - if treat_urls_as_media: - text_queries = [k for k, _ in ordered_queries if not _is_image(k)] - image_queries = [k for k, _ in ordered_queries if _is_image(k)] - return text_queries, image_queries - else: - return [k for k, _ in ordered_queries], [] + for query, weights in query.items(): + query_content_list.append( + QueryContent( + content=query, + modality=infer_modality(query, media_download_headers=media_download_headers) + ) + ) elif query is None: - return [], [] + pass else: raise ValueError(f"Incorrect type for query: {type(query).__name__}") + return QueryContentCollector(queries = query_content_list) def gather_documents_from_response(response: QueryResult, marqo_index: MarqoIndex, highlights: bool, @@ -1800,7 +1802,7 @@ def unstructured_index_attributes_to_retrieve(marqo_doc: Dict[str, Any], attribu def assign_query_to_vector_job( q: BulkSearchQueryEntity, jobs: Dict[JHash, VectorisedJobs], - grouped_content: Tuple[List[str], List[str], List[str], List[str]], + grouped_content: QueryContentCollector, index_info: MarqoIndex, device: str) -> List[VectorisedJobPointer]: """ For a individual query, assign its content (to be vectorised) to a vector job. If none exist with the correct @@ -1819,34 +1821,39 @@ def assign_query_to_vector_job( Returns: A list of pointers to the location in a vector job that will have its vectorised content. """ - if len(grouped_content) != 2: - raise RuntimeError( - "assign_query_to_vector_job() expects param `grouped_content` with 2 elems. Instead received" - f" `grouped_content` with {len(grouped_content)} elems") ptrs = [] - for i, content in enumerate(grouped_content): - content_type = ['text', 'media'][i] - vector_job = VectorisedJobs( - model_name=index_info.model.name, - model_properties=index_info.model.get_properties(), - content=content, - device=device, - normalize_embeddings=index_info.normalize_embeddings, - image_download_headers=q.image_download_headers, - content_type=content_type, - model_auth=q.modelAuth - ) - # If exists, add content to vector job. 
Otherwise create new - if jobs.get(vector_job.groupby_key()) is not None: - j = jobs.get(vector_job.groupby_key()) - ptrs.append(j.add_content(content)) - else: - jobs[vector_job.groupby_key()] = vector_job - ptrs.append(VectorisedJobPointer( - job_hash=vector_job.groupby_key(), - start_idx=0, - end_idx=len(vector_job.content) - )) + content_lists_by_modality = [ + grouped_content.text_queries, + grouped_content.image_queries, + grouped_content.audio_queries, + grouped_content.video_queries, + ] + + for i, list_of_queries_by_modalities in enumerate(content_lists_by_modality): + if len(list_of_queries_by_modalities) > 0: + content: List[str] = [query.content for query in list_of_queries_by_modalities] + modality: Modality = list_of_queries_by_modalities[0].modality + vector_job = VectorisedJobs( + model_name=index_info.model.name, + model_properties=index_info.model.get_properties(), + content=content, + device=device, + normalize_embeddings=index_info.normalize_embeddings, + media_download_headers=q.mediaDownloadHeaders, + model_auth=q.modelAuth, + modality = modality + ) + # If exists, add content to vector job. Otherwise create new + if jobs.get(vector_job.groupby_key()) is not None: + j = jobs.get(vector_job.groupby_key()) + ptrs.append(j.add_content(content)) + else: + jobs[vector_job.groupby_key()] = vector_job + ptrs.append(VectorisedJobPointer( + job_hash=vector_job.groupby_key(), + start_idx=0, + end_idx=len(vector_job.content) + )) return ptrs @@ -1865,9 +1872,8 @@ def create_vector_jobs(queries: List[BulkSearchQueryEntity], config: Config, dev qidx_to_job: Dict[Qidx, List[VectorisedJobPointer]] = dict() jobs: Dict[JHash, VectorisedJobs] = {} for i, q in enumerate(queries): - q = queries[i] # split images, from text: - to_be_vectorised: Tuple[List[str], List[str]] = construct_vector_input_batches(q.q, q.index) + to_be_vectorised: QueryContentCollector = construct_vector_input_batches(q.q, q.mediaDownloadHeaders) qidx_to_job[i] = assign_query_to_vector_job(q, jobs, to_be_vectorised, q.index, device) return qidx_to_job, jobs @@ -1882,12 +1888,13 @@ def vectorise_jobs(jobs: List[VectorisedJobs]) -> Dict[JHash, Dict[str, List[flo # TODO: Handle exception for single job, and allow others to run. try: if v.content: - modality = infer_modality(v.content[0] if isinstance(v.content, list) else v.content) + modality = infer_modality(v.content[0] if isinstance(v.content, list) else v.content, + media_download_headers=v.media_download_headers) vectors = s2_inference.vectorise( model_name=v.model_name, model_properties=v.model_properties, content=v.content, device=v.device, normalize_embeddings=v.normalize_embeddings, - image_download_headers=v.image_download_headers, + media_download_headers=v.media_download_headers, model_auth=v.model_auth, enable_cache=True, modality=modality @@ -1940,11 +1947,13 @@ def get_query_vectors_from_jobs( if ordered_queries: # multiple queries. 
We have to weight and combine them: vectorised_ordered_queries = [ - (get_content_vector( + ( + get_content_vector( possible_jobs=qidx_to_job[qidx], jobs=jobs, job_to_vectors=job_to_vectors, - content=content), + content=content + ), weight, content ) for content, weight in ordered_queries @@ -1999,15 +2008,12 @@ def get_content_vector(possible_jobs: List[VectorisedJobPointer], job_to_vectors Raises runtime error if is not found """ - content_type = 'text' if infer_modality(content) == Modality.TEXT else 'media' - not_found_error = RuntimeError(f"get_content_vector(): could not find corresponding vector for content `{content}`") for vec_job_pointer in possible_jobs: - if jobs[vec_job_pointer.job_hash].content_type == content_type: - try: - return job_to_vectors[vec_job_pointer.job_hash][content] - except KeyError: - raise not_found_error + try: + return job_to_vectors[vec_job_pointer.job_hash][content] + except KeyError: + raise not_found_error raise not_found_error @@ -2019,19 +2025,20 @@ def add_prefix_to_queries(queries: List[BulkSearchQueryEntity]) -> List[BulkSear if q.q is None: prefixed_q = q.q elif isinstance(q.q, str): - if _is_image(q.q): - prefixed_q = q.q - else: + modality = infer_modality(q.q, q.mediaDownloadHeaders) + if modality == Modality.TEXT: prefixed_q = f"{text_query_prefix}{q.q}" + else: + prefixed_q = q.q else: # q.q is dict prefixed_q = {} for key, value in q.q.items(): # Apply prefix if key is not an image or if index does not treat URLs and pointers as images - if _is_image(key): - prefixed_q[key] = value + modality = infer_modality(key, q.mediaDownloadHeaders) + if modality == Modality.TEXT: + prefixed_q[key] = f"{text_query_prefix}{value}" else: - prefixed_q[f"{text_query_prefix}{key}"] = value - + prefixed_q[key] = value new_query_object = BulkSearchQueryEntity( q=prefixed_q, searchableAttributes=q.searchableAttributes, @@ -2042,7 +2049,7 @@ def add_prefix_to_queries(queries: List[BulkSearchQueryEntity]) -> List[BulkSear filter=q.filter, attributesToRetrieve=q.attributesToRetrieve, boost=q.boost, - image_download_headers=q.image_download_headers, + mediaDownloadHeaders=q.mediaDownloadHeaders, context=q.context, scoreModifiers=q.scoreModifiers, index=q.index, @@ -2087,7 +2094,7 @@ def _vector_text_search( ef_search: Optional[int] = None, approximate: bool = True, searchable_attributes: Iterable[str] = None, filter_string: str = None, device: str = None, attributes_to_retrieve: Optional[List[str]] = None, boost: Optional[Dict] = None, - image_download_headers: Optional[Dict] = None, context: Optional[SearchContext] = None, + media_download_headers: Optional[Dict] = None, context: Optional[SearchContext] = None, score_modifiers: Optional[ScoreModifierLists] = None, model_auth: Optional[ModelAuth] = None, highlights: bool = False, text_query_prefix: Optional[str] = None) -> Dict: """ @@ -2104,7 +2111,7 @@ def _vector_text_search( verbose: if 0 - nothing is printed. 
if 1 - data is printed without vectors, if 2 - full objects are printed out attributes_to_retrieve: if set, only returns these fields - image_download_headers: headers for downloading images + media_download_headers: headers for downloading media context: a dictionary to allow custom vectors in search score_modifiers: a dictionary to modify the score based on field values, for tensor search only model_auth: Authorisation details for downloading a model (if required) @@ -2153,7 +2160,7 @@ def _vector_text_search( queries = [BulkSearchQueryEntity( q=query, searchableAttributes=searchable_attributes, searchMethod=SearchMethod.TENSOR, limit=result_count, offset=offset, showHighlights=False, filter=filter_string, attributesToRetrieve=attributes_to_retrieve, - boost=boost, image_download_headers=image_download_headers, context=context, scoreModifiers=score_modifiers, + boost=boost, mediaDownloadHeaders=media_download_headers, context=context, scoreModifiers=score_modifiers, index=marqo_index, modelAuth=model_auth, text_query_prefix=text_query_prefix )] From 574e61d3642a24a6f5e00cee32af7bf234d75aeb Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 10:34:33 +1100 Subject: [PATCH 03/29] Finish development --- src/marqo/api/models/embed_request.py | 28 +++++++++++++++++-- src/marqo/core/embed/embed.py | 13 +++++---- .../embedding_models/abstract_clip_model.py | 2 +- src/marqo/s2_inference/clip_utils.py | 2 +- .../s2_inference/multimodal_model_load.py | 20 ++++++------- src/marqo/s2_inference/onnx_clip_utils.py | 2 +- src/marqo/tensor_search/api.py | 2 +- 7 files changed, 46 insertions(+), 23 deletions(-) diff --git a/src/marqo/api/models/embed_request.py b/src/marqo/api/models/embed_request.py index 9ca47422e..ff16f6a3a 100644 --- a/src/marqo/api/models/embed_request.py +++ b/src/marqo/api/models/embed_request.py @@ -6,6 +6,8 @@ import pydantic from typing import Union, List, Dict, Optional, Any +from pydantic import Field, root_validator + from marqo.tensor_search.models.private_models import ModelAuth from marqo.tensor_search.models.api_models import BaseMarqoModel from marqo.core.embed.embed import EmbedContentType @@ -15,9 +17,10 @@ class EmbedRequest(BaseMarqoModel): # content can be a single query or list of queries. Queries can be a string or a dictionary. content: Union[str, Dict[str, float], List[Union[str, Dict[str, float]]]] - image_download_headers: Optional[Dict] = None + image_download_headers: Optional[Dict] = Field(default=None, alias="imageDownloadHeaders") + mediaDownloadHeaders: Optional[Dict] = Field(default=None, alias="mediaDownloadHeaders") modelAuth: Optional[ModelAuth] = None - content_type: Optional[EmbedContentType] = EmbedContentType.Query + content_type: Optional[EmbedContentType] = Field(EmbedContentType.Query, alias=("contentType")) @pydantic.validator('content') def validate_content(cls, value): @@ -47,4 +50,23 @@ def validate_content(cls, value): else: raise ValueError("Embed content should be a string, a dictionary, or a list of strings or dictionaries") - return value \ No newline at end of file + return value + + @root_validator(skip_on_failure=True) + def _validate_image_download_headers_and_media_download_headers(cls, values): + """Validate imageDownloadHeaders and mediaDownloadHeaders. Raise an error if both are set. + + If imageDownloadHeaders is set, set mediaDownloadHeaders to it and use mediaDownloadHeaders in the + rest of the code. + + imageDownloadHeaders is deprecated and will be removed in the future. 
+ """ + image_download_headers = values.get('imageDownloadHeaders') + media_download_headers = values.get('mediaDownloadHeaders') + if image_download_headers and media_download_headers: + raise ValueError("Cannot set both imageDownloadHeaders and mediaDownloadHeaders. " + "The imageDownloadHeaders is deprecated and will be removed in the future. " + "Use mediaDownloadHeaders instead.") + if image_download_headers: + values['mediaDownloadHeaders'] = image_download_headers + return values \ No newline at end of file diff --git a/src/marqo/core/embed/embed.py b/src/marqo/core/embed/embed.py index 29d6fcf54..4730ddcd5 100644 --- a/src/marqo/core/embed/embed.py +++ b/src/marqo/core/embed/embed.py @@ -34,11 +34,12 @@ def validate_default_device(cls, value): return value def embed_content( - self, content: Union[str, Dict[str, float], List[Union[str, Dict[str, float]]]], - index_name: str, device: str = None, image_download_headers: Optional[Dict] = None, - model_auth: Optional[ModelAuth] = None, - content_type: Optional[EmbedContentType] = EmbedContentType.Query - ) -> Dict: + self, content: Union[str, Dict[str, float], List[Union[str, Dict[str, float]]]], + index_name: str, device: str = None, + media_download_headers: Optional[Dict] = None, + model_auth: Optional[ModelAuth] = None, + content_type: Optional[EmbedContentType] = EmbedContentType.Query + ) -> Dict: """ Use the index's model to embed the content @@ -105,7 +106,7 @@ def embed_content( BulkSearchQueryEntity( q=content_entry, index=marqo_index, - image_download_headers=image_download_headers, + mediaDownloadHeaders=media_download_headers, modelAuth=model_auth, text_query_prefix=prefix # TODO: Check if it's fine that we leave out the other parameters diff --git a/src/marqo/core/inference/embedding_models/abstract_clip_model.py b/src/marqo/core/inference/embedding_models/abstract_clip_model.py index 1b2a33b23..b89728a5d 100644 --- a/src/marqo/core/inference/embedding_models/abstract_clip_model.py +++ b/src/marqo/core/inference/embedding_models/abstract_clip_model.py @@ -68,7 +68,7 @@ def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], if is_image: logger.debug('image') - image_download_headers = kwargs.get("image_download_headers", dict()) + image_download_headers = kwargs.get("media_download_headers", dict()) return self.encode_image(inputs, normalize=normalize, image_download_headers=image_download_headers) else: logger.debug('text') diff --git a/src/marqo/s2_inference/clip_utils.py b/src/marqo/s2_inference/clip_utils.py index 342e6d849..f4f7acde1 100644 --- a/src/marqo/s2_inference/clip_utils.py +++ b/src/marqo/s2_inference/clip_utils.py @@ -485,7 +485,7 @@ def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], if is_image: logger.debug('image') - image_download_headers = kwargs.get("image_download_headers", dict()) + image_download_headers = kwargs.get("media_download_headers", dict()) return self.encode_image(inputs, normalize=normalize, image_download_headers=image_download_headers) else: logger.debug('text') diff --git a/src/marqo/s2_inference/multimodal_model_load.py b/src/marqo/s2_inference/multimodal_model_load.py index 0c7ed9431..e57f74c6a 100644 --- a/src/marqo/s2_inference/multimodal_model_load.py +++ b/src/marqo/s2_inference/multimodal_model_load.py @@ -110,15 +110,15 @@ def preprocessor(self, modality): raise ValueError("Model has not been loaded yet. 
Call _load_model() first.") return self.encoder.preprocessor(modality) - def encode(self, content, modality, **kwargs): + def encode(self, content, modality, media_download_headers: Optional[Dict]=None, **kwargs): if self.encoder is None: raise ValueError("Model has not been loaded yet. Call _load_model() first.") - return self.encoder.encode(content, modality, **kwargs) + return self.encoder.encode(content, modality, media_download_headers, **kwargs) class ModelEncoder(ABC): @abstractmethod - def encode(self, content, modality, **kwargs): + def encode(self, content, modality, media_download_headers, **kwargs): pass @@ -126,8 +126,8 @@ class DefaultEncoder(ModelEncoder): def __init__(self, model): self.model = model - def encode(self, content, modality, **kwargs): - return self.model.encode(content, **kwargs) + def encode(self, content, modality, media_download_headers, **kwargs): + return self.model.encode(content, media_download_headers, **kwargs) @contextmanager @@ -251,7 +251,7 @@ def preprocessor(self, modality): return self._preprocessors.get(modality) - def encode(self, content, modality, normalize=True, **kwargs): + def encode(self, content, modality, normalize=True, media_download_headers: Optional[Dict]=None, **kwargs): inputs = {} if modality == Modality.TEXT: @@ -269,7 +269,7 @@ def encode(self, content, modality, normalize=True, **kwargs): with open(temp_filename, 'wb') as f: f.write(content) elif isinstance(content, str) and "http" in content: - self._download_content(content, temp_filename) + self._download_content(content, temp_filename, media_download_headers) else: return self.encode([content], modality=Modality.TEXT) @@ -280,7 +280,7 @@ def encode(self, content, modality, normalize=True, **kwargs): if isinstance(content, str) and "http" in content: suffix = ".mp4" if modality == Modality.VIDEO else ".wav" with self._temp_file(suffix) as temp_filename: - self._download_content(content, temp_filename) + self._download_content(content, temp_filename, media_download_headers) preprocessed_content = self.preprocessor(modality)([temp_filename], return_tensors='pt') inputs[modality.value] = to_device(preprocessed_content, self.model.device)['pixel_values'] @@ -302,11 +302,11 @@ def encode(self, content, modality, normalize=True, **kwargs): return embeddings.cpu().numpy() - def _download_content(self, url, filename): + def _download_content(self, url, filename, media_download_headers: Optional[Dict]=None): # 3 seconds for images, 20 seconds for audio and video timeout_ms = 3000 if filename.endswith(('.png', '.jpg', '.jpeg')) else 20000 - buffer = download_image_from_url(url, {}, timeout_ms) + buffer = download_image_from_url(url, media_download_headers, timeout_ms) with open(filename, 'wb') as f: f.write(buffer.getvalue()) diff --git a/src/marqo/s2_inference/onnx_clip_utils.py b/src/marqo/s2_inference/onnx_clip_utils.py index 31da79185..a9a6ee338 100644 --- a/src/marqo/s2_inference/onnx_clip_utils.py +++ b/src/marqo/s2_inference/onnx_clip_utils.py @@ -167,7 +167,7 @@ def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], raise ValueError(f"expected default='image' or default='text' but received {default}") if is_image: - logger.debug('image') + logger.debug('image'), return self.encode_image(inputs, normalize=True) else: logger.debug('text') diff --git a/src/marqo/tensor_search/api.py b/src/marqo/tensor_search/api.py index 18d61776e..8386b8382 100644 --- a/src/marqo/tensor_search/api.py +++ b/src/marqo/tensor_search/api.py @@ -334,7 +334,7 @@ def 
embed(embedding_request: EmbedRequest, index_name: str, device: str = Depend return marqo_config.embed.embed_content( content=embedding_request.content, index_name=index_name, device=device, - image_download_headers=embedding_request.image_download_headers, + media_download_headers=embedding_request.mediaDownloadHeaders, model_auth=embedding_request.modelAuth, content_type=embedding_request.content_type ) From dfbe81c5062f7524f9ffeb832db429610216efbd Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 11:04:42 +1100 Subject: [PATCH 04/29] Fix more than 2 modalities bugs in search --- src/marqo/tensor_search/tensor_search.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index 3addc0e62..0aaaf61ed 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -1888,8 +1888,10 @@ def vectorise_jobs(jobs: List[VectorisedJobs]) -> Dict[JHash, Dict[str, List[flo # TODO: Handle exception for single job, and allow others to run. try: if v.content: - modality = infer_modality(v.content[0] if isinstance(v.content, list) else v.content, - media_download_headers=v.media_download_headers) + modality = infer_modality( + v.content[0] if isinstance(v.content, list) else v.content, + media_download_headers=v.media_download_headers + ) vectors = s2_inference.vectorise( model_name=v.model_name, model_properties=v.model_properties, content=v.content, device=v.device, @@ -1950,7 +1952,6 @@ def get_query_vectors_from_jobs( ( get_content_vector( possible_jobs=qidx_to_job[qidx], - jobs=jobs, job_to_vectors=job_to_vectors, content=content ), @@ -1984,7 +1985,6 @@ def get_query_vectors_from_jobs( # result[qidx] = vectors[0] result[qidx] = get_content_vector( possible_jobs=qidx_to_job.get(qidx, []), - jobs=jobs, job_to_vectors=job_to_vectors, content=q.q ) @@ -1993,14 +1993,16 @@ def get_query_vectors_from_jobs( return result -def get_content_vector(possible_jobs: List[VectorisedJobPointer], job_to_vectors: Dict[JHash, Dict[str, List[float]]], - jobs: Dict[JHash, VectorisedJobs], content: str) -> List[float]: +def get_content_vector( + possible_jobs: List[VectorisedJobPointer], + job_to_vectors: Dict[JHash, Dict[str, List[float]]], + content: str +) -> List[float]: """finds the vector associated with a piece of content Args: possible_jobs: The jobs where the target vector may reside - treat_urls_as_media: an index_parameter that indicates whether content should be treated as image, audio, video - if it has a URL structure + job_to_vectors: The mapping of job to vectors content: The content to search Returns: @@ -2010,10 +2012,8 @@ def get_content_vector(possible_jobs: List[VectorisedJobPointer], job_to_vectors """ not_found_error = RuntimeError(f"get_content_vector(): could not find corresponding vector for content `{content}`") for vec_job_pointer in possible_jobs: - try: + if content in job_to_vectors[vec_job_pointer.job_hash]: return job_to_vectors[vec_job_pointer.job_hash][content] - except KeyError: - raise not_found_error raise not_found_error From 3cc7a2f1d596f02e92b9063f904ecde6951821b1 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 12:32:51 +1100 Subject: [PATCH 05/29] Need to fix infer issue --- .../embedding_models/abstract_clip_model.py | 4 +-- src/marqo/s2_inference/clip_utils.py | 7 ++-- .../s2_inference/multimodal_model_load.py | 2 +- src/marqo/s2_inference/s2_inference.py | 2 +- 
src/marqo/tensor_search/add_docs.py | 32 +++++++++---------- src/marqo/tensor_search/tensor_search.py | 4 +-- 6 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/marqo/core/inference/embedding_models/abstract_clip_model.py b/src/marqo/core/inference/embedding_models/abstract_clip_model.py index b89728a5d..43eb3a849 100644 --- a/src/marqo/core/inference/embedding_models/abstract_clip_model.py +++ b/src/marqo/core/inference/embedding_models/abstract_clip_model.py @@ -53,8 +53,8 @@ def encode_text(self, inputs: Union[str, List[str]], normalize: bool = True) -> def encode_image(self, inputs, normalize: bool = True, image_download_headers: dict = None) -> np.ndarray: pass - def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], - default: str = 'text', normalize=True, **kwargs) -> np.ndarray: + def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], normalize=True, **kwargs) -> np.ndarray: + default = "text" infer = kwargs.pop('infer', True) if infer and _is_image(inputs): is_image = True diff --git a/src/marqo/s2_inference/clip_utils.py b/src/marqo/s2_inference/clip_utils.py index f4f7acde1..200b795ab 100644 --- a/src/marqo/s2_inference/clip_utils.py +++ b/src/marqo/s2_inference/clip_utils.py @@ -177,6 +177,8 @@ def download_image_from_url(image_path: str, image_download_headers: dict, timeo c.setopt(pycurl.FOLLOWLOCATION, 1) headers = DEFAULT_HEADERS.copy() + if image_download_headers is None: + image_download_headers = dict() headers.update(image_download_headers) c.setopt(pycurl.HTTPHEADER, [f"{k}: {v}" for k, v in headers.items()]) @@ -467,9 +469,8 @@ def encode_image(self, images: Union[str, ImageType, List[Union[str, ImageType, assert outputs.shape == _shape_before return self._convert_output(outputs) - def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], - default: str = 'text', normalize=True, **kwargs) -> FloatTensor: - + def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], normalize=True, **kwargs) -> FloatTensor: + default = "text" infer = kwargs.pop('infer', True) if infer and _is_image(inputs): diff --git a/src/marqo/s2_inference/multimodal_model_load.py b/src/marqo/s2_inference/multimodal_model_load.py index e57f74c6a..61e4992c6 100644 --- a/src/marqo/s2_inference/multimodal_model_load.py +++ b/src/marqo/s2_inference/multimodal_model_load.py @@ -127,7 +127,7 @@ def __init__(self, model): self.model = model def encode(self, content, modality, media_download_headers, **kwargs): - return self.model.encode(content, media_download_headers, **kwargs) + return self.model.encode(content, modality=modality, media_download_headers=media_download_headers, **kwargs) @contextmanager diff --git a/src/marqo/s2_inference/s2_inference.py b/src/marqo/s2_inference/s2_inference.py index 874565efa..a5d92fb34 100644 --- a/src/marqo/s2_inference/s2_inference.py +++ b/src/marqo/s2_inference/s2_inference.py @@ -53,7 +53,7 @@ def vectorise(model_name: str, content: Union[str, List[str], List[Image], List[ enable_cache: bool = False, modality: Modality = Modality.TEXT, media_download_headers: Optional[Dict] = None, - infer: bool = False + infer: bool = True ) -> List[List[float]]: """Vectorise the given content using the given model. 
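The hunk above flips the default of `infer` to True; together with the explicit `modality` and `media_download_headers` parameters given to `vectorise` in patch 02, a call at this point in the series would look roughly like the sketch below. It mirrors the call made in `vectorise_jobs`, but the model name, URL, and header value are placeholders, and it assumes the model name is resolvable from Marqo's model registry (note that patch 06 later revisits this parameter passing).

# Illustrative sketch only; not part of the patch.
from marqo.s2_inference.s2_inference import vectorise
from marqo.s2_inference.multimodal_model_load import infer_modality

headers = {"Authorization": "Bearer <token>"}        # placeholder auth header
image_url = "https://example.com/assets/photo.jpg"   # placeholder URL

# The modality is inferred with the same headers that will authenticate the download.
modality = infer_modality(image_url, media_download_headers=headers)

embeddings = vectorise(
    model_name="open_clip/ViT-B-32/laion2b_s34b_b79k",  # placeholder registry model name
    content=[image_url],
    device="cpu",                    # vectorise raises InternalError if device is unset
    normalize_embeddings=True,
    modality=modality,
    media_download_headers=headers,  # forwarded to the media download request
)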
diff --git a/src/marqo/tensor_search/add_docs.py b/src/marqo/tensor_search/add_docs.py index c87a7c78f..1643c432d 100644 --- a/src/marqo/tensor_search/add_docs.py +++ b/src/marqo/tensor_search/add_docs.py @@ -39,7 +39,6 @@ def threaded_download_and_preprocess_content(allocated_docs: List[dict], media_repo: dict, tensor_fields: List[str], - image_download_headers: dict, device: str = None, media_field_types_mapping: Optional[Dict[str, FieldType]] = None, media_download_headers: Optional[Dict] = None, @@ -118,7 +117,7 @@ def threaded_download_and_preprocess_content(allocated_docs: List[dict], continue try: - media_repo[doc[field]] = clip_utils.load_image_from_path(doc[field], image_download_headers, + media_repo[doc[field]] = clip_utils.load_image_from_path(doc[field], media_download_headers, timeout_ms=int( TIMEOUT_SECONDS * 1000), metrics_obj=metric_obj) @@ -166,7 +165,7 @@ def threaded_download_and_preprocess_content(allocated_docs: List[dict], continue try: - processed_chunks = download_and_chunk_media(doc[field], device, download_headers, inferred_modality, + processed_chunks = download_and_chunk_media(doc[field], device, media_download_headers, inferred_modality, marqo_index_type, marqo_index_model, preprocessors, audio_preprocessing, video_preprocessing) media_repo[doc[field]] = processed_chunks @@ -188,7 +187,7 @@ def threaded_download_and_preprocess_content(allocated_docs: List[dict], try: media_repo[sub_field] = clip_utils.load_image_from_path( sub_field, - image_download_headers, + media_download_headers, timeout=TIMEOUT_SECONDS, metrics_obj=metric_obj ) @@ -289,11 +288,10 @@ def _determine_thread_count(marqo_index: MarqoIndex, add_docs_params: AddDocsPar @contextmanager def download_and_preprocess_content(docs: List[dict], thread_count: int, tensor_fields: List[str], - image_download_headers: dict, model_name: str, normalize_embeddings: bool, media_field_types_mapping: Optional[Dict[str, FieldType]], - media_download_headers: Optional[Dict] = None, # Optional for now + media_download_headers: Optional[Dict] = None, model_properties: Optional[Dict] = None, model_auth: Optional[ModelAuth] = None, device: Optional[str] = None, @@ -309,7 +307,6 @@ def download_and_preprocess_content(docs: List[dict], thread_count: int, tensor_ docs = docs, thread_count = thread_count, tensor_fields = tensor_fields, - image_download_headers = image_download_headers, model_name = model_name, normalize_embeddings = normalize_embeddings, force_download = force_download, @@ -336,15 +333,17 @@ def download_and_preprocess_content(docs: List[dict], thread_count: int, tensor_ pass -def process_batch(docs: List[dict], thread_count: int, tensor_fields: List[str], - image_download_headers: dict, model_name: str, normalize_embeddings: bool, - force_download: bool, media_field_types_mapping: Optional[Dict[str, FieldType]], - model_properties: Optional[Dict], - model_auth: Optional[ModelAuth], device: Optional[str], - patch_method_exists: bool, marqo_index_type: Optional[IndexType], marqo_index_model: Optional[Model], - media_download_headers: Optional[Dict] = None, - audio_preprocessing: Optional[AudioPreProcessing] = None, - video_preprocessing: Optional[VideoPreProcessing] = None) -> dict: +def process_batch( + docs: List[dict], thread_count: int, tensor_fields: List[str], + model_name: str, normalize_embeddings: bool, + force_download: bool, media_field_types_mapping: Optional[Dict[str, FieldType]], + model_properties: Optional[Dict], + model_auth: Optional[ModelAuth], device: Optional[str], + 
patch_method_exists: bool, marqo_index_type: Optional[IndexType], marqo_index_model: Optional[Model], + media_download_headers: Optional[Dict] = None, + audio_preprocessing: Optional[AudioPreProcessing] = None, + video_preprocessing: Optional[VideoPreProcessing] = None +) -> dict: docs_per_thread = math.ceil(len(docs) / thread_count) copied = copy.deepcopy(docs) @@ -373,7 +372,6 @@ def process_batch(docs: List[dict], thread_count: int, tensor_fields: List[str], allocation, media_repo, tensor_fields, - image_download_headers, device, media_field_types_mapping, media_download_headers, diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index 0aaaf61ed..654f3a292 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -2372,7 +2372,7 @@ def vectorise_multimodal_combination_field_unstructured(field: str, model_name=marqo_index.model.name, model_properties=marqo_index.model.properties, content=prefixed_text_content_to_vectorise, device=device, normalize_embeddings=normalize_embeddings, - infer=False, model_auth=model_auth, modality=Modality.TEXT + infer=True, model_auth=model_auth, modality=Modality.TEXT ) vectors_list.extend(text_vectors) @@ -2596,7 +2596,7 @@ def vectorise_multimodal_combination_field_structured( content=prefixed_text_content, device=device, normalize_embeddings=normalize_embeddings, - infer=False, + infer=True, model_auth=model_auth, modality=Modality.TEXT ) From aa2b1d64bdc34fc6e4a3f00401fcf112cf07f961 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 12:49:13 +1100 Subject: [PATCH 06/29] Revert changes in src/marqo/s2_inference/ and reconsider parameters passing --- src/marqo/s2_inference/clip_utils.py | 9 +- .../s2_inference/multimodal_model_load.py | 32 ++++--- src/marqo/s2_inference/onnx_clip_utils.py | 2 +- src/marqo/s2_inference/s2_inference.py | 84 +++++-------------- 4 files changed, 39 insertions(+), 88 deletions(-) diff --git a/src/marqo/s2_inference/clip_utils.py b/src/marqo/s2_inference/clip_utils.py index 200b795ab..342e6d849 100644 --- a/src/marqo/s2_inference/clip_utils.py +++ b/src/marqo/s2_inference/clip_utils.py @@ -177,8 +177,6 @@ def download_image_from_url(image_path: str, image_download_headers: dict, timeo c.setopt(pycurl.FOLLOWLOCATION, 1) headers = DEFAULT_HEADERS.copy() - if image_download_headers is None: - image_download_headers = dict() headers.update(image_download_headers) c.setopt(pycurl.HTTPHEADER, [f"{k}: {v}" for k, v in headers.items()]) @@ -469,8 +467,9 @@ def encode_image(self, images: Union[str, ImageType, List[Union[str, ImageType, assert outputs.shape == _shape_before return self._convert_output(outputs) - def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], normalize=True, **kwargs) -> FloatTensor: - default = "text" + def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], + default: str = 'text', normalize=True, **kwargs) -> FloatTensor: + infer = kwargs.pop('infer', True) if infer and _is_image(inputs): @@ -486,7 +485,7 @@ def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], nor if is_image: logger.debug('image') - image_download_headers = kwargs.get("media_download_headers", dict()) + image_download_headers = kwargs.get("image_download_headers", dict()) return self.encode_image(inputs, normalize=normalize, image_download_headers=image_download_headers) else: logger.debug('text') diff --git a/src/marqo/s2_inference/multimodal_model_load.py 
b/src/marqo/s2_inference/multimodal_model_load.py index 61e4992c6..173630c22 100644 --- a/src/marqo/s2_inference/multimodal_model_load.py +++ b/src/marqo/s2_inference/multimodal_model_load.py @@ -11,11 +11,10 @@ from pydantic import BaseModel from enum import Enum from abc import ABC, abstractmethod -from typing import List, Dict, Any, Union, Optional +from typing import List, Dict, Any, Union from PIL.Image import Image import torch from urllib.parse import quote -from marqo.core.inference.image_download import DEFAULT_HEADERS from marqo.s2_inference.multimodal_model_load import * @@ -110,15 +109,15 @@ def preprocessor(self, modality): raise ValueError("Model has not been loaded yet. Call _load_model() first.") return self.encoder.preprocessor(modality) - def encode(self, content, modality, media_download_headers: Optional[Dict]=None, **kwargs): + def encode(self, content, modality, **kwargs): if self.encoder is None: raise ValueError("Model has not been loaded yet. Call _load_model() first.") - return self.encoder.encode(content, modality, media_download_headers, **kwargs) + return self.encoder.encode(content, modality, **kwargs) class ModelEncoder(ABC): @abstractmethod - def encode(self, content, modality, media_download_headers, **kwargs): + def encode(self, content, modality, **kwargs): pass @@ -126,14 +125,13 @@ class DefaultEncoder(ModelEncoder): def __init__(self, model): self.model = model - def encode(self, content, modality, media_download_headers, **kwargs): - return self.model.encode(content, modality=modality, media_download_headers=media_download_headers, **kwargs) + def encode(self, content, modality, **kwargs): + return self.model.encode(content, **kwargs) @contextmanager -def fetch_content_sample(url, media_download_headers: Optional[dict] = None, sample_size=10240): # 10 KB - # It's ok to pass None to requests.get() for headers and it won't change the default headers - response = requests.get(url, stream=True, headers=media_download_headers) +def fetch_content_sample(url, sample_size=10240): # 10 KB + response = requests.get(url, stream=True) buffer = io.BytesIO() try: for chunk in response.iter_content(chunk_size=min(sample_size, 8192)): @@ -147,7 +145,7 @@ def fetch_content_sample(url, media_download_headers: Optional[dict] = None, sam response.close() -def infer_modality(content: Union[str, List[str], bytes], media_download_headers: Optional[dict] = None) -> Modality: +def infer_modality(content: Union[str, List[str], bytes]) -> Modality: """ Infer the modality of the content. Video, audio, image or text. 
""" @@ -169,7 +167,7 @@ def infer_modality(content: Union[str, List[str], bytes], media_download_headers if validate_url(encoded_url): # Use context manager to handle content sample try: - with fetch_content_sample(encoded_url, media_download_headers) as sample: + with fetch_content_sample(encoded_url) as sample: mime = magic.from_buffer(sample.read(), mime=True) if mime.startswith('image/'): return Modality.IMAGE @@ -251,7 +249,7 @@ def preprocessor(self, modality): return self._preprocessors.get(modality) - def encode(self, content, modality, normalize=True, media_download_headers: Optional[Dict]=None, **kwargs): + def encode(self, content, modality, normalize=True, **kwargs): inputs = {} if modality == Modality.TEXT: @@ -269,7 +267,7 @@ def encode(self, content, modality, normalize=True, media_download_headers: Opti with open(temp_filename, 'wb') as f: f.write(content) elif isinstance(content, str) and "http" in content: - self._download_content(content, temp_filename, media_download_headers) + self._download_content(content, temp_filename) else: return self.encode([content], modality=Modality.TEXT) @@ -280,7 +278,7 @@ def encode(self, content, modality, normalize=True, media_download_headers: Opti if isinstance(content, str) and "http" in content: suffix = ".mp4" if modality == Modality.VIDEO else ".wav" with self._temp_file(suffix) as temp_filename: - self._download_content(content, temp_filename, media_download_headers) + self._download_content(content, temp_filename) preprocessed_content = self.preprocessor(modality)([temp_filename], return_tensors='pt') inputs[modality.value] = to_device(preprocessed_content, self.model.device)['pixel_values'] @@ -302,11 +300,11 @@ def encode(self, content, modality, normalize=True, media_download_headers: Opti return embeddings.cpu().numpy() - def _download_content(self, url, filename, media_download_headers: Optional[Dict]=None): + def _download_content(self, url, filename): # 3 seconds for images, 20 seconds for audio and video timeout_ms = 3000 if filename.endswith(('.png', '.jpg', '.jpeg')) else 20000 - buffer = download_image_from_url(url, media_download_headers, timeout_ms) + buffer = download_image_from_url(url, {}, timeout_ms) with open(filename, 'wb') as f: f.write(buffer.getvalue()) diff --git a/src/marqo/s2_inference/onnx_clip_utils.py b/src/marqo/s2_inference/onnx_clip_utils.py index a9a6ee338..31da79185 100644 --- a/src/marqo/s2_inference/onnx_clip_utils.py +++ b/src/marqo/s2_inference/onnx_clip_utils.py @@ -167,7 +167,7 @@ def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], raise ValueError(f"expected default='image' or default='text' but received {default}") if is_image: - logger.debug('image'), + logger.debug('image') return self.encode_image(inputs, normalize=True) else: logger.debug('text') diff --git a/src/marqo/s2_inference/s2_inference.py b/src/marqo/s2_inference/s2_inference.py index a5d92fb34..fc97d5300 100644 --- a/src/marqo/s2_inference/s2_inference.py +++ b/src/marqo/s2_inference/s2_inference.py @@ -47,28 +47,8 @@ def vectorise(model_name: str, content: Union[str, List[str], List[Image], List[bytes]], model_properties: dict = None, - device: str = None, - normalize_embeddings: bool = get_default_normalization(), - model_auth: ModelAuth = None, - enable_cache: bool = False, - modality: Modality = Modality.TEXT, - media_download_headers: Optional[Dict] = None, - infer: bool = True - ) -> List[List[float]]: - """Vectorise the given content using the given model. 
- - Args: - model_name: The name of the model to use. - content: The content to vectorise. - model_properties: The properties of the model to use. - device: The device to use. - normalize_embeddings: Whether to normalize the embeddings. - model_auth: The model authorisation details. - enable_cache: Whether to enable the inference cache. - modality: The modality of the content. - media_download_headers: The media download headers. - infer: Whether to infer the modality. Deprecated and should be replaced by modality. - """ + device: str = None, normalize_embeddings: bool = get_default_normalization(), + model_auth: ModelAuth = None, enable_cache: bool = False, modality: Modality = Modality.TEXT, **kwargs,) -> List[List[float]]: if not device: raise InternalError(message=f"vectorise (internal function) cannot be called without setting device!") @@ -83,37 +63,25 @@ def vectorise(model_name: str, content: Union[str, List[str], List[Image], List[ model = _available_models[model_cache_key][AvailableModelsKey.model] if _marqo_inference_cache.is_enabled() and enable_cache: - return _vectorise_with_cache( - model, model_cache_key, content, normalize_embeddings, modality, - media_download_headers, infer - ) + return _vectorise_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs) else: - return _vectorise_without_cache( - model_cache_key, content, normalize_embeddings, modality, - media_download_headers, infer - ) + return _vectorise_without_cache(model_cache_key, content, normalize_embeddings, modality, **kwargs) -def _vectorise_with_cache(model, model_cache_key: str, content, normalize_embeddings: bool, modality: Modality, - media_download_headers: Optional[Dict], infer: bool): +def _vectorise_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs): if isinstance(content, str): vectorised = _marqo_inference_cache.get(model_cache_key, content) if vectorised is None: - vectorised = _encode_without_cache( - model_cache_key, content, normalize_embeddings, modality, media_download_headers, infer - ) + vectorised = _encode_without_cache(model_cache_key, content, normalize_embeddings, modality, **kwargs) _marqo_inference_cache.set(model_cache_key, content, vectorised[0]) else: vectorised = _convert_cached_embeddings_to_output(vectorised) return vectorised elif isinstance(content, list): - return _vectorise_list_with_cache( - model, model_cache_key, content, normalize_embeddings, modality, media_download_headers, infer - ) + return _vectorise_list_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs) else: raise TypeError(f"Unsupported content type: {type(content).__name__}") -def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embeddings, modality, - media_download_headers, infer): +def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs): contents_to_vectorise = [] cached_output = [] @@ -129,8 +97,7 @@ def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embedd contents_to_vectorise.append(content_item) if contents_to_vectorise: - vectorised_outputs = _encode_without_cache( - model_cache_key, contents_to_vectorise, normalize_embeddings, modality, media_download_headers, infer) + vectorised_outputs = _encode_without_cache(model_cache_key, contents_to_vectorise, normalize_embeddings, modality, **kwargs) # Cache the vectorised outputs for content_item, vectorised_output in zip(contents_to_vectorise, vectorised_outputs): if 
isinstance(content_item, str): @@ -143,32 +110,20 @@ def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embedd return vectorised_outputs +def _vectorise_without_cache(model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], + normalize_embeddings: bool, modality: Modality, **kwargs) -> List[List[float]]: + return _encode_without_cache(model_cache_key, content, normalize_embeddings, modality, **kwargs) -def _vectorise_without_cache( - model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], - normalize_embeddings: bool, modality: Modality, - media_download_headers: Optional[Dict], infer: bool -) -> List[List[float]]: - return _encode_without_cache(model_cache_key, content, normalize_embeddings, modality, media_download_headers, infer) - -def _encode_without_cache( - model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], - normalize_embeddings: bool, modality: Modality, media_download_headers: Optional[Dict], infer: bool) \ - -> List[List[float]]: +def _encode_without_cache(model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], + normalize_embeddings: bool, modality: Modality, **kwargs) -> List[List[float]]: try: model = _available_models[model_cache_key][AvailableModelsKey.model] encoder = get_encoder(model) if isinstance(content, str): - vectorised = model.encode( - content, normalize=normalize_embeddings, modality=modality, - media_download_headers=media_download_headers, infer=infer - ) + vectorised = model.encode(content, normalize=normalize_embeddings, modality=modality, **kwargs) elif isinstance(content, (torch.Tensor, torch.FloatTensor)): - vectorised = model.encode( - content, normalize=normalize_embeddings, modality=modality, - media_download_headers=media_download_headers, infer=infer - ) + vectorised = model.encode(content, normalize=normalize_embeddings, modality=modality, **kwargs) else: vector_batches = [] batch_size = _get_max_vectorise_batch_size() @@ -178,10 +133,9 @@ def _encode_without_cache( modality = infer_modality(batch[0] if isinstance(batch[0], (str, bytes)) else batch) # TODO maybe the infer parameter can be replaced by modality - encoded_batch = encoder.encode( - batch, modality=modality, normalize=normalize_embeddings, - infer=infer, media_download_headers=media_download_headers - ) + infer = kwargs.pop('infer', False if modality == Modality.TEXT else True) + encoded_batch = encoder.encode(batch, modality=modality, normalize=normalize_embeddings, + infer=infer, **kwargs) vector_batches.append(_convert_tensor_to_numpy(encoded_batch)) From 2dbdb009d895a6b759648a75e86fc1e83c5095af Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 14:13:29 +1100 Subject: [PATCH 07/29] Fix tests --- src/marqo/s2_inference/clip_utils.py | 4 +- .../s2_inference/multimodal_model_load.py | 32 +++++----- src/marqo/s2_inference/s2_inference.py | 58 +++++++++++++------ 3 files changed, 61 insertions(+), 33 deletions(-) diff --git a/src/marqo/s2_inference/clip_utils.py b/src/marqo/s2_inference/clip_utils.py index 342e6d849..ff787dd08 100644 --- a/src/marqo/s2_inference/clip_utils.py +++ b/src/marqo/s2_inference/clip_utils.py @@ -177,6 +177,8 @@ def download_image_from_url(image_path: str, image_download_headers: dict, timeo c.setopt(pycurl.FOLLOWLOCATION, 1) headers = DEFAULT_HEADERS.copy() + if image_download_headers is None: + image_download_headers = dict() headers.update(image_download_headers) c.setopt(pycurl.HTTPHEADER, [f"{k}: {v}" for k, v in 
headers.items()]) @@ -485,7 +487,7 @@ def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], if is_image: logger.debug('image') - image_download_headers = kwargs.get("image_download_headers", dict()) + image_download_headers = kwargs.get("media_download_headers", dict()) return self.encode_image(inputs, normalize=normalize, image_download_headers=image_download_headers) else: logger.debug('text') diff --git a/src/marqo/s2_inference/multimodal_model_load.py b/src/marqo/s2_inference/multimodal_model_load.py index 173630c22..61e4992c6 100644 --- a/src/marqo/s2_inference/multimodal_model_load.py +++ b/src/marqo/s2_inference/multimodal_model_load.py @@ -11,10 +11,11 @@ from pydantic import BaseModel from enum import Enum from abc import ABC, abstractmethod -from typing import List, Dict, Any, Union +from typing import List, Dict, Any, Union, Optional from PIL.Image import Image import torch from urllib.parse import quote +from marqo.core.inference.image_download import DEFAULT_HEADERS from marqo.s2_inference.multimodal_model_load import * @@ -109,15 +110,15 @@ def preprocessor(self, modality): raise ValueError("Model has not been loaded yet. Call _load_model() first.") return self.encoder.preprocessor(modality) - def encode(self, content, modality, **kwargs): + def encode(self, content, modality, media_download_headers: Optional[Dict]=None, **kwargs): if self.encoder is None: raise ValueError("Model has not been loaded yet. Call _load_model() first.") - return self.encoder.encode(content, modality, **kwargs) + return self.encoder.encode(content, modality, media_download_headers, **kwargs) class ModelEncoder(ABC): @abstractmethod - def encode(self, content, modality, **kwargs): + def encode(self, content, modality, media_download_headers, **kwargs): pass @@ -125,13 +126,14 @@ class DefaultEncoder(ModelEncoder): def __init__(self, model): self.model = model - def encode(self, content, modality, **kwargs): - return self.model.encode(content, **kwargs) + def encode(self, content, modality, media_download_headers, **kwargs): + return self.model.encode(content, modality=modality, media_download_headers=media_download_headers, **kwargs) @contextmanager -def fetch_content_sample(url, sample_size=10240): # 10 KB - response = requests.get(url, stream=True) +def fetch_content_sample(url, media_download_headers: Optional[dict] = None, sample_size=10240): # 10 KB + # It's ok to pass None to requests.get() for headers and it won't change the default headers + response = requests.get(url, stream=True, headers=media_download_headers) buffer = io.BytesIO() try: for chunk in response.iter_content(chunk_size=min(sample_size, 8192)): @@ -145,7 +147,7 @@ def fetch_content_sample(url, sample_size=10240): # 10 KB response.close() -def infer_modality(content: Union[str, List[str], bytes]) -> Modality: +def infer_modality(content: Union[str, List[str], bytes], media_download_headers: Optional[dict] = None) -> Modality: """ Infer the modality of the content. Video, audio, image or text. 
""" @@ -167,7 +169,7 @@ def infer_modality(content: Union[str, List[str], bytes]) -> Modality: if validate_url(encoded_url): # Use context manager to handle content sample try: - with fetch_content_sample(encoded_url) as sample: + with fetch_content_sample(encoded_url, media_download_headers) as sample: mime = magic.from_buffer(sample.read(), mime=True) if mime.startswith('image/'): return Modality.IMAGE @@ -249,7 +251,7 @@ def preprocessor(self, modality): return self._preprocessors.get(modality) - def encode(self, content, modality, normalize=True, **kwargs): + def encode(self, content, modality, normalize=True, media_download_headers: Optional[Dict]=None, **kwargs): inputs = {} if modality == Modality.TEXT: @@ -267,7 +269,7 @@ def encode(self, content, modality, normalize=True, **kwargs): with open(temp_filename, 'wb') as f: f.write(content) elif isinstance(content, str) and "http" in content: - self._download_content(content, temp_filename) + self._download_content(content, temp_filename, media_download_headers) else: return self.encode([content], modality=Modality.TEXT) @@ -278,7 +280,7 @@ def encode(self, content, modality, normalize=True, **kwargs): if isinstance(content, str) and "http" in content: suffix = ".mp4" if modality == Modality.VIDEO else ".wav" with self._temp_file(suffix) as temp_filename: - self._download_content(content, temp_filename) + self._download_content(content, temp_filename, media_download_headers) preprocessed_content = self.preprocessor(modality)([temp_filename], return_tensors='pt') inputs[modality.value] = to_device(preprocessed_content, self.model.device)['pixel_values'] @@ -300,11 +302,11 @@ def encode(self, content, modality, normalize=True, **kwargs): return embeddings.cpu().numpy() - def _download_content(self, url, filename): + def _download_content(self, url, filename, media_download_headers: Optional[Dict]=None): # 3 seconds for images, 20 seconds for audio and video timeout_ms = 3000 if filename.endswith(('.png', '.jpg', '.jpeg')) else 20000 - buffer = download_image_from_url(url, {}, timeout_ms) + buffer = download_image_from_url(url, media_download_headers, timeout_ms) with open(filename, 'wb') as f: f.write(buffer.getvalue()) diff --git a/src/marqo/s2_inference/s2_inference.py b/src/marqo/s2_inference/s2_inference.py index fc97d5300..d1b60a606 100644 --- a/src/marqo/s2_inference/s2_inference.py +++ b/src/marqo/s2_inference/s2_inference.py @@ -45,10 +45,12 @@ -def vectorise(model_name: str, content: Union[str, List[str], List[Image], List[bytes]], - model_properties: dict = None, - device: str = None, normalize_embeddings: bool = get_default_normalization(), - model_auth: ModelAuth = None, enable_cache: bool = False, modality: Modality = Modality.TEXT, **kwargs,) -> List[List[float]]: +def vectorise( + model_name: str, content: Union[str, List[str], List[Image], List[bytes]], + model_properties: dict = None, + device: str = None, normalize_embeddings: bool = get_default_normalization(), + model_auth: ModelAuth = None, enable_cache: bool = False, modality: Modality = Modality.TEXT, + media_download_headers: Optional[Dict] = None, **kwargs) -> List[List[float]]: if not device: raise InternalError(message=f"vectorise (internal function) cannot be called without setting device!") @@ -63,25 +65,36 @@ def vectorise(model_name: str, content: Union[str, List[str], List[Image], List[ model = _available_models[model_cache_key][AvailableModelsKey.model] if _marqo_inference_cache.is_enabled() and enable_cache: - return _vectorise_with_cache(model, 
model_cache_key, content, normalize_embeddings, modality, **kwargs) + return _vectorise_with_cache(model, model_cache_key, content, normalize_embeddings, modality, + media_download_headers, **kwargs) else: - return _vectorise_without_cache(model_cache_key, content, normalize_embeddings, modality, **kwargs) + return _vectorise_without_cache(model_cache_key, content, normalize_embeddings, modality, media_download_headers, + **kwargs) -def _vectorise_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs): +def _vectorise_with_cache(model, model_cache_key, content, normalize_embeddings, modality, media_download_headers, + **kwargs): if isinstance(content, str): vectorised = _marqo_inference_cache.get(model_cache_key, content) if vectorised is None: - vectorised = _encode_without_cache(model_cache_key, content, normalize_embeddings, modality, **kwargs) + vectorised = _encode_without_cache( + model_cache_key, content, normalize_embeddings, modality, media_download_headers, + **kwargs + ) _marqo_inference_cache.set(model_cache_key, content, vectorised[0]) else: vectorised = _convert_cached_embeddings_to_output(vectorised) return vectorised elif isinstance(content, list): - return _vectorise_list_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs) + return _vectorise_list_with_cache( + model, model_cache_key, content, normalize_embeddings, modality, + media_download_headers, + **kwargs + ) else: raise TypeError(f"Unsupported content type: {type(content).__name__}") -def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embeddings, modality, **kwargs): +def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embeddings, modality, media_download_headers, + **kwargs): contents_to_vectorise = [] cached_output = [] @@ -97,7 +110,10 @@ def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embedd contents_to_vectorise.append(content_item) if contents_to_vectorise: - vectorised_outputs = _encode_without_cache(model_cache_key, contents_to_vectorise, normalize_embeddings, modality, **kwargs) + vectorised_outputs = _encode_without_cache( + model_cache_key, contents_to_vectorise, normalize_embeddings, modality, + media_download_headers, **kwargs + ) # Cache the vectorised outputs for content_item, vectorised_output in zip(contents_to_vectorise, vectorised_outputs): if isinstance(content_item, str): @@ -110,18 +126,25 @@ def _vectorise_list_with_cache(model, model_cache_key, content, normalize_embedd return vectorised_outputs -def _vectorise_without_cache(model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], - normalize_embeddings: bool, modality: Modality, **kwargs) -> List[List[float]]: + +def _vectorise_without_cache( + model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], + normalize_embeddings: bool, modality: Modality, media_download_headers, + **kwargs) -> List[List[float]]: return _encode_without_cache(model_cache_key, content, normalize_embeddings, modality, **kwargs) def _encode_without_cache(model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], - normalize_embeddings: bool, modality: Modality, **kwargs) -> List[List[float]]: + normalize_embeddings: bool, modality: Modality, media_download_headers: Optional[Dict]=None, + **kwargs) -> List[List[float]]: try: model = _available_models[model_cache_key][AvailableModelsKey.model] encoder = get_encoder(model) if isinstance(content, str): - vectorised = 
model.encode(content, normalize=normalize_embeddings, modality=modality, **kwargs) + vectorised = model.encode( + content, normalize=normalize_embeddings, modality=modality, + media_download_headers=media_download_headers, **kwargs + ) elif isinstance(content, (torch.Tensor, torch.FloatTensor)): vectorised = model.encode(content, normalize=normalize_embeddings, modality=modality, **kwargs) else: @@ -134,8 +157,9 @@ def _encode_without_cache(model_cache_key: str, content: Union[str, List[str], L # TODO maybe the infer parameter can be replaced by modality infer = kwargs.pop('infer', False if modality == Modality.TEXT else True) - encoded_batch = encoder.encode(batch, modality=modality, normalize=normalize_embeddings, - infer=infer, **kwargs) + encoded_batch = encoder.encode( + batch, modality=modality, normalize=normalize_embeddings, + media_download_headers=media_download_headers, infer = infer, **kwargs) vector_batches.append(_convert_tensor_to_numpy(encoded_batch)) From 6e4b924428bb2f8eccb8194744f42ab6e2155f10 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 14:26:43 +1100 Subject: [PATCH 08/29] Fix hybrid --- src/marqo/core/search/hybrid_search.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/marqo/core/search/hybrid_search.py b/src/marqo/core/search/hybrid_search.py index 3bc2e4ead..2dfd818c3 100644 --- a/src/marqo/core/search/hybrid_search.py +++ b/src/marqo/core/search/hybrid_search.py @@ -33,7 +33,7 @@ def search( offset: int = 0, ef_search: Optional[int] = None, approximate: bool = True, searchable_attributes: Iterable[str] = None, filter_string: str = None, device: str = None, attributes_to_retrieve: Optional[List[str]] = None, boost: Optional[Dict] = None, - image_download_headers: Optional[Dict] = None, context: Optional[SearchContext] = None, + media_download_headers: Optional[Dict] = None, context: Optional[SearchContext] = None, score_modifiers: Optional[ScoreModifierLists] = None, model_auth: Optional[ModelAuth] = None, highlights: bool = False, text_query_prefix: Optional[str] = None, hybrid_parameters: HybridParameters = None) -> Dict: @@ -51,7 +51,8 @@ def search( verbose: if 0 - nothing is printed. 
if 1 - data is printed without vectors, if 2 - full objects are printed out attributes_to_retrieve: if set, only returns these fields - image_download_headers: headers for downloading images + media_download_headers: headers for downloading media + context: a dictionary to allow custom vectors in search score_modifiers: a dictionary to modify the score based on field values, should be None for hybrid search model_auth: Authorisation details for downloading a model (if required) @@ -151,7 +152,7 @@ def search( q=query_text_vectorise, searchableAttributes=searchable_attributes, searchMethod=SearchMethod.HYBRID, limit=result_count, offset=offset, showHighlights=False, filter=filter_string, attributesToRetrieve=attributes_to_retrieve, - boost=boost, image_download_headers=image_download_headers, context=context, scoreModifiers=score_modifiers, + boost=boost, media_download_headers=media_download_headers, context=context, scoreModifiers=score_modifiers, index=marqo_index, modelAuth=model_auth, text_query_prefix=text_query_prefix, hybridParameters=hybrid_parameters )] From 0f84ba6b8cbb5700d8763df6db864b9f9d47140b Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 14:43:30 +1100 Subject: [PATCH 09/29] Fix hybrid tests --- src/marqo/core/search/hybrid_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/marqo/core/search/hybrid_search.py b/src/marqo/core/search/hybrid_search.py index 2dfd818c3..9ee8a5264 100644 --- a/src/marqo/core/search/hybrid_search.py +++ b/src/marqo/core/search/hybrid_search.py @@ -152,7 +152,7 @@ def search( q=query_text_vectorise, searchableAttributes=searchable_attributes, searchMethod=SearchMethod.HYBRID, limit=result_count, offset=offset, showHighlights=False, filter=filter_string, attributesToRetrieve=attributes_to_retrieve, - boost=boost, media_download_headers=media_download_headers, context=context, scoreModifiers=score_modifiers, + boost=boost, mediaDownloadHeaders=media_download_headers, context=context, scoreModifiers=score_modifiers, index=marqo_index, modelAuth=model_auth, text_query_prefix=text_query_prefix, hybridParameters=hybrid_parameters )] From 8afca5f24f05611744d72ee77cb0d55b9079bfc9 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 15:16:57 +1100 Subject: [PATCH 10/29] Fix embed --- src/marqo/api/models/embed_request.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/marqo/api/models/embed_request.py b/src/marqo/api/models/embed_request.py index ff16f6a3a..27bee6d8f 100644 --- a/src/marqo/api/models/embed_request.py +++ b/src/marqo/api/models/embed_request.py @@ -9,18 +9,23 @@ from pydantic import Field, root_validator from marqo.tensor_search.models.private_models import ModelAuth -from marqo.tensor_search.models.api_models import BaseMarqoModel +from marqo.base_model import MarqoBaseModel from marqo.core.embed.embed import EmbedContentType -class EmbedRequest(BaseMarqoModel): +class EmbedRequest(MarqoBaseModel): # content can be a single query or list of queries. Queries can be a string or a dictionary. 
content: Union[str, Dict[str, float], List[Union[str, Dict[str, float]]]] image_download_headers: Optional[Dict] = Field(default=None, alias="imageDownloadHeaders") - mediaDownloadHeaders: Optional[Dict] = Field(default=None, alias="mediaDownloadHeaders") + mediaDownloadHeaders: Optional[Dict] = None modelAuth: Optional[ModelAuth] = None - content_type: Optional[EmbedContentType] = Field(EmbedContentType.Query, alias=("contentType")) + content_type: Optional[EmbedContentType] = Field(default=EmbedContentType.Query, alias="contentType") + + @root_validator(pre=True) + def _test(cls, values): + print(values) + return values @pydantic.validator('content') def validate_content(cls, value): From 414df743d7c992a3b2240157306401ceab82631b Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 16:14:37 +1100 Subject: [PATCH 11/29] Fix embed --- .../semi_structured_add_document_handler.py | 5 +- .../unstructured_add_document_handler.py | 5 +- .../s2_inference/multimodal_model_load.py | 8 ++- src/marqo/tensor_search/tensor_search.py | 2 +- .../test_add_documents_combined.py | 61 ++++++++++++++++++- 5 files changed, 75 insertions(+), 6 deletions(-) diff --git a/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py b/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py index c82ae3fc1..74ffb4073 100644 --- a/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py +++ b/src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py @@ -41,7 +41,10 @@ def __init__(self, marqo_index: SemiStructuredMarqoIndex, add_docs_params: AddDo def _handle_field(self, marqo_doc, field_name, field_content): self._validate_field(field_name, field_content) - text_field_type = self._infer_field_type(field_content) + text_field_type = self._infer_field_type( + field_content, + media_download_headers=self.add_docs_params.media_download_headers + ) content = self.tensor_fields_container.collect(marqo_doc[MARQO_DOC_ID], field_name, field_content, text_field_type) marqo_doc[field_name] = content diff --git a/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py b/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py index 31c3b300c..c9f89ccd9 100644 --- a/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py +++ b/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py @@ -70,12 +70,13 @@ def _handle_field(self, marqo_doc, field_name, field_content): field_content, text_field_type) marqo_doc[field_name] = content - def _infer_field_type(self, field_content: Any) -> Optional[FieldType]: + def _infer_field_type(self, field_content: Any, media_download_headers: Optional[Dict] = None) \ + -> Optional[FieldType]: if not isinstance(field_content, str): return None try: - modality = infer_modality(field_content) + modality = infer_modality(field_content, media_download_headers) if not self.marqo_index.treat_urls_and_pointers_as_media and modality in [Modality.AUDIO, Modality.VIDEO]: modality = Modality.TEXT diff --git a/src/marqo/s2_inference/multimodal_model_load.py b/src/marqo/s2_inference/multimodal_model_load.py index 61e4992c6..3a73326b3 100644 --- a/src/marqo/s2_inference/multimodal_model_load.py +++ b/src/marqo/s2_inference/multimodal_model_load.py @@ -133,7 +133,14 @@ def encode(self, content, modality, media_download_headers, **kwargs): @contextmanager def fetch_content_sample(url, media_download_headers: Optional[dict] = 
None, sample_size=10240): # 10 KB # It's ok to pass None to requests.get() for headers and it won't change the default headers + """Fetch a sample of the content from the URL. + + Raises: + HTTPError: If the response status code is not 200 + """ response = requests.get(url, stream=True, headers=media_download_headers) + if response.status_code != 200: + response.raise_for_status() buffer = io.BytesIO() try: for chunk in response.iter_content(chunk_size=min(sample_size, 8192)): @@ -157,7 +164,6 @@ def infer_modality(content: Union[str, List[str], bytes], media_download_headers # Encode the URL encoded_url = encode_url(content) - extension = encoded_url.split('.')[-1].lower() if extension in ['jpg', 'jpeg', 'png', 'gif', 'webp']: return Modality.IMAGE diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index 654f3a292..e84201d03 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -2036,7 +2036,7 @@ def add_prefix_to_queries(queries: List[BulkSearchQueryEntity]) -> List[BulkSear # Apply prefix if key is not an image or if index does not treat URLs and pointers as images modality = infer_modality(key, q.mediaDownloadHeaders) if modality == Modality.TEXT: - prefixed_q[key] = f"{text_query_prefix}{value}" + prefixed_q[f"{text_query_prefix}{key}"] = value else: prefixed_q[key] = value new_query_object = BulkSearchQueryEntity( diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py index 21c3b90b4..417dc3028 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_combined.py +++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py @@ -15,6 +15,7 @@ import requests import torch from more_itertools import flatten +from numpy.ma.core import subtract from torch import Tensor import unittest.mock @@ -1092,4 +1093,62 @@ def test_textIndexEmbeddingsUnnormalized(self): embeddings = get_res['results'][0]['_tensor_facets'][0]['_embedding'] norm = np.linalg.norm(np.array(embeddings)) - self.assertTrue(norm - 1.0 > 1e-5, f"Embedding norm is {norm}") \ No newline at end of file + self.assertTrue(norm - 1.0 > 1e-5, f"Embedding norm is {norm}") + + def test_add_private_images_proper_error_returned(self): + """Test to ensure that private images can not be downloaded and an appropriate error is returned""" + test_indexes = [self.structured_marqo_index_name, self.unstructured_marqo_index_name] + documents = [ + { + "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png", + "_id": "1" + }, + { + "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", + "_id": "2" + } + ] + for index_name in test_indexes: + tensor_fields = ["image_field_1"] if index_name == self.unstructured_marqo_index_name else None + with self.subTest(index_name): + res = tensor_search.add_documents( + self.config, + add_docs_params=AddDocsParams( + docs=documents, + index_name=index_name, + tensor_fields=tensor_fields + ) + ) + self.assertTrue(res.errors) + items = res.items + self.assertEqual(2, len(items)) + for item in items: + self.assertEqual(400, item.status) + self.assertIn("403", item.message) + + def test_add_private_images_success(self): + """Test to ensure that private images can be downloaded with proper headers""" + test_indexes = [self.structured_marqo_index_name, self.unstructured_marqo_index_name] + documents = [ + { + "image_field_1": 
"https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png", + "_id": "1" + }, + { + "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", + "_id": "2" + } + ] + for index_name in test_indexes: + tensor_fields = ["image_field_1"] if index_name == self.unstructured_marqo_index_name else None + with self.subTest(index_name): + res = tensor_search.add_documents( + self.config, + add_docs_params=AddDocsParams( + docs=documents, + index_name=index_name, + tensor_fields=tensor_fields, + media_download_headers={"marqo_media_header": "media_header_test_key"} + ) + ) + self.assertFalse(res.errors) \ No newline at end of file From b5e2195d1e333b8b07fead14141e7bb12aa7724c Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 16:39:42 +1100 Subject: [PATCH 12/29] Add add_documents tests and search tests --- src/marqo/s2_inference/s2_inference.py | 2 +- .../integ_tests/test_search_combined.py | 62 ++++++++++++++++--- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/src/marqo/s2_inference/s2_inference.py b/src/marqo/s2_inference/s2_inference.py index d1b60a606..ce01848f5 100644 --- a/src/marqo/s2_inference/s2_inference.py +++ b/src/marqo/s2_inference/s2_inference.py @@ -131,7 +131,7 @@ def _vectorise_without_cache( model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], normalize_embeddings: bool, modality: Modality, media_download_headers, **kwargs) -> List[List[float]]: - return _encode_without_cache(model_cache_key, content, normalize_embeddings, modality, **kwargs) + return _encode_without_cache(model_cache_key, content, normalize_embeddings, modality, media_download_headers, **kwargs) def _encode_without_cache(model_cache_key: str, content: Union[str, List[str], List[Image], List[bytes]], normalize_embeddings: bool, modality: Modality, media_download_headers: Optional[Dict]=None, diff --git a/tests/tensor_search/integ_tests/test_search_combined.py b/tests/tensor_search/integ_tests/test_search_combined.py index e5e26674f..a87e7cabf 100644 --- a/tests/tensor_search/integ_tests/test_search_combined.py +++ b/tests/tensor_search/integ_tests/test_search_combined.py @@ -1,23 +1,25 @@ import os import uuid from unittest import mock -import torch + import pytest +import torch +from pydantic import ValidationError import marqo.core.exceptions as core_exceptions +from marqo import exceptions as base_exceptions +from marqo.core.models.add_docs_params import AddDocsParams from marqo.core.models.marqo_index import * from marqo.core.models.marqo_index_request import FieldRequest -from marqo.tensor_search import tensor_search -from marqo.tensor_search.enums import SearchMethod -from marqo.core.models.add_docs_params import AddDocsParams -from tests.marqo_test import MarqoTestCase, TestImageUrls -from marqo import exceptions as base_exceptions from marqo.core.models.marqo_query import MarqoLexicalQuery from marqo.core.models.score_modifier import ScoreModifierType, ScoreModifier from marqo.core.structured_vespa_index.structured_vespa_index import StructuredVespaIndex from marqo.core.unstructured_vespa_index.unstructured_vespa_index import UnstructuredVespaIndex +from marqo.s2_inference.errors import MediaDownloadError +from marqo.tensor_search import tensor_search +from marqo.tensor_search.enums import SearchMethod from marqo.tensor_search.models.api_models import SearchQuery -from pydantic import ValidationError +from tests.marqo_test import MarqoTestCase, TestImageUrls class TestSearch(MarqoTestCase): @@ -965,4 +967,48 @@ def 
test_search_query_CanAcceptDifferentSearchMethods(self): # A special case for no search method provided search_query = SearchQuery(q="test") - self.assertEqual(SearchMethod.TENSOR, search_query.searchMethod) \ No newline at end of file + self.assertEqual(SearchMethod.TENSOR, search_query.searchMethod) + + def test_search_private_images_proper_error_raised(self): + """Test that search raises a MediaDownloadError when trying to access private images""" + test_indexes = [ + self.unstructured_default_image_index, + self.structured_default_image_index + ] + + test_queries = [({ + "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png": 1, + "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small": 1 }, "dictionary queries"), + ("https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", "str queries")] + for index_name in test_indexes: + for query, msg in test_queries: + with self.subTest(msg=f"index: {index_name}, query: {msg}"): + with self.assertRaises(MediaDownloadError): + _ = tensor_search.search( + config=self.config, + index_name=index_name.name, + text=query, + search_method=SearchMethod.TENSOR, + ) + + def test_search_over_private_images_with_media_download_headers(self): + """Test that search can use private images with media download headers""" + test_indexes = [ + self.unstructured_default_image_index, + self.structured_default_image_index + ] + + test_queries = [({ + "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png": 1, + "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small": 1 }, "dictionary queries"), + ("https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", "str queries")] + for index_name in test_indexes: + for query, msg in test_queries: + with self.subTest(msg=f"index: {index_name}, query: {msg}"): + _ = tensor_search.search( + config=self.config, + index_name=index_name.name, + text=query, + search_method=SearchMethod.TENSOR, + media_download_headers={"marqo_media_header": "media_header_test_key"} + ) \ No newline at end of file From 9b2a08a0adbc4328937a69c84096619ba872f652 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 17:12:20 +1100 Subject: [PATCH 13/29] Respond to Farshid's comments --- src/marqo/api/models/add_docs_objects.py | 2 +- src/marqo/api/models/embed_request.py | 7 +--- .../s2_inference/multimodal_model_load.py | 3 +- src/marqo/tensor_search/models/api_models.py | 2 +- src/marqo/tensor_search/tensor_search.py | 2 +- .../tensor_search/test_modalities_download.py | 32 +++++++++---------- 6 files changed, 21 insertions(+), 27 deletions(-) diff --git a/src/marqo/api/models/add_docs_objects.py b/src/marqo/api/models/add_docs_objects.py index 2174753e6..ad5c2b81d 100644 --- a/src/marqo/api/models/add_docs_objects.py +++ b/src/marqo/api/models/add_docs_objects.py @@ -53,7 +53,7 @@ def _validate_image_download_headers_and_media_download_headers(cls, values): media_download_headers = values.get('mediaDownloadHeaders') if image_download_headers and media_download_headers: raise ValueError("Cannot set both imageDownloadHeaders and mediaDownloadHeaders. " - "The imageDownloadHeaders is deprecated and will be removed in the future. " + "'imageDownloadHeaders' is deprecated and will be removed in the future. 
" "Use mediaDownloadHeaders instead.") if image_download_headers: values['mediaDownloadHeaders'] = image_download_headers diff --git a/src/marqo/api/models/embed_request.py b/src/marqo/api/models/embed_request.py index 27bee6d8f..c1373da6d 100644 --- a/src/marqo/api/models/embed_request.py +++ b/src/marqo/api/models/embed_request.py @@ -22,11 +22,6 @@ class EmbedRequest(MarqoBaseModel): modelAuth: Optional[ModelAuth] = None content_type: Optional[EmbedContentType] = Field(default=EmbedContentType.Query, alias="contentType") - @root_validator(pre=True) - def _test(cls, values): - print(values) - return values - @pydantic.validator('content') def validate_content(cls, value): # Iterate through content list items @@ -70,7 +65,7 @@ def _validate_image_download_headers_and_media_download_headers(cls, values): media_download_headers = values.get('mediaDownloadHeaders') if image_download_headers and media_download_headers: raise ValueError("Cannot set both imageDownloadHeaders and mediaDownloadHeaders. " - "The imageDownloadHeaders is deprecated and will be removed in the future. " + "'imageDownloadHeaders' is deprecated and will be removed in the future. " "Use mediaDownloadHeaders instead.") if image_download_headers: values['mediaDownloadHeaders'] = image_download_headers diff --git a/src/marqo/s2_inference/multimodal_model_load.py b/src/marqo/s2_inference/multimodal_model_load.py index 3a73326b3..2dc6da1bd 100644 --- a/src/marqo/s2_inference/multimodal_model_load.py +++ b/src/marqo/s2_inference/multimodal_model_load.py @@ -139,8 +139,7 @@ def fetch_content_sample(url, media_download_headers: Optional[dict] = None, sam HTTPError: If the response status code is not 200 """ response = requests.get(url, stream=True, headers=media_download_headers) - if response.status_code != 200: - response.raise_for_status() + response.raise_for_status() buffer = io.BytesIO() try: for chunk in response.iter_content(chunk_size=min(sample_size, 8192)): diff --git a/src/marqo/tensor_search/models/api_models.py b/src/marqo/tensor_search/models/api_models.py index d688e55f0..3f4bccd97 100644 --- a/src/marqo/tensor_search/models/api_models.py +++ b/src/marqo/tensor_search/models/api_models.py @@ -82,7 +82,7 @@ def _validate_image_download_headers_and_media_download_headers(cls, values): media_download_headers = values.get('mediaDownloadHeaders') if image_download_headers and media_download_headers: raise ValueError("Cannot set both imageDownloadHeaders(image_download_headers) and mediaDownloadHeaders. " - "The imageDownloadHeaders(image_download_headers) is deprecated and will be removed in the future. " + "'imageDownloadHeaders'(image_download_headers) is deprecated and will be removed in the future. 
" "Use mediaDownloadHeaders instead.") if image_download_headers: values['mediaDownloadHeaders'] = image_download_headers diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index e84201d03..9e7381379 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -2596,7 +2596,7 @@ def vectorise_multimodal_combination_field_structured( content=prefixed_text_content, device=device, normalize_embeddings=normalize_embeddings, - infer=True, + infer=False, model_auth=model_auth, modality=Modality.TEXT ) diff --git a/tests/tensor_search/test_modalities_download.py b/tests/tensor_search/test_modalities_download.py index 55142a5cf..b7158b2be 100644 --- a/tests/tensor_search/test_modalities_download.py +++ b/tests/tensor_search/test_modalities_download.py @@ -62,7 +62,7 @@ def test_image_unstructured_index(self, mock_infer_modality, mock_load_image): tensor_fields = ["field1"] threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, ) @@ -83,7 +83,7 @@ def test_image_structured_index(self, mock_infer_modality, mock_load_image): media_field_types_mapping = {"field1": FieldType.ImagePointer} threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, media_field_types_mapping=media_field_types_mapping @@ -106,7 +106,7 @@ def test_video_unstructured_index(self, mock_infer_modality, mock_download_and_c tensor_fields = ["field1"] threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, ) @@ -130,7 +130,7 @@ def test_audio_structured_index(self, mock_infer_modality, mock_download_and_chu media_field_types_mapping = {"field1": FieldType.AudioPointer} threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, media_field_types_mapping=media_field_types_mapping @@ -148,7 +148,7 @@ def test_unsupported_modality(self, mock_infer_modality): tensor_fields = ["field1"] threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, ) @@ -167,7 +167,7 @@ def test_image_load_error(self, mock_infer_modality, mock_load_image): tensor_fields = ["field1"] threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, ) @@ -189,7 +189,7 @@ def test_video_processing_error(self, mock_infer_modality, mock_download_and_chu tensor_fields = ["field1"] threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + 
docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, ) @@ -217,7 +217,7 @@ def test_video_and_audio_unstructured_index(self, mock_infer_modality, mock_down # Call the function threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, ) @@ -233,10 +233,10 @@ def test_video_and_audio_unstructured_index(self, mock_infer_modality, mock_down # Verify the calls to download_and_chunk_media mock_download_and_chunk.assert_any_call( - self.mock_video_url, "cpu", None, Modality.VIDEO, self.mock_marqo_index.type, self.mock_marqo_index.model, None, None, None + self.mock_video_url, "cpu", {}, Modality.VIDEO, self.mock_marqo_index.type, self.mock_marqo_index.model, None, None, None ) mock_download_and_chunk.assert_any_call( - self.mock_audio_url, "cpu", None, Modality.AUDIO, self.mock_marqo_index.type, self.mock_marqo_index.model, None, None, None + self.mock_audio_url, "cpu", {}, Modality.AUDIO, self.mock_marqo_index.type, self.mock_marqo_index.model, None, None, None ) @patch("marqo.tensor_search.add_docs.download_and_chunk_media") @@ -261,7 +261,7 @@ def test_mismatched_media_fields(self, mock_infer_modality, mock_download_and_ch ] threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, media_field_types_mapping=media_field_types_mapping @@ -291,7 +291,7 @@ def test_invalid_media_fields(self, mock_infer_modality): mock_infer_modality.side_effect = [Modality.TEXT, Modality.TEXT] threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, media_field_types_mapping=media_field_types_mapping @@ -321,7 +321,7 @@ def test_ffmpeg_error_handling(self, mock_infer_modality, mock_download_and_chun mock_download_and_chunk.side_effect = ffmpeg.Error("FFmpeg processing error", stdout=b"", stderr=b"") threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, media_field_types_mapping=media_field_types_mapping @@ -347,7 +347,7 @@ def test_valid_image_processing(self, mock_infer_modality, mock_load_image): media_field_types_mapping = {"image_field": FieldType.ImagePointer} threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, media_field_types_mapping=media_field_types_mapping @@ -365,7 +365,7 @@ def test_media_download_error(self, mock_infer_modality): tensor_fields = ["field1"] threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, 
marqo_index_model=self.mock_marqo_index.model, ) @@ -392,7 +392,7 @@ def test_audio_with_video_only_model(self, mock_infer_modality, mock_download_an # Call the function threaded_download_and_preprocess_content( - docs, media_repo, tensor_fields, {}, device="cpu", + docs, media_repo, tensor_fields, media_download_headers={}, device="cpu", marqo_index_type=self.mock_marqo_index.type, marqo_index_model=self.mock_marqo_index.model, ) From 31048f070dcfadfff7b48db54a78848a15c29234 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 17:35:23 +1100 Subject: [PATCH 14/29] Replace all the image_download_headers with media_download_headers --- .../embedding_models/abstract_clip_model.py | 20 +- .../embedding_models/image_download.py | 236 ------------------ .../embedding_models/open_clip_model.py | 4 +- src/marqo/core/inference/image_download.py | 20 +- src/marqo/s2_inference/clip_utils.py | 54 ++-- src/marqo/tensor_search/tensor_search.py | 4 +- tests/s2_inference/test_image_downloading.py | 8 +- .../test_add_documents_combined.py | 10 +- tests/tensor_search/integ_tests/test_embed.py | 10 +- ...test_add_documents_use_existing_tensors.py | 2 +- tests/tensor_search/test_api_utils.py | 18 +- .../test_image_download_headers.py | 24 +- tests/tensor_search/test_search.py | 2 +- 13 files changed, 88 insertions(+), 324 deletions(-) delete mode 100644 src/marqo/core/inference/embedding_models/image_download.py diff --git a/src/marqo/core/inference/embedding_models/abstract_clip_model.py b/src/marqo/core/inference/embedding_models/abstract_clip_model.py index 43eb3a849..42b8c2d8c 100644 --- a/src/marqo/core/inference/embedding_models/abstract_clip_model.py +++ b/src/marqo/core/inference/embedding_models/abstract_clip_model.py @@ -7,7 +7,7 @@ from marqo.core.inference.image_download import (_is_image, format_and_load_CLIP_images, format_and_load_CLIP_image) from marqo.core.inference.embedding_models.abstract_embedding_model import AbstractEmbeddingModel -from marqo.core.inference.embedding_models.image_download import (_is_image, format_and_load_CLIP_images, +from marqo.core.inference.image_download import (_is_image, format_and_load_CLIP_images, format_and_load_CLIP_image) from marqo.s2_inference.logger import get_logger from marqo.s2_inference.types import * @@ -50,7 +50,7 @@ def encode_text(self, inputs: Union[str, List[str]], normalize: bool = True) -> pass @abstractmethod - def encode_image(self, inputs, normalize: bool = True, image_download_headers: dict = None) -> np.ndarray: + def encode_image(self, inputs, normalize: bool = True, media_download_headers: dict = None) -> np.ndarray: pass def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], normalize=True, **kwargs) -> np.ndarray: @@ -68,8 +68,8 @@ def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], nor if is_image: logger.debug('image') - image_download_headers = kwargs.get("media_download_headers", dict()) - return self.encode_image(inputs, normalize=normalize, image_download_headers=image_download_headers) + media_download_headers = kwargs.get("media_download_headers", dict()) + return self.encode_image(inputs, normalize=normalize, media_download_headers=media_download_headers) else: logger.debug('text') return self.encode_text(inputs, normalize=normalize) @@ -85,27 +85,27 @@ def normalize(outputs): return outputs.norm(dim=-1, keepdim=True) def _preprocess_images(self, images: Union[str, ImageType, List[Union[str, ImageType, Tensor]], Tensor], - image_download_headers: Optional[Dict] = None) 
-> Tensor: + media_download_headers: Optional[Dict] = None) -> Tensor: """Preprocess the input image to be ready for the model. Args: images (Union[str, ImageType, List[Union[str, ImageType, Tensor]], Tensor]): input image, can be a str(url), a PIL image, or a tensor, or a list of them - image_download_headers (Optional[Dict]): headers for the image download + media_download_headers (Optional[Dict]): headers for the image download Return: Tensor: the processed image tensor with shape (batch_size, channel, n_px, n_px) """ if self.model is None: self.load() - if image_download_headers is None: - image_download_headers = dict() + if media_download_headers is None: + media_download_headers = dict() # default to batch encoding if isinstance(images, list): image_input: List[Union[ImageType, Tensor]] \ - = format_and_load_CLIP_images(images, image_download_headers) + = format_and_load_CLIP_images(images, media_download_headers) else: - image_input: List[Union[ImageType, Tensor]] = [format_and_load_CLIP_image(images, image_download_headers)] + image_input: List[Union[ImageType, Tensor]] = [format_and_load_CLIP_image(images, media_download_headers)] image_input_processed: Tensor = torch.stack([self.preprocess(_img).to(self.device) \ if not isinstance(_img, torch.Tensor) else _img \ diff --git a/src/marqo/core/inference/embedding_models/image_download.py b/src/marqo/core/inference/embedding_models/image_download.py deleted file mode 100644 index 65c158e20..000000000 --- a/src/marqo/core/inference/embedding_models/image_download.py +++ /dev/null @@ -1,236 +0,0 @@ -import os -from io import BytesIO - -import certifi -import numpy as np -import pycurl -import requests -import torch -import validators -from PIL import Image, UnidentifiedImageError -from requests.utils import requote_uri - -from marqo import marqo_docs -from marqo.api.exceptions import InternalError -from marqo.s2_inference.errors import ImageDownloadError -from marqo.s2_inference.types import * -from marqo.tensor_search.telemetry import RequestMetrics - -# TODO Merge this with the one in clip_utils in the future refactoring - -DEFAULT_HEADERS = {'User-Agent': 'Marqobot/1.0'} - - -def get_allowed_image_types(): - return {'.jpg', '.png', '.bmp', '.jpeg'} - - -def _is_image(inputs: Union[str, List[Union[str, ImageType, ndarray]]]) -> bool: - # some logic to determine if something is an image or not - # assume the batch is the same type - # maybe we use something like this https://github.com/ahupp/python-magic - - _allowed = get_allowed_image_types() - - # we assume the batch is this way if a list - # otherwise apply over each element - if isinstance(inputs, list): - - if len(inputs) == 0: - raise UnidentifiedImageError("received empty list, expected at least one element.") - - thing = inputs[0] - else: - thing = inputs - - # if it is a string, determine if it is a local file or url - if isinstance(thing, str): - name, extension = os.path.splitext(thing.lower()) - - # if it has the correct extension, asssume yes - if extension in _allowed: - return True - - # if it is a local file without extension, then raise an error - if os.path.isfile(thing): - # we could also read the first part of the file and infer - raise UnidentifiedImageError( - f"local file [{thing}] extension {extension} does not match allowed file types of {_allowed}") - else: - # if it is not a local file and does not have an extension - # check if url - if validators.url(thing): - return True - else: - return False - - # if it is an array, then it is an image - elif 
isinstance(thing, (ImageType, ndarray, Tensor)): - return True - else: - raise UnidentifiedImageError(f"expected type Image or str for inputs but received type {type(thing)}") - - -def format_and_load_CLIP_images(images: List[Union[str, ndarray, ImageType]], image_download_headers: dict) -> List[ - ImageType]: - """takes in a list of strings, arrays or urls and either loads and/or converts to PIL - for the clip model - - Args: - images (List[Union[str, np.ndarray, ImageType]]): list of file locations or arrays (can be mixed) - - Raises: - TypeError: _description_ - - Returns: - List[ImageType]: list of PIL images - """ - if not isinstance(images, list): - raise TypeError(f"expected list but received {type(images)}") - - results = [] - for image in images: - results.append(format_and_load_CLIP_image(image, image_download_headers)) - - return results - - -def format_and_load_CLIP_image(image: Union[str, ndarray, ImageType, Tensor], - image_download_headers: dict) -> Union[ImageType, Tensor]: - """standardizes the input to be a PIL image - - Args: - image (Union[str, np.ndarray, ImageType, Tensor]): can be a local file, url, array or a tensor - - Raises: - ValueError: _description_ - TypeError: _description_ - - Returns: - standardized the image: - ImageType: PIL image if input is a string, an array or a PIL image - Tensor: torch tensor if input is a torch tensor - """ - # check for the input type - if isinstance(image, str): - img = load_image_from_path(image, image_download_headers) - elif isinstance(image, np.ndarray): - img = Image.fromarray(image.astype('uint8'), 'RGB') - elif isinstance(image, torch.Tensor): - img = image - elif isinstance(image, ImageType): - img = image - else: - raise UnidentifiedImageError(f"input of type {type(image)} " - f"did not match allowed types of str, np.ndarray, ImageType, Tensor") - - return img - - -def load_image_from_path(image_path: str, image_download_headers: dict, timeout_ms=3000, - metrics_obj: Optional[RequestMetrics] = None) -> ImageType: - """Loads an image into PIL from a string path that is either local or a url - - Args: - image_path (str): Local or remote path to image. - image_download_headers (dict): header for the image download - timeout_ms (int): timeout (in milliseconds), for the whole request - Raises: - ValueError: If the local path is invalid, and is not a url - UnidentifiedImageError: If the image is irretrievable or unprocessable. - - Returns: - ImageType: In-memory PIL image. - """ - if os.path.isfile(image_path): - img = Image.open(image_path) - elif validators.url(image_path): - if metrics_obj is not None: - metrics_obj.start(f"image_download.{image_path}") - try: - img_io: BytesIO = download_image_from_url(image_path, image_download_headers, timeout_ms) - img = Image.open(img_io) - except ImageDownloadError as e: - raise UnidentifiedImageError(str(e)) from e - finally: - if metrics_obj is not None: - metrics_obj.stop(f"image_download.{image_path}") - else: - raise UnidentifiedImageError(f"Input str of {image_path} is not a local file or a valid url. " - f"If you are using Marqo Cloud, please note that images can only be downloaded " - f"from a URL and local files are not supported. " - f"If you are running Marqo in a Docker container, you will need to use a Docker " - f"volume so that your container can access host files. 
" - f"For more information, please refer to: " - f"{marqo_docs.indexing_images()}") - - return img - - -def download_image_from_url(image_path: str, image_download_headers: dict, timeout_ms: int = 3000) -> BytesIO: - """Download an image from a URL and return a PIL image using pycurl. - - Args: - image_path (str): URL to the image. - image_download_headers (dict): Headers for the image download. - timeout_ms (int): Timeout in milliseconds, for the whole request. - - Returns: - buffer (BytesIO): The image as a BytesIO object. - - Raises: - ImageDownloadError: If the image download fails. - """ - - if not isinstance(timeout_ms, int): - raise InternalError(f"timeout must be an integer but received {timeout_ms} of type {type(timeout_ms)}") - - try: - encoded_url = encode_url(image_path) - except UnicodeEncodeError as e: - raise ImageDownloadError(f"Marqo encountered an error when downloading the image url {image_path}. " - f"The url could not be encoded properly. Original error: {e}") - buffer = BytesIO() - c = pycurl.Curl() - c.setopt(pycurl.CAINFO, certifi.where()) - c.setopt(pycurl.URL, encoded_url) - c.setopt(pycurl.WRITEDATA, buffer) - c.setopt(pycurl.TIMEOUT_MS, timeout_ms) - c.setopt(pycurl.FOLLOWLOCATION, 1) - - headers = DEFAULT_HEADERS.copy() - headers.update(image_download_headers) - c.setopt(pycurl.HTTPHEADER, [f"{k}: {v}" for k, v in headers.items()]) - - try: - c.perform() - if c.getinfo(pycurl.RESPONSE_CODE) != 200: - raise ImageDownloadError(f"image url `{image_path}` returned {c.getinfo(pycurl.RESPONSE_CODE)}") - except pycurl.error as e: - raise ImageDownloadError(f"Marqo encountered an error when downloading the image url {image_path}. " - f"The original error is: {e}") - finally: - c.close() - buffer.seek(0) - return buffer - - -def encode_url(url: str) -> str: - """ - Encode a URL to a valid format with only ASCII characters and reserved characters using percent-encoding. - - In version 2.8, we replaced the requests library with pycurl for image downloads. Consequently, we need to implement - the URL encoding function ourselves. This function replicates the encoding behavior of the - 'requests.utils.requote_uri' function from the requests library. - - Args: - url (str): The URL to encode. - - Returns: - str: The encoded URL. - - Raises: - UnicodeEncodeError: If the URL cannot be encoded properly. 
- - """ - return requests.utils.requote_uri(url) diff --git a/src/marqo/core/inference/embedding_models/open_clip_model.py b/src/marqo/core/inference/embedding_models/open_clip_model.py index e79cb9feb..fdc050316 100644 --- a/src/marqo/core/inference/embedding_models/open_clip_model.py +++ b/src/marqo/core/inference/embedding_models/open_clip_model.py @@ -247,10 +247,10 @@ def _download_from_repo(self): return model_file_path def encode_image(self, images: Union[str, ImageType, List[Union[str, ImageType]]], - image_download_headers: Optional[Dict] = None, + media_download_headers: Optional[Dict] = None, normalize=True) -> FloatTensor: - self.image_input_processed: Tensor = self._preprocess_images(images, image_download_headers) + self.image_input_processed: Tensor = self._preprocess_images(images, media_download_headers) with torch.no_grad(): if self.device.startswith("cuda"): diff --git a/src/marqo/core/inference/image_download.py b/src/marqo/core/inference/image_download.py index 65c158e20..9cebb5948 100644 --- a/src/marqo/core/inference/image_download.py +++ b/src/marqo/core/inference/image_download.py @@ -71,7 +71,7 @@ def _is_image(inputs: Union[str, List[Union[str, ImageType, ndarray]]]) -> bool: raise UnidentifiedImageError(f"expected type Image or str for inputs but received type {type(thing)}") -def format_and_load_CLIP_images(images: List[Union[str, ndarray, ImageType]], image_download_headers: dict) -> List[ +def format_and_load_CLIP_images(images: List[Union[str, ndarray, ImageType]], media_download_headers: dict) -> List[ ImageType]: """takes in a list of strings, arrays or urls and either loads and/or converts to PIL for the clip model @@ -90,13 +90,13 @@ def format_and_load_CLIP_images(images: List[Union[str, ndarray, ImageType]], im results = [] for image in images: - results.append(format_and_load_CLIP_image(image, image_download_headers)) + results.append(format_and_load_CLIP_image(image, media_download_headers)) return results def format_and_load_CLIP_image(image: Union[str, ndarray, ImageType, Tensor], - image_download_headers: dict) -> Union[ImageType, Tensor]: + media_download_headers: dict) -> Union[ImageType, Tensor]: """standardizes the input to be a PIL image Args: @@ -113,7 +113,7 @@ def format_and_load_CLIP_image(image: Union[str, ndarray, ImageType, Tensor], """ # check for the input type if isinstance(image, str): - img = load_image_from_path(image, image_download_headers) + img = load_image_from_path(image, media_download_headers) elif isinstance(image, np.ndarray): img = Image.fromarray(image.astype('uint8'), 'RGB') elif isinstance(image, torch.Tensor): @@ -127,13 +127,13 @@ def format_and_load_CLIP_image(image: Union[str, ndarray, ImageType, Tensor], return img -def load_image_from_path(image_path: str, image_download_headers: dict, timeout_ms=3000, +def load_image_from_path(image_path: str, media_download_headers: dict, timeout_ms=3000, metrics_obj: Optional[RequestMetrics] = None) -> ImageType: """Loads an image into PIL from a string path that is either local or a url Args: image_path (str): Local or remote path to image. 
- image_download_headers (dict): header for the image download + media_download_headers (dict): header for the image download timeout_ms (int): timeout (in milliseconds), for the whole request Raises: ValueError: If the local path is invalid, and is not a url @@ -148,7 +148,7 @@ def load_image_from_path(image_path: str, image_download_headers: dict, timeout_ if metrics_obj is not None: metrics_obj.start(f"image_download.{image_path}") try: - img_io: BytesIO = download_image_from_url(image_path, image_download_headers, timeout_ms) + img_io: BytesIO = download_image_from_url(image_path, media_download_headers, timeout_ms) img = Image.open(img_io) except ImageDownloadError as e: raise UnidentifiedImageError(str(e)) from e @@ -167,12 +167,12 @@ def load_image_from_path(image_path: str, image_download_headers: dict, timeout_ return img -def download_image_from_url(image_path: str, image_download_headers: dict, timeout_ms: int = 3000) -> BytesIO: +def download_image_from_url(image_path: str, media_download_headers: dict, timeout_ms: int = 3000) -> BytesIO: """Download an image from a URL and return a PIL image using pycurl. Args: image_path (str): URL to the image. - image_download_headers (dict): Headers for the image download. + media_download_headers (dict): Headers for the image download. timeout_ms (int): Timeout in milliseconds, for the whole request. Returns: @@ -199,7 +199,7 @@ def download_image_from_url(image_path: str, image_download_headers: dict, timeo c.setopt(pycurl.FOLLOWLOCATION, 1) headers = DEFAULT_HEADERS.copy() - headers.update(image_download_headers) + headers.update(media_download_headers) c.setopt(pycurl.HTTPHEADER, [f"{k}: {v}" for k, v in headers.items()]) try: diff --git a/src/marqo/s2_inference/clip_utils.py b/src/marqo/s2_inference/clip_utils.py index ff787dd08..d1fd5c684 100644 --- a/src/marqo/s2_inference/clip_utils.py +++ b/src/marqo/s2_inference/clip_utils.py @@ -67,7 +67,7 @@ def _get_transform(n_px: int, image_mean: List[float] = None, image_std: List[fl ]) -def format_and_load_CLIP_images(images: List[Union[str, ndarray, ImageType]], image_download_headers: dict) -> List[ +def format_and_load_CLIP_images(images: List[Union[str, ndarray, ImageType]], media_download_headers: dict) -> List[ ImageType]: """takes in a list of strings, arrays or urls and either loads and/or converts to PIL for the clip model @@ -86,18 +86,18 @@ def format_and_load_CLIP_images(images: List[Union[str, ndarray, ImageType]], im results = [] for image in images: - results.append(format_and_load_CLIP_image(image, image_download_headers)) + results.append(format_and_load_CLIP_image(image, media_download_headers)) return results -def load_image_from_path(image_path: str, image_download_headers: dict, timeout_ms=3000, +def load_image_from_path(image_path: str, media_download_headers: dict, timeout_ms=3000, metrics_obj: Optional[RequestMetrics] = None) -> ImageType: """Loads an image into PIL from a string path that is either local or a url Args: image_path (str): Local or remote path to image. 
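For readers following the rename, load_image_from_path is the loader most call sites touch after this change. A minimal usage sketch, assuming a local Marqo checkout where clip_utils is importable; the URL and token below are placeholders and are not taken from this patch:

from marqo.s2_inference.clip_utils import load_image_from_path

# Placeholder URL and credentials for illustration only; real values come from the caller.
img = load_image_from_path(
    "https://example.com/private_image.png",
    media_download_headers={"Authorization": "Bearer <token>"},  # merged over DEFAULT_HEADERS before download
)
# img is a PIL image ready for CLIP preprocessing.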
- image_download_headers (dict): header for the image download + media_download_headers (dict): header for the image download timeout_ms (int): timeout (in milliseconds), for the whole request Raises: ValueError: If the local path is invalid, and is not a url @@ -112,7 +112,7 @@ def load_image_from_path(image_path: str, image_download_headers: dict, timeout_ if metrics_obj is not None: metrics_obj.start(f"image_download.{image_path}") try: - img_io: BytesIO = download_image_from_url(image_path, image_download_headers, timeout_ms) + img_io: BytesIO = download_image_from_url(image_path, media_download_headers, timeout_ms) img = Image.open(img_io) except ImageDownloadError as e: raise UnidentifiedImageError(str(e)) from e @@ -145,12 +145,12 @@ def validate_url(url: str) -> bool: -def download_image_from_url(image_path: str, image_download_headers: dict, timeout_ms: int = 3000) -> BytesIO: +def download_image_from_url(image_path: str, media_download_headers: dict, timeout_ms: int = 3000) -> BytesIO: """Download an image from a URL and return a PIL image using pycurl. Args: image_path (str): URL to the image. - image_download_headers (dict): Headers for the image download. + media_download_headers (dict): Headers for the image download. timeout_ms (int): Timeout in milliseconds, for the whole request. Returns: @@ -177,9 +177,9 @@ def download_image_from_url(image_path: str, image_download_headers: dict, timeo c.setopt(pycurl.FOLLOWLOCATION, 1) headers = DEFAULT_HEADERS.copy() - if image_download_headers is None: - image_download_headers = dict() - headers.update(image_download_headers) + if media_download_headers is None: + media_download_headers = dict() + headers.update(media_download_headers) c.setopt(pycurl.HTTPHEADER, [f"{k}: {v}" for k, v in headers.items()]) try: @@ -217,7 +217,7 @@ def encode_url(url: str) -> str: def format_and_load_CLIP_image(image: Union[str, ndarray, ImageType, Tensor], - image_download_headers: dict) -> Union[ImageType, Tensor]: + media_download_headers: dict) -> Union[ImageType, Tensor]: """standardizes the input to be a PIL image Args: @@ -234,7 +234,7 @@ def format_and_load_CLIP_image(image: Union[str, ndarray, ImageType, Tensor], """ # check for the input type if isinstance(image, str): - img = load_image_from_path(image, image_download_headers) + img = load_image_from_path(image, media_download_headers) elif isinstance(image, np.ndarray): img = Image.fromarray(image.astype('uint8'), 'RGB') elif isinstance(image, torch.Tensor): @@ -420,27 +420,27 @@ def encode_text(self, sentence: Union[str, List[str]], normalize=True) -> FloatT return self._convert_output(outputs) def _preprocess_images(self, images: Union[str, ImageType, List[Union[str, ImageType, Tensor]], Tensor], - image_download_headers: Optional[Dict] = None) -> Tensor: + media_download_headers: Optional[Dict] = None) -> Tensor: """Preprocess the input image to be ready for the model. 
Args: images (Union[str, ImageType, List[Union[str, ImageType, Tensor]], Tensor]): input image, can be a str(url), a PIL image, or a tensor, or a list of them - image_download_headers (Optional[Dict]): headers for the image download + media_download_headers (Optional[Dict]): headers for the image download Return: Tensor: the processed image tensor with shape (batch_size, channel, n_px, n_px) """ if self.model is None: self.load() - if image_download_headers is None: - image_download_headers = dict() + if media_download_headers is None: + media_download_headers = dict() # default to batch encoding if isinstance(images, list): image_input: List[Union[ImageType, Tensor]] \ - = format_and_load_CLIP_images(images, image_download_headers) + = format_and_load_CLIP_images(images, media_download_headers) else: - image_input: List[Union[ImageType, Tensor]] = [format_and_load_CLIP_image(images, image_download_headers)] + image_input: List[Union[ImageType, Tensor]] = [format_and_load_CLIP_image(images, media_download_headers)] image_input_processed: Tensor = torch.stack([self.preprocess(_img).to(self.device) \ if not isinstance(_img, torch.Tensor) else _img \ @@ -448,18 +448,18 @@ def _preprocess_images(self, images: Union[str, ImageType, List[Union[str, Image return image_input_processed def encode_image(self, images: Union[str, ImageType, List[Union[str, ImageType, Tensor]], Tensor], - normalize=True, image_download_headers: Optional[Dict] = None) -> FloatTensor: + normalize=True, media_download_headers: Optional[Dict] = None) -> FloatTensor: """Encode the input image to a tensor representation. Args: images (Union[str, ImageType, List[Union[str, ImageType, Tensor]], Tensor]): input image, can be a str(url), a PIL image, or a tensor, or a list of them normalize (bool): whether to normalize the output tensor - image_download_headers (Optional[Dict]): headers for the image download + media_download_headers (Optional[Dict]): headers for the image download Return: FloatTensor: the encoded image tensor with shape (batch_size, embedding_dim) """ - self.image_input_processed: Tensor = self._preprocess_images(images, image_download_headers) + self.image_input_processed: Tensor = self._preprocess_images(images, media_download_headers) with torch.no_grad(): outputs = self.model.encode_image(self.image_input_processed) @@ -487,8 +487,8 @@ def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]], if is_image: logger.debug('image') - image_download_headers = kwargs.get("media_download_headers", dict()) - return self.encode_image(inputs, normalize=normalize, image_download_headers=image_download_headers) + media_download_headers = kwargs.get("media_download_headers", dict()) + return self.encode_image(inputs, normalize=normalize, media_download_headers=media_download_headers) else: logger.debug('text') return self.encode_text(inputs, normalize=normalize) @@ -573,16 +573,16 @@ def encode_text(self, sentence: Union[str, List[str]], normalize=True) -> FloatT return self._convert_output(outputs) def encode_image(self, images: Union[str, ImageType, List[Union[str, ImageType]]], - normalize=True, image_download_headers: Optional[dict] = None) -> FloatTensor: + normalize=True, media_download_headers: Optional[dict] = None) -> FloatTensor: if self.visual_model is None: self.load() - if image_download_headers is None: - image_download_headers = dict() + if media_download_headers is None: + media_download_headers = dict() # default to batch encoding if isinstance(images, list): - image_input = 
format_and_load_CLIP_images(images, image_download_headers) + image_input = format_and_load_CLIP_images(images, media_download_headers) else: image_input = [format_and_load_CLIP_image(images, {})] diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py index 9e7381379..f040f899b 100644 --- a/src/marqo/tensor_search/tensor_search.py +++ b/src/marqo/tensor_search/tensor_search.py @@ -186,7 +186,7 @@ def _add_documents_unstructured(config: Config, add_docs_params: AddDocsParams, docs=docs, thread_count=media_download_thread_count, tensor_fields=tensor_fields_and_multimodal_subfields, - image_download_headers=add_docs_params.image_download_headers, + media_download_headers=add_docs_params.media_download_headers, model_name=marqo_index.model.name, normalize_embeddings=marqo_index.normalize_embeddings, media_field_types_mapping=None, @@ -709,7 +709,7 @@ def _add_documents_structured(config: Config, add_docs_params: AddDocsParams, ma docs=docs, thread_count=media_download_thread_count, tensor_fields=media_fields, - image_download_headers=add_docs_params.image_download_headers, + media_download_headers=add_docs_params.media_download_headers, # add non image download headers in the future model_name=marqo_index.model.name, normalize_embeddings=marqo_index.normalize_embeddings, diff --git a/tests/s2_inference/test_image_downloading.py b/tests/s2_inference/test_image_downloading.py index 89f88200f..29a214024 100644 --- a/tests/s2_inference/test_image_downloading.py +++ b/tests/s2_inference/test_image_downloading.py @@ -53,12 +53,12 @@ def test_download_image_from_url_handleDifferentUrlsCorrectly(self): for url, expected, msg in self.test_cases: with self.subTest(url=url, expected=expected, msg=msg): with self.assertRaises(ImageDownloadError) as cm: - download_image_from_url(image_path=url + ".jpg", image_download_headers={}) + download_image_from_url(image_path=url + ".jpg", media_download_headers={}) def test_download_image_from_url_handlesUrlRequiringUserAgentHeader(self): url_requiring_user_agent_header = "https://docs.marqo.ai/2.0.0/Examples/marqo.jpg" try: - download_image_from_url(image_path=url_requiring_user_agent_header, image_download_headers={}) + download_image_from_url(image_path=url_requiring_user_agent_header, media_download_headers={}) except Exception as e: self.fail(f"Exception was raised when downloading {url_requiring_user_agent_header}: {e}") @@ -77,7 +77,7 @@ def test_download_image_from_url_mergesDefaultHeadersWithCustomHeaders(self, moc for (headers, expected_headers, msg) in test_cases: with self.subTest(headers=headers, expected_headers=expected_headers, msg=msg): - download_image_from_url('http://example.com/image.jpg', image_download_headers=headers) + download_image_from_url('http://example.com/image.jpg', media_download_headers=headers) mock_curl_instance.setopt.assert_called_with(pycurl.HTTPHEADER, expected_headers) def test_download_image_from_url_handlesRedirection(self): @@ -88,5 +88,5 @@ def test_download_image_from_url_handlesRedirection(self): ]) with MockHttpServer(app).run_in_thread() as base_url: - result = download_image_from_url(f'{base_url}/missing_image.jpg', image_download_headers={}) + result = download_image_from_url(f'{base_url}/missing_image.jpg', media_download_headers={}) self.assertEqual(result.getvalue(), image_content) diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py index 417dc3028..4a9238a66 100644 --- 
a/tests/tensor_search/integ_tests/test_add_documents_combined.py +++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py @@ -578,7 +578,7 @@ def test_imageDownloadWithoutPreprocessor(self): allocated_docs=[test_doc], media_repo=media_repo, tensor_fields=['field_1', 'field_2'], - image_download_headers={}, + media_download_headers={}, marqo_index_type=IndexType.Unstructured, marqo_index_model=Model(name="test", properties={}), ) @@ -598,7 +598,7 @@ def test_imageDownloadWithPreprocessor(self): allocated_docs=[test_doc], media_repo=media_repo, tensor_fields=['field_1', 'field_2'], - image_download_headers={}, + media_download_headers={}, preprocessors={'image': lambda x: torch.randn(3, 224, 224)}, device='cpu', marqo_index_type=IndexType.Unstructured, @@ -620,7 +620,7 @@ def run(): {"Title": "frog", "Desc": "blah"}, {"Title": "Dog", "Loc": "https://google.com/my_dog.png"}], media_repo=media_repo, tensor_fields=['Title', 'Desc', 'Loc'], - image_download_headers={}, + media_download_headers={}, marqo_index_type=IndexType.Unstructured, marqo_index_model=Model(name="test", properties={}), ) @@ -709,7 +709,7 @@ def test_threaded_download_images_non_tensor_field(self): allocated_docs=docs, media_repo=media_repo, tensor_fields=['field_1', 'field_2'], - image_download_headers={}, + media_download_headers={}, marqo_index_type=IndexType.Unstructured, marqo_index_model=Model(name="test", properties={}), ) @@ -761,7 +761,7 @@ def test_download_images_non_tensor_field(self): docs=docs, thread_count=20, tensor_fields=['field_1', 'field_2'], - image_download_headers={}, + media_download_headers={}, model_name="ViT-B/32", normalize_embeddings=True, model_properties=model_properties, diff --git a/tests/tensor_search/integ_tests/test_embed.py b/tests/tensor_search/integ_tests/test_embed.py index a971ced5d..77ba44f98 100644 --- a/tests/tensor_search/integ_tests/test_embed.py +++ b/tests/tensor_search/integ_tests/test_embed.py @@ -523,9 +523,9 @@ def run(): self.assertEqual(embed_res["content"], [image_url]) self.assertTrue(np.allclose(embed_res["embeddings"][0], search_query_embedding)) - def test_embed_with_image_download_headers_and_model_auth(self): + def test_embed_with_media_download_headers_and_model_auth(self): """ - Ensure that vectorise is called with the correct image_download_headers and model_auth + Ensure that vectorise is called with the correct media_download_headers and model_auth when using the embed endpoint. """ for index in [self.unstructured_default_image_index, self.structured_default_image_index]: @@ -537,7 +537,7 @@ def pass_through_vectorise(*arg, **kwargs): via mock Set image download headers and model auth to None so there's no error out. 
""" - kwargs["image_download_headers"] = None + kwargs["media_download_headers"] = None kwargs["model_auth"] = None return vectorise(*arg, **kwargs) @@ -549,7 +549,7 @@ def run(): marqo_config=self.config, index_name=index.name, embedding_request=EmbedRequest( content=[image_url], - image_download_headers={"Authorization": "my secret key"}, + media_download_headers={"Authorization": "my secret key"}, modelAuth=ModelAuth(s3=S3Auth( aws_access_key_id='12345', aws_secret_access_key='this-is-a-secret')) @@ -564,7 +564,7 @@ def run(): self.assertEqual(len(call_args), 1) vectorise_kwargs = call_args[0].kwargs - self.assertEqual(vectorise_kwargs["image_download_headers"], {"Authorization": "my secret key"}) + self.assertEqual(vectorise_kwargs["media_download_headers"], {"Authorization": "my secret key"}) self.assertEqual(vectorise_kwargs["model_auth"], ModelAuth(s3=S3Auth( aws_access_key_id='12345', aws_secret_access_key='this-is-a-secret'))) diff --git a/tests/tensor_search/test_add_documents_use_existing_tensors.py b/tests/tensor_search/test_add_documents_use_existing_tensors.py index cd9ea8e88..b1febcfc3 100644 --- a/tests/tensor_search/test_add_documents_use_existing_tensors.py +++ b/tests/tensor_search/test_add_documents_use_existing_tensors.py @@ -829,7 +829,7 @@ def run(): vectorised_content = [call_kwargs['content'] for call_args, call_kwargs in mock_vectorise.call_args_list] - artefact_pil_image = load_image_from_path(artefact_hippo_img, image_download_headers={}) + artefact_pil_image = load_image_from_path(artefact_hippo_img, media_download_headers={}) expected_to_be_vectorised = [ ["this is the updated 1st sentence.", "This is my second"], ["this is a brand new sentence.", "Yes it is"], diff --git a/tests/tensor_search/test_api_utils.py b/tests/tensor_search/test_api_utils.py index 437d81654..acb040651 100644 --- a/tests/tensor_search/test_api_utils.py +++ b/tests/tensor_search/test_api_utils.py @@ -98,13 +98,13 @@ def test_add_docs_params_orchestrator(self): # Query parameters should be parsed as default values non_tensor_fields = [] use_existing_tensors = False - image_download_headers = dict() + media_download_headers = dict() model_auth = None mappings = dict() # Call the function with the arguments result = add_docs_params_orchestrator(index_name, body, device, non_tensor_fields, mappings, - model_auth, image_download_headers, use_existing_tensors) + model_auth, media_download_headers, use_existing_tensors) # Assert that the result is as expected assert isinstance(result, AddDocsParams) @@ -114,7 +114,7 @@ def test_add_docs_params_orchestrator(self): assert result.non_tensor_fields == ["field1"] assert result.use_existing_tensors == True assert result.docs == [{"test": "doc"}] - assert result.image_download_headers == {"header1": "value1"} + assert result.media_download_headers == {"header1": "value1"} def test_add_docs_params_orchestrator_deprecated_query_parameters(self): # Set up the arguments for the function @@ -126,14 +126,14 @@ def test_add_docs_params_orchestrator_deprecated_query_parameters(self): device = "test-device" non_tensor_fields = ["field1"] use_existing_tensors = True - image_download_headers = {"header1": "value1"} + media_download_headers = {"header1": "value1"} model_auth = model_auth mappings = {"map1": "value1"} auto_refresh = True # Call the function with the arguments result = add_docs_params_orchestrator(index_name, body, device, auto_refresh, non_tensor_fields, mappings, - model_auth, image_download_headers, use_existing_tensors) + model_auth, 
media_download_headers, use_existing_tensors) # Assert that the result is as expected assert isinstance(result, AddDocsParams) @@ -143,7 +143,7 @@ def test_add_docs_params_orchestrator_deprecated_query_parameters(self): assert result.non_tensor_fields == ["field1"] assert result.use_existing_tensors == True assert result.docs == [{"test": "doc"}] - assert result.image_download_headers == {"header1": "value1"} + assert result.media_download_headers == {"header1": "value1"} def test_add_docs_params_orchestrator_error(self): # Test the case where the function should raise an error due to invalid input @@ -155,7 +155,7 @@ def test_add_docs_params_orchestrator_error(self): device = "test-device" non_tensor_fields = ["field1"] use_existing_tensors = True - image_download_headers = {"header1": "value1"} + media_download_headers = {"header1": "value1"} model_auth = model_auth mappings = {"map1": "value1"} auto_refresh = True @@ -163,7 +163,7 @@ def test_add_docs_params_orchestrator_error(self): # Use pytest.raises to check for the error try: _ = add_docs_params_orchestrator(index_name, body, device, auto_refresh, non_tensor_fields, mappings, - model_auth, image_download_headers, use_existing_tensors) + model_auth, media_download_headers, use_existing_tensors) except InternalError as e: self.assertIn("Unexpected request body type", str(e)) @@ -181,7 +181,7 @@ def test_add_docs_params_orchestrator_deprecated_query_parameters_error(self): mappings={"map1": "value1"}) params = {"non_tensor_fields": ["what"], "use_existing_tensors": True, - "image_download_headers": {"header2": "value2"}, "model_auth": model_auth, + "media_download_headers": {"header2": "value2"}, "model_auth": model_auth, "mappings": {"map2": "value2"}} for param, value in params.items(): diff --git a/tests/tensor_search/test_image_download_headers.py b/tests/tensor_search/test_image_download_headers.py index ea692be9e..04c0ef0a7 100644 --- a/tests/tensor_search/test_image_download_headers.py +++ b/tests/tensor_search/test_image_download_headers.py @@ -62,11 +62,11 @@ def test_img_download_search(self): tensor_search.create_vector_index( config=self.config, index_name=self.index_name_1, index_settings=self.image_index_settings() ) - image_download_headers = {"Authorization": "some secret key blah"} + media_download_headers = {"Authorization": "some secret key blah"} self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[ {"_id": "1", "image": self.real_img_url}], - auto_refresh=True, image_download_headers=image_download_headers, device="cpu")) + auto_refresh=True, media_download_headers=media_download_headers, device="cpu")) def pass_through_requests_get(url, *args, **kwargs): return requests_get(url, *args, **kwargs) @@ -80,11 +80,11 @@ def pass_through_requests_get(url, *args, **kwargs): # Perform a vector search search_res = tensor_search._vector_text_search( config=self.config, index_name=self.index_name_1, - result_count=1, query=self.real_img_url, image_download_headers=image_download_headers, device="cpu" + result_count=1, query=self.real_img_url, media_download_headers=media_download_headers, device="cpu" ) # Check if the image URL was called at least once with the correct headers image_url_called = any( - call_args[0] == self.real_img_url and call_kwargs.get('headers', None) == image_download_headers + call_args[0] == self.real_img_url and call_kwargs.get('headers', None) == media_download_headers for call_args, call_kwargs in mock_get.call_args_list ) assert 
image_url_called, "Image URL not called with the correct headers" @@ -102,18 +102,18 @@ def pass_through_load_image_from_path(*arg, **kwargs): @unittest.mock.patch("marqo.s2_inference.clip_utils.load_image_from_path", mock_load_image_from_path) def run(): - image_download_headers = {"Authorization": "some secret key blah"} + media_download_headers = {"Authorization": "some secret key blah"} # Add a document with an image URL self.add_documents(config=self.config, add_docs_params=AddDocsParams( index_name=self.index_name_1, docs=[ {"_id": "1", "image": self.real_img_url} - ], auto_refresh=True, image_download_headers=image_download_headers, device="cpu" + ], auto_refresh=True, media_download_headers=media_download_headers, device="cpu" )) # Check if load_image_from_path was called with the correct headers assert len(mock_load_image_from_path.call_args_list) == 1 call_args, call_kwargs = mock_load_image_from_path.call_args_list[0] - assert image_download_headers in call_args + assert media_download_headers in call_args return True assert run() is True @@ -123,14 +123,14 @@ def test_img_download_bulk_search(self): tensor_search.create_vector_index(config=self.config, index_name=self.index_name_1, index_settings=self.image_index_settings()) test_image_url = self.real_img_url - image_download_headers = {"Authorization": "some secret key blah"} + media_download_headers = {"Authorization": "some secret key blah"} def pass_through_load_image_from_path(*args, **kwargs): return load_image_from_path(*args, **kwargs) def pass_through_requests_get(url, *args, **kwargs): if url == test_image_url: - assert kwargs.get('headers', None) == image_download_headers + assert kwargs.get('headers', None) == media_download_headers return requests_get(url, *args, **kwargs) # Mock the load_image_from_path function @@ -144,7 +144,7 @@ def pass_through_requests_get(url, *args, **kwargs): "_id": "1", "image": test_image_url, }], - auto_refresh=True, image_download_headers=image_download_headers, device="cpu")) + auto_refresh=True, media_download_headers=media_download_headers, device="cpu")) # Set up the mock GET mock_get = unittest.mock.MagicMock() @@ -155,13 +155,13 @@ def pass_through_requests_get(url, *args, **kwargs): bulk_search_query = BulkSearchQuery(queries=[{ "index": self.index_name_1, "q": self.real_img_url, - "image_download_headers": image_download_headers + "media_download_headers": media_download_headers }]) resp = tensor_search.bulk_search(marqo_config=self.config, query=bulk_search_query) # Check if the image URL was called at least once with the correct headers image_url_called = any( - call_args[0] == test_image_url and call_kwargs.get('headers', None) == image_download_headers + call_args[0] == test_image_url and call_kwargs.get('headers', None) == media_download_headers for call_args, call_kwargs in mock_get.call_args_list ) assert image_url_called, "Image URL not called with the correct headers" diff --git a/tests/tensor_search/test_search.py b/tests/tensor_search/test_search.py index c44848c12..0a0fbdc15 100644 --- a/tests/tensor_search/test_search.py +++ b/tests/tensor_search/test_search.py @@ -1136,7 +1136,7 @@ def run() -> typing.List[float]: weighted_vectors = [] for q, weight in multi_query.items(): vec = vectorise(model_name="ViT-B/16", content=[q, ], - image_download_headers=None, normalize_embeddings=True, + media_download_headers=None, normalize_embeddings=True, device="cpu")[0] weighted_vectors.append(np.asarray(vec) * weight) From 2e35bacca465983bf2ee4ef735f0cd35af9af3e4 Mon Sep 17 
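Taken together, the renames in this patch reach the add-documents entry point: AddDocsParams now carries media_download_headers, which tensor_search passes to the threaded media download. A hedged sketch of constructing the params after this change; the index name, document, and token are illustrative placeholders, not values from the patch:

from marqo.core.models.add_docs_params import AddDocsParams

# All literal values below are placeholders for illustration only.
params = AddDocsParams(
    index_name="my-index",
    docs=[{"_id": "1", "image_field": "https://example.com/image.png"}],
    tensor_fields=["image_field"],
    media_download_headers={"Authorization": "Bearer <token>"},  # formerly image_download_headers
    device="cpu",
)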
00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 19:24:13 +1100 Subject: [PATCH 15/29] Fix tests --- .../unstructured_add_document_handler.py | 2 +- src/marqo/s2_inference/multimodal_model_load.py | 1 - tests/s2_inference/test_vectorise.py | 3 ++- tests/tensor_search/integ_tests/test_add_documents_combined.py | 3 ++- tests/tensor_search/integ_tests/test_embed.py | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py b/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py index c9f89ccd9..7915455aa 100644 --- a/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py +++ b/src/marqo/core/unstructured_vespa_index/unstructured_add_document_handler.py @@ -65,7 +65,7 @@ def _validate_doc(self, doc): def _handle_field(self, marqo_doc, field_name, field_content): self._validate_field(field_name, field_content) - text_field_type = self._infer_field_type(field_content) + text_field_type = self._infer_field_type(field_content, self.add_docs_params.media_download_headers) content = self.tensor_fields_container.collect(marqo_doc[MARQO_DOC_ID], field_name, field_content, text_field_type) marqo_doc[field_name] = content diff --git a/src/marqo/s2_inference/multimodal_model_load.py b/src/marqo/s2_inference/multimodal_model_load.py index 2dc6da1bd..ad4bb2506 100644 --- a/src/marqo/s2_inference/multimodal_model_load.py +++ b/src/marqo/s2_inference/multimodal_model_load.py @@ -170,7 +170,6 @@ def infer_modality(content: Union[str, List[str], bytes], media_download_headers return Modality.VIDEO elif extension in ['mp3', 'wav', 'ogg']: return Modality.AUDIO - if validate_url(encoded_url): # Use context manager to handle content sample try: diff --git a/tests/s2_inference/test_vectorise.py b/tests/s2_inference/test_vectorise.py index 6e51446b0..5ccd1bde4 100644 --- a/tests/s2_inference/test_vectorise.py +++ b/tests/s2_inference/test_vectorise.py @@ -240,7 +240,8 @@ def test_vectorise_single_content_item(self): result = s2_inference.vectorise(model_name='mock_model', content=single_content, model_properties=self.mock_model_props, device="cpu") - self.mock_model.encode.assert_called_once_with(single_content, normalize=True, modality=Modality.TEXT) + self.mock_model.encode.assert_called_once_with(single_content, normalize=True, modality=Modality.TEXT, + media_download_headers=None) self.assertIsInstance(result, list) self.assertEqual(len(result), 1) diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py index 4a9238a66..2729ad830 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_combined.py +++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py @@ -1128,7 +1128,8 @@ def test_add_private_images_proper_error_returned(self): def test_add_private_images_success(self): """Test to ensure that private images can be downloaded with proper headers""" - test_indexes = [self.structured_marqo_index_name, self.unstructured_marqo_index_name] + # test_indexes = [self.structured_marqo_index_name, self.unstructured_marqo_index_name] + test_indexes = [self.unstructured_marqo_index_name, ] documents = [ { "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png", diff --git a/tests/tensor_search/integ_tests/test_embed.py b/tests/tensor_search/integ_tests/test_embed.py index 77ba44f98..1e393ad69 100644 --- a/tests/tensor_search/integ_tests/test_embed.py +++ 
b/tests/tensor_search/integ_tests/test_embed.py @@ -549,7 +549,7 @@ def run(): marqo_config=self.config, index_name=index.name, embedding_request=EmbedRequest( content=[image_url], - media_download_headers={"Authorization": "my secret key"}, + mediaDownloadHeaders={"Authorization": "my secret key"}, modelAuth=ModelAuth(s3=S3Auth( aws_access_key_id='12345', aws_secret_access_key='this-is-a-secret')) From 2cc4622332be2da095f3f7d16aba38ffbf0f90b4 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Thu, 24 Oct 2024 22:12:43 +1100 Subject: [PATCH 16/29] Add language bind modality tests --- .../core/vespa_index/add_documents_handler.py | 1 - tests/marqo_test.py | 15 +++ .../test_add_documents_combined.py | 127 ++++++++++++++++-- .../integ_tests/test_search_combined.py | 102 +++++++++++++- 4 files changed, 230 insertions(+), 15 deletions(-) diff --git a/src/marqo/core/vespa_index/add_documents_handler.py b/src/marqo/core/vespa_index/add_documents_handler.py index b181d35b7..8133abd4d 100644 --- a/src/marqo/core/vespa_index/add_documents_handler.py +++ b/src/marqo/core/vespa_index/add_documents_handler.py @@ -421,4 +421,3 @@ def _field_type_chunker_map(self, media_repo): FieldType.VideoPointer: AudioVideoChunker(media_repo=media_repo), } return chunkers - diff --git a/tests/marqo_test.py b/tests/marqo_test.py index 8d66c2a86..25edff9d7 100644 --- a/tests/marqo_test.py +++ b/tests/marqo_test.py @@ -36,6 +36,21 @@ class TestImageUrls(str, Enum): HIPPO_STATUE = 'https://raw.githubusercontent.com/marqo-ai/marqo-api-tests/mainline/assets/ai_hippo_statue_small.png' +class TestAudioUrls(str, Enum): + __test__ = False + AUDIO1 = "https://marqo-ecs-50-audio-test-dataset.s3.us-east-1.amazonaws.com/audios/1-100032-A-0.wav" + AUDIO2 = "https://marqo-ecs-50-audio-test-dataset.s3.us-east-1.amazonaws.com/audios/1-115545-C-48.wav" + AUDIO3 = "https://marqo-ecs-50-audio-test-dataset.s3.us-east-1.amazonaws.com/audios/1-119125-A-45.wav" + + +class TestVideoUrls(str, Enum): + __test__ = False + VIDEO1 = "https://marqo-k400-video-test-dataset.s3.us-east-1.amazonaws.com/videos/--_S9IDQPLg_000135_000145.mp4" + VIDEO2 = "https://marqo-k400-video-test-dataset.s3.us-east-1.amazonaws.com/videos/---QUuC4vJs_000084_000094.mp4" + VIDEO3 = "https://marqo-k400-video-test-dataset.s3.us-east-1.amazonaws.com/videos/--mI_-gaZLk_000018_000028.mp4" + + + class MarqoTestCase(unittest.TestCase): indexes = [] diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py index 8a9c7d6da..e7cc77ff2 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_combined.py +++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py @@ -1,25 +1,18 @@ import os import unittest.mock +import unittest.mock import uuid from unittest import mock from unittest.mock import patch import PIL -import numpy as np - import numpy as np import pytest - - -import PIL import requests import torch -from more_itertools import flatten -from numpy.ma.core import subtract from torch import Tensor -import unittest.mock - +from marqo.core.models.add_docs_params import AddDocsParams, BatchVectorisationMode from marqo.core.models.marqo_get_documents_by_id_response import MarqoGetDocumentsByIdsResponse from marqo.core.models.marqo_index import * from marqo.core.models.marqo_index_request import FieldRequest @@ -28,10 +21,7 @@ from marqo.tensor_search import add_docs from marqo.tensor_search import streaming_media_processor from marqo.tensor_search import tensor_search -from 
marqo.core.models.add_docs_params import AddDocsParams, BatchVectorisationMode -from tests.marqo_test import MarqoTestCase, TestImageUrls -from marqo.s2_inference.multimodal_model_load import infer_modality -from marqo.tensor_search import streaming_media_processor +from tests.marqo_test import MarqoTestCase, TestImageUrls, TestAudioUrls, TestVideoUrls class TestAddDocumentsCombined(MarqoTestCase): @@ -1174,4 +1164,115 @@ def test_add_private_images_success(self): mappings=mappings ) ) + self.assertFalse(res.errors) + + + + +@pytest.mark.largemodel +class TestLanguageBindModelAddDocumentCombined(MarqoTestCase): + """A class to test the add_documents with the LanguageBind model.""" + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + + structured_language_bind_index = cls.structured_marqo_index_request( + name="structured_image_index" + str(uuid.uuid4()).replace('-', ''), + fields=[ + FieldRequest(name="text_field_1", type=FieldType.Text, + features=[FieldFeature.Filter, FieldFeature.LexicalSearch]), + FieldRequest(name="image_field_1", type=FieldType.ImagePointer), + FieldRequest(name="audio_field_1", type=FieldType.AudioPointer), + FieldRequest(name="video_field_1", type=FieldType.VideoPointer), + FieldRequest( + name="multimodal_field", + type=FieldType.MultimodalCombination, + dependent_fields={ + "image_field_1": 1.0, + "text_field_1": 1.0, + "audio_field_1": 1.0, + "video_field_1": 1.0, + } + ) + ], + model=Model(name="LanguageBind/Video_V1.5_FT_Audio_FT_Image"), + tensor_fields=["text_field_1", "image_field_1", "audio_field_1", "video_field_1", "multimodal_field"], + ) + + unstructured_language_bind_index = cls.unstructured_marqo_index_request( + name="unstructured_image_index" + str(uuid.uuid4()).replace('-', ''), + model=Model(name="LanguageBind/Video_V1.5_FT_Audio_FT_Image"), + treat_urls_and_pointers_as_images=True, + treat_urls_and_pointers_as_media=True + ) + + cls.indexes = cls.create_indexes([structured_language_bind_index, unstructured_language_bind_index]) + + cls.structured_language_bind_index_name = structured_language_bind_index.name + cls.unstructured_language_bind_index_name = unstructured_language_bind_index.name + + s2_inference.clear_loaded_models() + + @classmethod + def tearDownClass(cls) -> None: + super().tearDownClass() + s2_inference.clear_loaded_models() + + def test_language_bind_model_can_add_all_media_modalities(self): + """Test to ensure that the LanguageBind model can add all media types to the index""" + documents = [ + { + "text_field_1": "This is a test text", + "image_field_1": TestImageUrls.IMAGE1.value, + "audio_field_1": TestAudioUrls.AUDIO1.value, + "video_field_1": TestVideoUrls.VIDEO1.value, + "_id": "1" + } + ] + for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: + tensor_fields = ["text_field_1", "image_field_1", "audio_field_1", "video_field_1", "multimodal_field"] \ + if index_name == self.unstructured_language_bind_index_name else None + with self.subTest(index_name): + res = tensor_search.add_documents( + self.config, + add_docs_params=AddDocsParams( + docs=documents, + index_name=index_name, + tensor_fields=tensor_fields + ) + ) + self.assertFalse(res.errors) + + def test_language_bind_model_can_add_all_private_media_modalities(self): + documents = [ + { # With extensions + "text_field_1": "This is a test text", + "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png", + "audio_field_1": 
"https://d2k91vq0avo7lq.cloudfront.net/bark.wav", + "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress.mp4", + "_id": "1" + }, + { + # No extensions + "text_field_1": "This is a test text", + "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", + "audio_field_1": "https://d2k91vq0avo7lq.cloudfront.net/bark", + "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress", + "_id": "1" + } + ] + for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: + tensor_fields = ["text_field_1", "image_field_1", "audio_field_1", "video_field_1", "multimodal_field"] \ + if index_name == self.unstructured_language_bind_index_name else None + with self.subTest(index_name): + res = tensor_search.add_documents( + self.config, + add_docs_params=AddDocsParams( + docs=documents, + index_name=index_name, + tensor_fields=tensor_fields, + media_download_headers={"marqo_media_header": "media_header_test_key"} + ) + ) self.assertFalse(res.errors) \ No newline at end of file diff --git a/tests/tensor_search/integ_tests/test_search_combined.py b/tests/tensor_search/integ_tests/test_search_combined.py index 599a63640..285e04ef0 100644 --- a/tests/tensor_search/integ_tests/test_search_combined.py +++ b/tests/tensor_search/integ_tests/test_search_combined.py @@ -18,7 +18,7 @@ from marqo.tensor_search import tensor_search from marqo.tensor_search.enums import SearchMethod from marqo.tensor_search.models.api_models import SearchQuery -from tests.marqo_test import MarqoTestCase, TestImageUrls +from tests.marqo_test import MarqoTestCase, TestImageUrls, TestAudioUrls, TestVideoUrls class TestSearch(MarqoTestCase): @@ -1057,3 +1057,103 @@ def test_lexical_search_DoesNotErrorWithEscapedQuotes(self): ) self.assertEqual(len(expected_ids), len(res['hits'])) self.assertEqual(set(expected_ids), {hit['_id'] for hit in res['hits']}) + + +@pytest.mark.largemodel +class TestLanguageBindModelAddDocumentCombined(MarqoTestCase): + """A class to test the search with the LanguageBind model.""" + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + + structured_language_bind_index = cls.structured_marqo_index_request( + name="structured_image_index" + str(uuid.uuid4()).replace('-', ''), + fields=[ + FieldRequest(name="text_field_1", type=FieldType.Text, + features=[FieldFeature.Filter, FieldFeature.LexicalSearch]), + FieldRequest(name="image_field_1", type=FieldType.ImagePointer), + FieldRequest(name="audio_field_1", type=FieldType.AudioPointer), + FieldRequest(name="video_field_1", type=FieldType.VideoPointer), + FieldRequest( + name="multimodal_field", + type=FieldType.MultimodalCombination, + dependent_fields={ + "image_field_1": 1.0, + "text_field_1": 1.0, + "audio_field_1": 1.0, + "video_field_1": 1.0, + } + ) + ], + model=Model(name="LanguageBind/Video_V1.5_FT_Audio_FT_Image"), + tensor_fields=["text_field_1", "image_field_1", "audio_field_1", "video_field_1", "multimodal_field"], + ) + + unstructured_language_bind_index = cls.unstructured_marqo_index_request( + name="unstructured_image_index" + str(uuid.uuid4()).replace('-', ''), + model=Model(name="LanguageBind/Video_V1.5_FT_Audio_FT_Image"), + treat_urls_and_pointers_as_images=True, + treat_urls_and_pointers_as_media=True + ) + + cls.indexes = cls.create_indexes([structured_language_bind_index, unstructured_language_bind_index]) + + cls.structured_language_bind_index_name = structured_language_bind_index.name + cls.unstructured_language_bind_index_name = 
unstructured_language_bind_index.name + + s2_inference.clear_loaded_models() + + @classmethod + def tearDownClass(cls) -> None: + super().tearDownClass() + s2_inference.clear_loaded_models() + + def test_language_bind_model_can_search_all_media_modalities(self): + """Test to ensure that the LanguageBind model can search all media types to the index""" + queries = [ + "This is a test text", + TestImageUrls.IMAGE1.value, + TestAudioUrls.AUDIO1.value, + TestVideoUrls.VIDEO1.value, + { + "This is a test text": 1, + TestImageUrls.IMAGE1.value: 1, + TestAudioUrls.AUDIO1.value: 1, + TestVideoUrls.VIDEO1.value: 1 + } + ] + for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: + for query in queries: + with self.subTest(index_name): + _ = tensor_search.search( + config = self.config, + index_name=index_name, + text=query, + search_method=SearchMethod.LEXICAL + ) + + def test_language_bind_model_can_search_all_private_media_modalities(self): + """A test to ensure that the LanguageBind model can search all private media types to the index""" + queries = [ + "This is a test text", + "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png", + "https://d2k91vq0avo7lq.cloudfront.net/bark.wav", + "https://d2k91vq0avo7lq.cloudfront.net/congress.mp4", + { + "This is a test text": 1, + "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png": 1, + "https://d2k91vq0avo7lq.cloudfront.net/bark.wav": 1, + "https://d2k91vq0avo7lq.cloudfront.net/congress.mp4": 1 + } + ] + for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: + for query in queries: + with self.subTest(index_name): + _ = tensor_search.search( + config = self.config, + index_name=index_name, + text=query, + search_method=SearchMethod.LEXICAL, + media_download_headers={"marqo_media_header": "media_header_test_key"} + ) \ No newline at end of file From bc35efbd1aec29043970185f24046b11812870b0 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 10:15:59 +1100 Subject: [PATCH 17/29] Fix tests --- tests/tensor_search/integ_tests/test_search_combined.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/tensor_search/integ_tests/test_search_combined.py b/tests/tensor_search/integ_tests/test_search_combined.py index 285e04ef0..f7e826860 100644 --- a/tests/tensor_search/integ_tests/test_search_combined.py +++ b/tests/tensor_search/integ_tests/test_search_combined.py @@ -1060,7 +1060,7 @@ def test_lexical_search_DoesNotErrorWithEscapedQuotes(self): @pytest.mark.largemodel -class TestLanguageBindModelAddDocumentCombined(MarqoTestCase): +class TestLanguageBindModelSearchCombined(MarqoTestCase): """A class to test the search with the LanguageBind model.""" @classmethod @@ -1130,7 +1130,7 @@ def test_language_bind_model_can_search_all_media_modalities(self): config = self.config, index_name=index_name, text=query, - search_method=SearchMethod.LEXICAL + search_method=SearchMethod.TENSOR ) def test_language_bind_model_can_search_all_private_media_modalities(self): @@ -1154,6 +1154,6 @@ def test_language_bind_model_can_search_all_private_media_modalities(self): config = self.config, index_name=index_name, text=query, - search_method=SearchMethod.LEXICAL, + search_method=SearchMethod.TENSOR, media_download_headers={"marqo_media_header": "media_header_test_key"} ) \ No newline at end of file From 674bb17a3f7486750ed3d07855e4c669a34b4223 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 10:56:02 
+1100 Subject: [PATCH 18/29] Fix headers for media --- src/marqo/tensor_search/add_docs.py | 21 ++++++++++++------- .../streaming_media_processor.py | 16 ++++++++++---- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/marqo/tensor_search/add_docs.py b/src/marqo/tensor_search/add_docs.py index 1643c432d..9906075a7 100644 --- a/src/marqo/tensor_search/add_docs.py +++ b/src/marqo/tensor_search/add_docs.py @@ -165,9 +165,12 @@ def threaded_download_and_preprocess_content(allocated_docs: List[dict], continue try: - processed_chunks = download_and_chunk_media(doc[field], device, media_download_headers, inferred_modality, - marqo_index_type, marqo_index_model, preprocessors, - audio_preprocessing, video_preprocessing) + processed_chunks = download_and_chunk_media( + url=doc[field], device=device, modality=inferred_modality, + marqo_index_type=marqo_index_type, marqo_index_model=marqo_index_model, + preprocessors=preprocessors, audio_preprocessing=audio_preprocessing, + video_preprocessing=video_preprocessing, media_download_headers=media_download_headers + ) media_repo[doc[field]] = processed_chunks except (ffmpeg.Error, S2InferenceError) as e: logger.error(f"Error processing {inferred_modality} file: {str(e)}") @@ -197,13 +200,17 @@ def threaded_download_and_preprocess_content(allocated_docs: List[dict], continue -def download_and_chunk_media(url: str, device: str, headers: dict, modality: Modality, marqo_index_type: IndexType, marqo_index_model: Model, +def download_and_chunk_media(url: str, device: str, modality: Modality, marqo_index_type: IndexType, marqo_index_model: Model, preprocessors: Preprocessors, audio_preprocessing: AudioPreProcessing = None, - video_preprocessing: VideoPreProcessing = None) -> List[Dict[str, torch.Tensor]]: + video_preprocessing: VideoPreProcessing = None, + media_download_headers: Optional[Dict] = None) -> List[Dict[str, torch.Tensor]]: MAX_FILE_SIZE = 100 * 1024 * 1024 # 100 MB in bytes - processor = StreamingMediaProcessor(url, device, headers, modality, marqo_index_type, marqo_index_model, preprocessors, - audio_preprocessing, video_preprocessing) + processor = StreamingMediaProcessor( + url=url, device=device, modality=modality, marqo_index_type=marqo_index_type, marqo_index_model=marqo_index_model, + preprocessors=preprocessors, audio_preprocessing=audio_preprocessing, video_preprocessing=video_preprocessing, + media_download_headers=media_download_headers + ) if processor.total_size > MAX_FILE_SIZE: raise ValueError( diff --git a/src/marqo/tensor_search/streaming_media_processor.py b/src/marqo/tensor_search/streaming_media_processor.py index 72b75de3c..a972739d7 100644 --- a/src/marqo/tensor_search/streaming_media_processor.py +++ b/src/marqo/tensor_search/streaming_media_processor.py @@ -18,12 +18,11 @@ class StreamingMediaProcessor: - def __init__(self, url: str, device: str, headers: Dict[str, str], modality: Modality, marqo_index_type: IndexType, + def __init__(self, url: str, device: str, modality: Modality, marqo_index_type: IndexType, marqo_index_model: Model, preprocessors: Preprocessors, audio_preprocessing: AudioPreProcessing = None, - video_preprocessing: VideoPreProcessing = None): + video_preprocessing: VideoPreProcessing = None, media_download_headers: Optional[Dict[str, str] ]= None): self.url = url self.device = device - self.headers = headers self.modality = modality self.marqo_index_type = marqo_index_type self.marqo_index_model = marqo_index_model @@ -33,6 +32,10 @@ def __init__(self, url: str, device: str, headers: 
Dict[str, str], modality: Mod self.preprocessor = self.preprocessors[modality] self.total_size, self.duration = self._fetch_file_metadata() + if media_download_headers is None: + media_download_headers = {} + self.media_download_headers = media_download_headers + self._set_split_parameters(modality) self._log_initialization_details() @@ -67,6 +70,8 @@ def _fetch_file_metadata(self): 'probesize': '256K' # Probe only the first 256KB } + probe_options.update(self.media_download_headers) + probe = ffmpeg.probe(self.url, **probe_options) size = int(probe['format'].get('size', 0)) @@ -105,7 +110,10 @@ def process_media(self) -> List[Dict[str, torch.Tensor]]: try: # Use ffmpeg-python to process the chunk - stream = ffmpeg.input(self.url, ss=chunk_start, t=chunk_end - chunk_start) + stream = ffmpeg.input( + self.url, ss=chunk_start, t=chunk_end - chunk_start, + headers=self.media_download_headers + ) if self.modality == Modality.VIDEO: stream = ffmpeg.output(stream, output_file, vcodec='libx264', acodec='aac', **{'f': 'mp4'}) From 27da146e516d00d389993044e8425221f87fbc74 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 10:58:35 +1100 Subject: [PATCH 19/29] Fix headers for media --- src/marqo/tensor_search/streaming_media_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/marqo/tensor_search/streaming_media_processor.py b/src/marqo/tensor_search/streaming_media_processor.py index a972739d7..2aa1f4c7f 100644 --- a/src/marqo/tensor_search/streaming_media_processor.py +++ b/src/marqo/tensor_search/streaming_media_processor.py @@ -30,12 +30,12 @@ def __init__(self, url: str, device: str, modality: Modality, marqo_index_type: self.video_preprocessing = video_preprocessing self.preprocessors = preprocessors self.preprocessor = self.preprocessors[modality] - self.total_size, self.duration = self._fetch_file_metadata() - if media_download_headers is None: media_download_headers = {} self.media_download_headers = media_download_headers + self.total_size, self.duration = self._fetch_file_metadata() + self._set_split_parameters(modality) self._log_initialization_details() From c37fe9e1e188f8d97335ceb079ba23ad839e15da Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 10:59:56 +1100 Subject: [PATCH 20/29] Fix headers for media --- src/marqo/tensor_search/streaming_media_processor.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/marqo/tensor_search/streaming_media_processor.py b/src/marqo/tensor_search/streaming_media_processor.py index 2aa1f4c7f..786ccbd52 100644 --- a/src/marqo/tensor_search/streaming_media_processor.py +++ b/src/marqo/tensor_search/streaming_media_processor.py @@ -20,7 +20,7 @@ class StreamingMediaProcessor: def __init__(self, url: str, device: str, modality: Modality, marqo_index_type: IndexType, marqo_index_model: Model, preprocessors: Preprocessors, audio_preprocessing: AudioPreProcessing = None, - video_preprocessing: VideoPreProcessing = None, media_download_headers: Optional[Dict[str, str] ]= None): + video_preprocessing: VideoPreProcessing = None, media_download_headers: Optional[Dict[str, str]]= None): self.url = url self.device = device self.modality = modality @@ -67,11 +67,10 @@ def _fetch_file_metadata(self): 'v': 'error', 'show_entries': 'format=size,duration', 'of': 'json', - 'probesize': '256K' # Probe only the first 256KB + 'probesize': '256K', # Probe only the first 256KB + 'headers': self.media_download_headers } - probe_options.update(self.media_download_headers) - probe = 
ffmpeg.probe(self.url, **probe_options) size = int(probe['format'].get('size', 0)) From a35b1c90d9a760b9379109f126d42f62683b6be3 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 11:14:35 +1100 Subject: [PATCH 21/29] Fix media download headers for video and audio --- .../streaming_media_processor.py | 39 +++++++++++++++---- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/src/marqo/tensor_search/streaming_media_processor.py b/src/marqo/tensor_search/streaming_media_processor.py index 786ccbd52..56d285637 100644 --- a/src/marqo/tensor_search/streaming_media_processor.py +++ b/src/marqo/tensor_search/streaming_media_processor.py @@ -15,6 +15,7 @@ from marqo.core.models.marqo_index import * from marqo.s2_inference.multimodal_model_load import Modality from marqo.tensor_search.models.preprocessors_model import Preprocessors +from marqo.core.exceptions import InternalError class StreamingMediaProcessor: @@ -30,9 +31,7 @@ def __init__(self, url: str, device: str, modality: Modality, marqo_index_type: self.video_preprocessing = video_preprocessing self.preprocessors = preprocessors self.preprocessor = self.preprocessors[modality] - if media_download_headers is None: - media_download_headers = {} - self.media_download_headers = media_download_headers + self.media_download_headers = self._convert_headers_to_cli_format(media_download_headers) self.total_size, self.duration = self._fetch_file_metadata() @@ -59,6 +58,25 @@ def _log_initialization_details(self): # print(f"from StreamingMediaProcessor, self.duration: {self.duration}") pass + def _convert_headers_to_cli_format(self, raw_media_download_headers: Optional[Dict] = None) -> str: + """ + A helper function to convert the media download headers into a format that can be passed to ffmpeg in + subprocess calls. + + Examples: + If the headers are {"key1": "value1", "key2": "value2"}, the function will return a string + "key1: value1\r\nkey2: value2" + + Returns: + str: The headers in the required format. An empty string if no headers or None are provided. 
+ """ + if raw_media_download_headers is None or raw_media_download_headers == {}: + return "" + elif not isinstance(raw_media_download_headers, dict): + raise InternalError("media_download_headers should be a dictionary") + return "\r\n".join([f"{key}: {value}" for key, value in raw_media_download_headers.items()]) + + def _fetch_file_metadata(self): start_time = time.time() @@ -68,9 +86,11 @@ def _fetch_file_metadata(self): 'show_entries': 'format=size,duration', 'of': 'json', 'probesize': '256K', # Probe only the first 256KB - 'headers': self.media_download_headers } + if self.media_download_headers: + probe_options['headers'] = self.media_download_headers + probe = ffmpeg.probe(self.url, **probe_options) size = int(probe['format'].get('size', 0)) @@ -109,10 +129,13 @@ def process_media(self) -> List[Dict[str, torch.Tensor]]: try: # Use ffmpeg-python to process the chunk - stream = ffmpeg.input( - self.url, ss=chunk_start, t=chunk_end - chunk_start, - headers=self.media_download_headers - ) + if self.media_download_headers: + stream = ffmpeg.input( + self.url, ss=chunk_start, t=chunk_end - chunk_start, + headers=self.media_download_headers + ) + else: + stream = ffmpeg.input(self.url, ss=chunk_start, t=chunk_end - chunk_start) if self.modality == Modality.VIDEO: stream = ffmpeg.output(stream, output_file, vcodec='libx264', acodec='aac', **{'f': 'mp4'}) From 532cc4ef0f093f6954239f4603708a3861dd8c93 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 11:20:20 +1100 Subject: [PATCH 22/29] Fix tests --- tests/tensor_search/integ_tests/test_add_documents_combined.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py index e7cc77ff2..526ca2c9c 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_combined.py +++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py @@ -1259,7 +1259,7 @@ def test_language_bind_model_can_add_all_private_media_modalities(self): "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", "audio_field_1": "https://d2k91vq0avo7lq.cloudfront.net/bark", "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress", - "_id": "1" + "_id": "" } ] for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: From f85ee8b72738915d928217b04b4c6542f059e220 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 11:48:57 +1100 Subject: [PATCH 23/29] Convert image to RGB for languagebind --- .../languagebind/image/processing_image.py | 5 ++++ .../test_add_documents_combined.py | 27 ++++++++++--------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/marqo/s2_inference/languagebind/image/processing_image.py b/src/marqo/s2_inference/languagebind/image/processing_image.py index 7a3d7c396..90f80b155 100644 --- a/src/marqo/s2_inference/languagebind/image/processing_image.py +++ b/src/marqo/s2_inference/languagebind/image/processing_image.py @@ -13,10 +13,15 @@ def make_list_of_images(x): return x +def _convert_to_rgb(image): + return image.convert("RGB") + + def get_image_transform(config): config = config.vision_config transform = transforms.Compose( [ + _convert_to_rgb, transforms.ToTensor(), transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC), transforms.CenterCrop(224), diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py 
b/tests/tensor_search/integ_tests/test_add_documents_combined.py index 526ca2c9c..b5498b67f 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_combined.py +++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py @@ -1247,22 +1247,23 @@ def test_language_bind_model_can_add_all_media_modalities(self): def test_language_bind_model_can_add_all_private_media_modalities(self): documents = [ { # With extensions - "text_field_1": "This is a test text", + #"text_field_1": "This is a test text", "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png", - "audio_field_1": "https://d2k91vq0avo7lq.cloudfront.net/bark.wav", - "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress.mp4", - "_id": "1" + # "audio_field_1": "https://d2k91vq0avo7lq.cloudfront.net/bark.wav", + # "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress.mp4", + # "_id": "1" }, - { - # No extensions - "text_field_1": "This is a test text", - "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", - "audio_field_1": "https://d2k91vq0avo7lq.cloudfront.net/bark", - "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress", - "_id": "" - } + # { + # # No extensions + # "text_field_1": "This is a test text", + # "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", + # "audio_field_1": "https://d2k91vq0avo7lq.cloudfront.net/bark", + # "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress", + # "_id": "2" + # } ] - for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: + # for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: + for index_name in [self.structured_language_bind_index_name]: tensor_fields = ["text_field_1", "image_field_1", "audio_field_1", "video_field_1", "multimodal_field"] \ if index_name == self.unstructured_language_bind_index_name else None with self.subTest(index_name): From bace6fde6a53b4f1ef3c2c4833054bfcf669d182 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 12:16:27 +1100 Subject: [PATCH 24/29] Fix tests --- .../s2_inference/multimodal_model_load.py | 2 +- .../test_add_documents_combined.py | 27 +++++++++---------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/marqo/s2_inference/multimodal_model_load.py b/src/marqo/s2_inference/multimodal_model_load.py index ad4bb2506..593c45bf5 100644 --- a/src/marqo/s2_inference/multimodal_model_load.py +++ b/src/marqo/s2_inference/multimodal_model_load.py @@ -292,7 +292,7 @@ def encode(self, content, modality, normalize=True, media_download_headers: Opti # If media has already been preprocessed inputs[modality.value] = to_device(content[0], self.model.device)['pixel_values'] elif isinstance(content[0], str) and 'http' in content[0]: - return self.encode(content[0], modality=modality) + return self.encode(content[0], modality=modality, media_download_headers=media_download_headers) else: raise ValueError(f"Unsupported {modality.value} content type: {type(content)}, content: {content}") diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py index b5498b67f..7201ff4ab 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_combined.py +++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py @@ -1247,23 +1247,22 @@ def test_language_bind_model_can_add_all_media_modalities(self): def 
test_language_bind_model_can_add_all_private_media_modalities(self): documents = [ { # With extensions - #"text_field_1": "This is a test text", + "text_field_1": "This is a test text", "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png", - # "audio_field_1": "https://d2k91vq0avo7lq.cloudfront.net/bark.wav", - # "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress.mp4", - # "_id": "1" + "audio_field_1": "https://d2k91vq0avo7lq.cloudfront.net/bark.wav", + "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress.mp4", + "_id": "1" }, - # { - # # No extensions - # "text_field_1": "This is a test text", - # "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", - # "audio_field_1": "https://d2k91vq0avo7lq.cloudfront.net/bark", - # "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress", - # "_id": "2" - # } + { + # No extensions + "text_field_1": "This is a test text", + "image_field_1": "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", + "audio_field_1": "https://d2k91vq0avo7lq.cloudfront.net/bark", + "video_field_1": "https://d2k91vq0avo7lq.cloudfront.net/congress", + "_id": "2" + } ] - # for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: - for index_name in [self.structured_language_bind_index_name]: + for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: tensor_fields = ["text_field_1", "image_field_1", "audio_field_1", "video_field_1", "multimodal_field"] \ if index_name == self.unstructured_language_bind_index_name else None with self.subTest(index_name): From 72e4b366e634a1b0b16942a53069e4bf1c3dfbaa Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 12:33:38 +1100 Subject: [PATCH 25/29] Delete a test --- .../integ_tests/test_search_combined.py | 46 ------------------- 1 file changed, 46 deletions(-) diff --git a/tests/tensor_search/integ_tests/test_search_combined.py b/tests/tensor_search/integ_tests/test_search_combined.py index f7e826860..fb8fa9d4b 100644 --- a/tests/tensor_search/integ_tests/test_search_combined.py +++ b/tests/tensor_search/integ_tests/test_search_combined.py @@ -1012,52 +1012,6 @@ def test_search_over_private_images_with_media_download_headers(self): media_download_headers={"marqo_media_header": "media_header_test_key"} ) - def test_lexical_search_DoesNotErrorWithEscapedQuotes(self): - """ - Ensure that lexical search handles double quotes properly, both escaped and wrong quotes. - Expected behavior: escaped quotes are passed to vespa. Incorrect quotes are treated like whitespace. 
- """ - - docs_list = [ - {"_id": "doc1", "text_field_1": '1"2'}, - {"_id": "doc2", "text_field_1": 'exact match'}, - {"_id": "doc3", "text_field_1": 'exacto wrong syntax'}, - {"_id": "doc4", "text_field_1": '"escaped"'}, - - {"_id": "red_herring_1", "text_field_1": '12'}, - {"_id": "red_herring_2", "text_field_1": 'escaped'}, - {"_id": "red_herring_3", "text_field_1": 'wrong"'} - ] - test_cases = [ - ('1\\"2', ['doc1']), # Match off of '1"2' - ('"exact match"', ['doc2']), # Match off of 'exact match' - ('\\"escaped\\"', ['doc4', 'red_herring_2']), # Match off of 'escaped' or '"escaped"' - ('"exacto" wrong"', ['doc3']), # Match properly off of 'wrong' - ('""', []), # Single quote should return no results (treated as whitespace) - ('"', []), # Double quote should return no results (treated as whitespace) - ('', []) # Empty string should return no results - ] - - for index in [self.unstructured_default_text_index, self.structured_default_text_index]: - with self.subTest(index=index.type): - tensor_search.add_documents( - config=self.config, - add_docs_params=AddDocsParams( - index_name=index.name, - docs=docs_list, - tensor_fields=["text_field_1"] if isinstance(index, UnstructuredMarqoIndex) else None - ) - ) - - for query, expected_ids in test_cases: - with self.subTest(query=query): - res = tensor_search.search( - text=query, config=self.config, index_name=index.name, - search_method=SearchMethod.LEXICAL - ) - self.assertEqual(len(expected_ids), len(res['hits'])) - self.assertEqual(set(expected_ids), {hit['_id'] for hit in res['hits']}) - @pytest.mark.largemodel class TestLanguageBindModelSearchCombined(MarqoTestCase): From 96e22311d50888f5b61781248e22252f1ced0fe6 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 12:35:03 +1100 Subject: [PATCH 26/29] Add back the test --- .../integ_tests/test_search_combined.py | 195 +++++------------- 1 file changed, 48 insertions(+), 147 deletions(-) diff --git a/tests/tensor_search/integ_tests/test_search_combined.py b/tests/tensor_search/integ_tests/test_search_combined.py index fb8fa9d4b..514a92e99 100644 --- a/tests/tensor_search/integ_tests/test_search_combined.py +++ b/tests/tensor_search/integ_tests/test_search_combined.py @@ -1,24 +1,23 @@ import os import uuid from unittest import mock - -import pytest import torch +import pytest import marqo.core.exceptions as core_exceptions -from marqo import exceptions as base_exceptions -from marqo.core.models.add_docs_params import AddDocsParams from marqo.core.models.marqo_index import * from marqo.core.models.marqo_index_request import FieldRequest +from marqo.tensor_search import tensor_search +from marqo.tensor_search.enums import SearchMethod +from marqo.core.models.add_docs_params import AddDocsParams +from tests.marqo_test import MarqoTestCase, TestImageUrls +from marqo import exceptions as base_exceptions from marqo.core.models.marqo_query import MarqoLexicalQuery from marqo.core.models.score_modifier import ScoreModifierType, ScoreModifier from marqo.core.structured_vespa_index.structured_vespa_index import StructuredVespaIndex from marqo.core.unstructured_vespa_index.unstructured_vespa_index import UnstructuredVespaIndex -from marqo.s2_inference.errors import MediaDownloadError -from marqo.tensor_search import tensor_search -from marqo.tensor_search.enums import SearchMethod from marqo.tensor_search.models.api_models import SearchQuery -from tests.marqo_test import MarqoTestCase, TestImageUrls, TestAudioUrls, TestVideoUrls +from pydantic import ValidationError class 
TestSearch(MarqoTestCase): @@ -205,7 +204,7 @@ def test_search_video(self): documents = [ {"video_field_1": "https://marqo-k400-video-test-dataset.s3.amazonaws.com/videos/---QUuC4vJs_000084_000094.mp4", "_id": "1"}, # Replace the audio link with something marqo-hosted - {"audio_field_1": "https://marqo-ecs-50-audio-test-dataset.s3.amazonaws.com/audios/marqo-audio-test.mp3", "_id": "2"}, + {"audio_field_1": "https://marqo-ecs-50-audio-test-dataset.s3.amazonaws.com/audios/marqo-audio-test.mp3", "_id": "2"}, {"image_field_1": TestImageUrls.HIPPO_REALISTIC_LARGE.value, "_id": "3"}, # {"image_field_1": TestImageUrls.HIPPO_REALISTIC.value, "_id": "5"}, # png image with palette is not supported {"text_field_1": "hello there padawan. Today you will begin your training to be a Jedi", "_id": "4"}, @@ -240,7 +239,7 @@ def test_search_audio(self): documents = [ {"video_field_1": "https://marqo-k400-video-test-dataset.s3.amazonaws.com/videos/---QUuC4vJs_000084_000094.mp4", "_id": "1"}, # Replace the audio link with something marqo-hosted - {"audio_field_1": "https://marqo-ecs-50-audio-test-dataset.s3.amazonaws.com/audios/marqo-audio-test.mp3", "_id": "2"}, + {"audio_field_1": "https://marqo-ecs-50-audio-test-dataset.s3.amazonaws.com/audios/marqo-audio-test.mp3", "_id": "2"}, {"image_field_1": TestImageUrls.HIPPO_REALISTIC_LARGE.value, "_id": "3"}, # {"image_field_1": TestImageUrls.HIPPO_REALISTIC.value, "_id": "5"}, # png file with palette is not supported {"text_field_1": "hello there padawan. Today you will begin your training to be a Jedi", "_id": "4"}, @@ -263,7 +262,7 @@ def test_search_audio(self): index_name=index.name, text="https://marqo-ecs-50-audio-test-dataset.s3.amazonaws.com/audios/marqo-audio-test.mp3" ) - + # Assertions self.assertEqual(len(results['hits']), 3) # 3 documents should be returned (limit=3) self.assertEqual(results['hits'][0]['_id'], "2") # The audio document should be the top result @@ -968,146 +967,48 @@ def test_search_query_CanAcceptDifferentSearchMethods(self): search_query = SearchQuery(q="test") self.assertEqual(SearchMethod.TENSOR, search_query.searchMethod) - def test_search_private_images_proper_error_raised(self): - """Test that search raises a MediaDownloadError when trying to access private images""" - test_indexes = [ - self.unstructured_default_image_index, - self.structured_default_image_index - ] + def test_lexical_search_DoesNotErrorWithEscapedQuotes(self): + """ + Ensure that lexical search handles double quotes properly, both escaped and wrong quotes. + Expected behavior: escaped quotes are passed to vespa. Incorrect quotes are treated like whitespace. 
+ """ - test_queries = [({ - "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png": 1, - "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small": 1 }, "dictionary queries"), - ("https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", "str queries")] - for index_name in test_indexes: - for query, msg in test_queries: - with self.subTest(msg=f"index: {index_name}, query: {msg}"): - with self.assertRaises(MediaDownloadError): - _ = tensor_search.search( - config=self.config, - index_name=index_name.name, - text=query, - search_method=SearchMethod.TENSOR, - ) + docs_list = [ + {"_id": "doc1", "text_field_1": '1"2'}, + {"_id": "doc2", "text_field_1": 'exact match'}, + {"_id": "doc3", "text_field_1": 'exacto wrong syntax'}, + {"_id": "doc4", "text_field_1": '"escaped"'}, - def test_search_over_private_images_with_media_download_headers(self): - """Test that search can use private images with media download headers""" - test_indexes = [ - self.unstructured_default_image_index, - self.structured_default_image_index + {"_id": "red_herring_1", "text_field_1": '12'}, + {"_id": "red_herring_2", "text_field_1": 'escaped'}, + {"_id": "red_herring_3", "text_field_1": 'wrong"'} + ] + test_cases = [ + ('1\\"2', ['doc1']), # Match off of '1"2' + ('"exact match"', ['doc2']), # Match off of 'exact match' + ('\\"escaped\\"', ['doc4', 'red_herring_2']), # Match off of 'escaped' or '"escaped"' + ('"exacto" wrong"', ['doc3']), # Match properly off of 'wrong' + ('""', []), # Single quote should return no results (treated as whitespace) + ('"', []), # Double quote should return no results (treated as whitespace) + ('', []) # Empty string should return no results ] - test_queries = [({ - "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png": 1, - "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small": 1 }, "dictionary queries"), - ("https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small", "str queries")] - for index_name in test_indexes: - for query, msg in test_queries: - with self.subTest(msg=f"index: {index_name}, query: {msg}"): - _ = tensor_search.search( - config=self.config, - index_name=index_name.name, - text=query, - search_method=SearchMethod.TENSOR, - media_download_headers={"marqo_media_header": "media_header_test_key"} + for index in [self.unstructured_default_text_index, self.structured_default_text_index]: + with self.subTest(index=index.type): + tensor_search.add_documents( + config=self.config, + add_docs_params=AddDocsParams( + index_name=index.name, + docs=docs_list, + tensor_fields=["text_field_1"] if isinstance(index, UnstructuredMarqoIndex) else None ) - - -@pytest.mark.largemodel -class TestLanguageBindModelSearchCombined(MarqoTestCase): - """A class to test the search with the LanguageBind model.""" - - @classmethod - def setUpClass(cls) -> None: - super().setUpClass() - - structured_language_bind_index = cls.structured_marqo_index_request( - name="structured_image_index" + str(uuid.uuid4()).replace('-', ''), - fields=[ - FieldRequest(name="text_field_1", type=FieldType.Text, - features=[FieldFeature.Filter, FieldFeature.LexicalSearch]), - FieldRequest(name="image_field_1", type=FieldType.ImagePointer), - FieldRequest(name="audio_field_1", type=FieldType.AudioPointer), - FieldRequest(name="video_field_1", type=FieldType.VideoPointer), - FieldRequest( - name="multimodal_field", - type=FieldType.MultimodalCombination, - dependent_fields={ - "image_field_1": 1.0, - "text_field_1": 1.0, - "audio_field_1": 1.0, - "video_field_1": 
1.0, - } ) - ], - model=Model(name="LanguageBind/Video_V1.5_FT_Audio_FT_Image"), - tensor_fields=["text_field_1", "image_field_1", "audio_field_1", "video_field_1", "multimodal_field"], - ) - unstructured_language_bind_index = cls.unstructured_marqo_index_request( - name="unstructured_image_index" + str(uuid.uuid4()).replace('-', ''), - model=Model(name="LanguageBind/Video_V1.5_FT_Audio_FT_Image"), - treat_urls_and_pointers_as_images=True, - treat_urls_and_pointers_as_media=True - ) - - cls.indexes = cls.create_indexes([structured_language_bind_index, unstructured_language_bind_index]) - - cls.structured_language_bind_index_name = structured_language_bind_index.name - cls.unstructured_language_bind_index_name = unstructured_language_bind_index.name - - s2_inference.clear_loaded_models() - - @classmethod - def tearDownClass(cls) -> None: - super().tearDownClass() - s2_inference.clear_loaded_models() - - def test_language_bind_model_can_search_all_media_modalities(self): - """Test to ensure that the LanguageBind model can search all media types to the index""" - queries = [ - "This is a test text", - TestImageUrls.IMAGE1.value, - TestAudioUrls.AUDIO1.value, - TestVideoUrls.VIDEO1.value, - { - "This is a test text": 1, - TestImageUrls.IMAGE1.value: 1, - TestAudioUrls.AUDIO1.value: 1, - TestVideoUrls.VIDEO1.value: 1 - } - ] - for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: - for query in queries: - with self.subTest(index_name): - _ = tensor_search.search( - config = self.config, - index_name=index_name, - text=query, - search_method=SearchMethod.TENSOR - ) - - def test_language_bind_model_can_search_all_private_media_modalities(self): - """A test to ensure that the LanguageBind model can search all private media types to the index""" - queries = [ - "This is a test text", - "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png", - "https://d2k91vq0avo7lq.cloudfront.net/bark.wav", - "https://d2k91vq0avo7lq.cloudfront.net/congress.mp4", - { - "This is a test text": 1, - "https://d2k91vq0avo7lq.cloudfront.net/ai_hippo_realistic_small.png": 1, - "https://d2k91vq0avo7lq.cloudfront.net/bark.wav": 1, - "https://d2k91vq0avo7lq.cloudfront.net/congress.mp4": 1 - } - ] - for index_name in [self.structured_language_bind_index_name, self.unstructured_language_bind_index_name]: - for query in queries: - with self.subTest(index_name): - _ = tensor_search.search( - config = self.config, - index_name=index_name, - text=query, - search_method=SearchMethod.TENSOR, - media_download_headers={"marqo_media_header": "media_header_test_key"} - ) \ No newline at end of file + for query, expected_ids in test_cases: + with self.subTest(query=query): + res = tensor_search.search( + text=query, config=self.config, index_name=index.name, + search_method=SearchMethod.LEXICAL + ) + self.assertEqual(len(expected_ids), len(res['hits'])) + self.assertEqual(set(expected_ids), {hit['_id'] for hit in res['hits']}) From 2353e6a68daeb5d59a785153a0e4385896f6426b Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 12:56:04 +1100 Subject: [PATCH 27/29] Change largemodel tests logic --- tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 36d1b9617..93d52e8ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,9 +18,9 @@ def pytest_collection_modifyitems(config, items): skip_cpu_only = pytest.mark.skip(reason="skip in --largemodel mode when cpu_only is present") if 
config.getoption("--largemodel"): - # --largemodel given in cli: do not skip largemodel tests, skip cpu_only tests + # --largemodel given in cli: only run tests that have largemodel marker for item in items: - if "cpu_only" in item.keywords: + if "largemodel" not in item.keywords: item.add_marker(skip_cpu_only) else: for item in items: From 573b46907517556b6299e651986d3f49a9c7b8a4 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 13:48:07 +1100 Subject: [PATCH 28/29] Fix tests --- .../tensor_search/test_modalities_download.py | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/tests/tensor_search/test_modalities_download.py b/tests/tensor_search/test_modalities_download.py index b7158b2be..0335d2a48 100644 --- a/tests/tensor_search/test_modalities_download.py +++ b/tests/tensor_search/test_modalities_download.py @@ -1,17 +1,21 @@ import unittest from unittest.mock import Mock, patch, MagicMock -from PIL import UnidentifiedImageError + +import ffmpeg +import pytest import torch -from marqo.s2_inference.errors import UnsupportedModalityError, S2InferenceError -from marqo.tensor_search.add_docs import threaded_download_and_preprocess_content +from PIL import UnidentifiedImageError + from marqo.core.models.marqo_index import IndexType, MarqoIndex, FieldType -from marqo.s2_inference.s2_inference import Modality -from marqo.s2_inference.models.model_type import ModelType -from marqo.tensor_search.telemetry import RequestMetricsStore, RequestMetrics from marqo.s2_inference.errors import MediaDownloadError -import ffmpeg +from marqo.s2_inference.errors import UnsupportedModalityError, S2InferenceError +from marqo.s2_inference.models.model_type import ModelType +from marqo.s2_inference.s2_inference import Modality +from marqo.tensor_search.add_docs import threaded_download_and_preprocess_content +from marqo.tensor_search.telemetry import RequestMetrics +@pytest.mark.unittest class TestThreadedDownloadAndPreprocess(unittest.TestCase): def setUp(self): @@ -230,13 +234,30 @@ def test_video_and_audio_unstructured_index(self, mock_infer_modality, mock_down # Verify that download_and_chunk_media was called twice self.assertEqual(mock_download_and_chunk.call_count, 2) + print(mock_download_and_chunk.call_args_list) # Verify the calls to download_and_chunk_media mock_download_and_chunk.assert_any_call( - self.mock_video_url, "cpu", {}, Modality.VIDEO, self.mock_marqo_index.type, self.mock_marqo_index.model, None, None, None + url=self.mock_video_url, + device='cpu', + modality= Modality.VIDEO, + marqo_index_type = self.mock_marqo_index.type, + marqo_index_model = self.mock_marqo_index.model, + preprocessors = None, + audio_preprocessing = None, + video_preprocessing = None, + media_download_headers = {} ) mock_download_and_chunk.assert_any_call( - self.mock_audio_url, "cpu", {}, Modality.AUDIO, self.mock_marqo_index.type, self.mock_marqo_index.model, None, None, None + url=self.mock_video_url, + device='cpu', + modality= Modality.VIDEO, + marqo_index_type = self.mock_marqo_index.type, + marqo_index_model = self.mock_marqo_index.model, + preprocessors = None, + audio_preprocessing = None, + video_preprocessing = None, + media_download_headers = {} ) @patch("marqo.tensor_search.add_docs.download_and_chunk_media") From a5ea94788cf671f30e917b77d5c404702df3eb53 Mon Sep 17 00:00:00 2001 From: Li Wan Date: Fri, 25 Oct 2024 13:50:37 +1100 Subject: [PATCH 29/29] Fix tests --- .../integ_tests/test_add_documents_combined.py | 6 ++---- 1 file changed, 2 insertions(+), 4 
deletions(-) diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py index 7201ff4ab..64df57273 100644 --- a/tests/tensor_search/integ_tests/test_add_documents_combined.py +++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py @@ -833,13 +833,13 @@ def test_process_media_chunk_calculation(self, mock_temp_dir, mock_ffmpeg): processor = streaming_media_processor.StreamingMediaProcessor( url='http://example.com/video.mp4', device='cpu', - headers={}, modality=streaming_media_processor.Modality.VIDEO, marqo_index_type=IndexType.Unstructured, marqo_index_model=Model(name="test", properties={}), audio_preprocessing=unittest.mock.Mock(), video_preprocessing=unittest.mock.Mock(), - preprocessors={'video': unittest.mock.Mock()} + preprocessors={'video': unittest.mock.Mock()}, + media_download_headers={}, ) # Set arbitrary values @@ -1167,8 +1167,6 @@ def test_add_private_images_success(self): self.assertFalse(res.errors) - - @pytest.mark.largemodel class TestLanguageBindModelAddDocumentCombined(MarqoTestCase): """A class to test the add_documents with the LanguageBind model."""
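
The header plumbing added in PATCH 18 through PATCH 21 reduces to one small idea: a Python dict of HTTP headers has to be flattened into the CRLF-separated "key: value" string that ffmpeg/ffprobe expect for their -headers option, and that string should only be passed when it is non-empty. The sketch below is a minimal, standalone illustration of that idea using the ffmpeg-python package. The function names, URL, and header values are placeholders for illustration, not Marqo's internal API.

from typing import Dict, Optional

import ffmpeg  # ffmpeg-python


def headers_to_ffmpeg_format(headers: Optional[Dict[str, str]] = None) -> str:
    """Flatten a headers dict into the CRLF-separated string ffmpeg expects for -headers."""
    if not headers:
        return ""
    return "\r\n".join(f"{key}: {value}" for key, value in headers.items())


def probe_media(url: str, headers: Optional[Dict[str, str]] = None) -> dict:
    """Probe size/duration of a remote media file, forwarding auth headers only when present."""
    probe_options = {
        "v": "error",
        "show_entries": "format=size,duration",
        "of": "json",
        "probesize": "256K",  # probe only the first 256 KB
    }
    header_string = headers_to_ffmpeg_format(headers)
    if header_string:
        probe_options["headers"] = header_string
    return ffmpeg.probe(url, **probe_options)


def trim_chunk(url: str, start: float, duration: float, output_file: str,
               headers: Optional[Dict[str, str]] = None) -> None:
    """Cut one chunk out of a remote video, forwarding headers to the HTTP demuxer when given."""
    header_string = headers_to_ffmpeg_format(headers)
    if header_string:
        stream = ffmpeg.input(url, ss=start, t=duration, headers=header_string)
    else:
        stream = ffmpeg.input(url, ss=start, t=duration)
    stream = ffmpeg.output(stream, output_file, vcodec="libx264", acodec="aac", f="mp4")
    ffmpeg.run(stream, overwrite_output=True, quiet=True)


if __name__ == "__main__":
    # Placeholder URL and header value; a real call needs a reachable media file.
    info = probe_media(
        "https://example.com/private/video.mp4",
        headers={"marqo_media_header": "media_header_test_key"},
    )
    print(info["format"].get("size"), info["format"].get("duration"))

Passing the headers only when the flattened string is non-empty mirrors the guard introduced in PATCH 21, which avoids handing ffmpeg an empty -headers argument when public URLs are processed without authentication.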