Stateless model auth #460

Merged
60 commits merged into mainline from pandu/model-auth-stateless on May 11, 2023
Changes from 58 commits

Commits (60)
9d07322
added model auth
pandu-k May 3, 2023
d5fe616
Added objects for structure
pandu-k May 4, 2023
7993dee
introducing AddDocsParams object for addDocuments calls
pandu-k May 5, 2023
d28da08
added pydantic plugin advice to dev guide
pandu-k May 5, 2023
187439a
refactored test_add_docs to use AddDocsParams class
pandu-k May 5, 2023
5df5c4b
updated tensor search docstring
pandu-k May 5, 2023
4c723ce
refactored tests to work with new AddDocsParams object
pandu-k May 5, 2023
d0f2ce1
Commit for refactoring done to test_add_documents_use_existing_tensor…
pandu-k May 5, 2023
be4b7c6
refactored delete_documents to use the add_documents wrapper
pandu-k May 5, 2023
e7d0d75
added transitional add_docs wrapper
pandu-k May 5, 2023
91ce1b4
added transition add docs wrappers to add_documents calls in these te…
pandu-k May 5, 2023
efade22
added transition add docs wrappers to add_documents calls in these te…
pandu-k May 5, 2023
c37bac8
test updated to use new add_docs param object
pandu-k May 5, 2023
79f998d
cleaning up tests with issues
pandu-k May 5, 2023
a8aabd1
made progress integrating s3
pandu-k May 7, 2023
d9d1e1a
add docs, s3 works
pandu-k May 7, 2023
aea82b2
search seems to work (it at least passes through the auth info OK)
pandu-k May 7, 2023
ba37d88
added boto to reqs add hf skeleton funcs
pandu-k May 8, 2023
120c60d
Merge branch 'mainline' into pandu/model-auth-stateless
pandu-k May 8, 2023
c939927
added changes
pandu-k May 8, 2023
d8dca7d
fixed test_bulk_search_different_models_separate_vectorise_calls
pandu-k May 8, 2023
729b189
created test_model_auth_s3 ()
pandu-k May 8, 2023
68a5e76
test_model_auth_s3() asserts boto3 client instantiation
pandu-k May 8, 2023
7965439
made assertions about the presence of the model file
pandu-k May 8, 2023
85a368c
added hf loading tests
pandu-k May 8, 2023
926269d
added tests for hf, s3 search
pandu-k May 8, 2023
5142612
added test for s3/hf mismatch
pandu-k May 8, 2023
a2f9f0f
test refactor
pandu-k May 8, 2023
5e40a76
added cuda test
pandu-k May 8, 2023
a9a15e9
add_docs parses model auth str
pandu-k May 9, 2023
16d7156
corrected add_docs derivatives test
pandu-k May 9, 2023
c4aee67
removed unused vectorise params
pandu-k May 9, 2023
1f6c6a6
fixed bug, not passing through auth in multimodal_combination
pandu-k May 9, 2023
f026d88
added test for no creds
pandu-k May 9, 2023
f027be9
added test_bad_creds_error_s3
pandu-k May 9, 2023
995c028
test for access to non existent hf repo
pandu-k May 9, 2023
81b586e
Added test_after_downloading_search_doesnt_redownload
pandu-k May 9, 2023
98c42ad
fixed checking loaded models
pandu-k May 9, 2023
f556476
added from_s3 tests
pandu-k May 9, 2023
5124da1
Merge branch 'mainline' into pandu/model-auth-stateless
pandu-k May 9, 2023
355ce27
added from_hf tests
pandu-k May 9, 2023
4fbf9d0
added test_custom_clip_utils.py
pandu-k May 9, 2023
b0999f7
added call to search before add docs parallel
pandu-k May 9, 2023
8ecf5f5
Added search call before parallel add_docs
pandu-k May 9, 2023
79b9ccc
fixed casing issue in SearchQuery
pandu-k May 10, 2023
23ec8b6
fixed tests
pandu-k May 10, 2023
0750c85
completed custom clip utils test
pandu-k May 10, 2023
bc539ad
updated version
pandu-k May 10, 2023
ac628cd
improved error msg, added bulk search tests
pandu-k May 10, 2023
920f4e3
made custom clip tests stricter
pandu-k May 10, 2023
8a8e864
added device to model auth cuda setup
pandu-k May 10, 2023
aeb7415
added tests for CLIP._download_from_repo
pandu-k May 10, 2023
53f53b9
added CLIP.load() tests
pandu-k May 10, 2023
fcdd9b7
added OPEN_CLIP.load() tests
pandu-k May 10, 2023
41f30ef
fixed private model and test
pandu-k May 10, 2023
143238a
corrected HF auth and location inheritance
pandu-k May 10, 2023
7b13dea
removed test_put_documents_orchestrator() as put_documents is deprecated
pandu-k May 10, 2023
81bbcf1
Merge branch 'mainline' into pandu/model-auth-stateless
pandu-k May 11, 2023
fb878c9
corrected version
pandu-k May 11, 2023
f381068
Merge remote-tracking branch 'origin/pandu/model-auth-stateless' into…
pandu-k May 11, 2023
2 changes: 2 additions & 0 deletions requirements.txt
@@ -9,6 +9,8 @@ pytest
tox
# s2_inference:
more_itertools
boto3==1.25.4
botocore==1.28.4
nltk==3.7
torch==1.12.1
torchvision==0.13.1
5 changes: 5 additions & 0 deletions src/marqo/README.md
@@ -237,3 +237,8 @@ curl http://localhost:8882/openapi.json
```
To get the human readable spec, visit `http://localhost:8882/docs`

## IDE tips

### PyCharm
Pydantic dataclasses are used in this project. By default, PyCharm can't parse initialisations of these dataclasses.
[This plugin](https://plugins.jetbrains.com/plugin/12861-pydantic) can help.
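
For illustration, a minimal sketch (not part of this diff) of the kind of pydantic dataclass initialisation PyCharm struggles to resolve without the plugin; `ExampleDocParams` is a hypothetical class, not one from the Marqo codebase:

```python
from pydantic.dataclasses import dataclass

@dataclass
class ExampleDocParams:
    index_name: str
    auto_refresh: bool = True

# Pydantic generates __init__ for this dataclass; without the plugin,
# PyCharm cannot infer its signature and may flag valid calls like this one:
params = ExampleDocParams(index_name="my-index", auto_refresh=False)
```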
85 changes: 67 additions & 18 deletions src/marqo/s2_inference/clip_utils.py
@@ -1,7 +1,6 @@
# from torch import FloatTensor
# from typing import Any, Dict, List, Optional, Union
import os
import PIL.Image
from marqo.tensor_search.enums import ModelProperties, InferenceParams
from marqo.tensor_search.models.private_models import ModelLocation, ModelAuth
import validators
import requests
import numpy as np
@@ -15,7 +14,7 @@
from marqo.s2_inference.logger import get_logger
from marqo.s2_inference.errors import IncompatibleModelDeviceError, InvalidModelPropertiesError
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from marqo.s2_inference.processing.custom_clip_utils import HFTokenizer, download_pretrained_from_url
from marqo.s2_inference.processing.custom_clip_utils import HFTokenizer, download_model
from torchvision.transforms import InterpolationMode
from marqo.s2_inference.configs import ModelCache

@@ -205,22 +204,63 @@ def __init__(self, model_type: str = "ViT-B/32", device: str = 'cpu', embedding
self.truncate = truncate
self.model_properties = kwargs.get("model_properties", dict())

# model_auth gets passed through add_docs and search requests:
model_auth = kwargs.get(InferenceParams.model_auth, None)
if model_auth is not None:
self.model_auth = model_auth
else:
self.model_auth = None

def _download_from_repo(self):
"""Downloads model from an external repo like s3 and returns the filepath

Returns:
The model's filepath

Raises:
RuntimeError: if an empty filepath is detected. This is important
because OpenCLIP will instantiate a model with random weights if
a filepath isn't specified and the model isn't a publicly
available HF or OpenAI one.
"""
model_location = ModelLocation(**self.model_properties[ModelProperties.model_location])
download_model_params = {"repo_location": model_location}

if model_location.auth_required:
download_model_params['auth'] = self.model_auth

model_file_path = download_model(**download_model_params)
if model_file_path is None or model_file_path == '':
raise RuntimeError(
'download_model() needs to return a valid filepath to the model! Instead, received '
f' filepath `{model_file_path}`')
return model_file_path

def load(self) -> None:

model_location_presence = ModelProperties.model_location in self.model_properties

path = self.model_properties.get("localpath", None) or self.model_properties.get("url",None)

if path is None:
if path is None and not model_location_presence:
# The original method to load the openai clip model
# https://github.com/openai/CLIP/issues/30
self.model, self.preprocess = clip.load(self.model_type, device='cpu', jit=False, download_root=ModelCache.clip_cache_path)
self.model = self.model.to(self.device)
self.tokenizer = clip.tokenize
else:
logger.info("Detecting custom clip model path. We use generic clip model loading.")
if os.path.isfile(path):
if path and model_location_presence:
raise InvalidModelPropertiesError(
"Only one of `url`, `localpath` or `model_location can be specified in "
"model_properties`. Please ensure that only one of these is specified in "
"model_properties and retry.")
if model_location_presence:
self.model_path = self._download_from_repo()
elif os.path.isfile(path):
self.model_path = path
elif validators.url(path):
self.model_path = download_pretrained_from_url(path)
self.model_path = download_model(url=path)
else:
raise InvalidModelPropertiesError(f"Marqo can not load the custom clip model."
f"The provided model path `{path}` is neither a local file nor a valid url."
@@ -356,23 +396,33 @@ def load(self) -> None:
# https://github.com/mlfoundations/open_clip
path = self.model_properties.get("localpath", None) or self.model_properties.get("url", None)

if path is None:
model_location_presence = ModelProperties.model_location in self.model_properties

if path is None and not model_location_presence:
self.model, _, self.preprocess = open_clip.create_model_and_transforms(self.model_name,
pretrained=self.pretrained,
device=self.device, jit=False, cache_dir=ModelCache.clip_cache_path)
self.tokenizer = open_clip.get_tokenizer(self.model_name)
self.model.eval()
else:
if path and model_location_presence:
raise InvalidModelPropertiesError(
"Only one of `url`, `localpath` or `model_location can be specified in "
"model_properties`. Please ensure that only one of these is specified in "
"model_properties and retry.")
logger.info("Detecting custom clip model path. We use generic clip model loading.")
if os.path.isfile(path):
if model_location_presence:
self.model_path = self._download_from_repo()
elif os.path.isfile(path):
self.model_path = path
elif validators.url(path):
self.model_path = download_pretrained_from_url(path)
self.model_path = download_model(url=path)
else:
raise InvalidModelPropertiesError(f"Marqo can not load the custom clip model."
f"The provided model path `{path}` is neither a local file nor a valid url."
f"Please check your provided model url and retry."
f"Check `https://docs.marqo.ai/0.0.13/Models-Reference/dense_retrieval/#generic-clip-models` for more info.")
raise InvalidModelPropertiesError(
f"Marqo cannot load the custom clip model. "
f"The provided model path `{path}` is neither a local file nor a valid url. "
f"Please check your provided model url and retry. "
f"Check `https://docs.marqo.ai/0.0.13/Models-Reference/dense_retrieval/#generic-clip-models` for more info.")

self.precision = self.model_properties.get("precision", "fp32")
self.jit = self.model_properties.get("jit", False)
@@ -384,14 +434,13 @@ def load(self) -> None:

self.model.eval()


def custom_clip_load(self):
self.model_name = self.model_properties.get("name", None)


logger.info(f"The name of the custom clip model is {self.model_name}. We use open_clip load")
model, _, preprocess = open_clip.create_model_and_transforms(model_name=self.model_name, jit = self.jit, pretrained=self.model_path, precision = self.precision,
image_mean=self.mean, image_std=self.std, device = self.device, cache_dir=ModelCache.clip_cache_path)
model, _, preprocess = open_clip.create_model_and_transforms(
model_name=self.model_name, jit = self.jit, pretrained=self.model_path, precision = self.precision,
image_mean=self.mean, image_std=self.std, device = self.device, cache_dir=ModelCache.clip_cache_path)

return model, preprocess

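As a rough illustration of the intended flow (not part of this diff), a hedged sketch of loading a custom CLIP checkpoint from a private s3 bucket via the new `model_location` / `model_auth` path. The `model_properties` keys, the `ModelLocation`/`ModelAuth` field names, and the `model_auth` kwarg name are assumptions inferred from the surrounding code; the bucket, key and credentials are placeholders:

```python
from marqo.s2_inference.clip_utils import CLIP
from marqo.tensor_search.models.private_models import ModelAuth, ModelLocation
from marqo.tensor_search.models.external_apis.s3 import S3Auth, S3Location

# Hypothetical model_properties for an index backed by a private s3 checkpoint.
# `model_location` is stored as a dict, as _download_from_repo() expects.
model_properties = {
    "name": "ViT-B/32",
    "dimensions": 512,
    "type": "clip",
    "model_location": ModelLocation(
        s3=S3Location(Bucket="my-model-bucket", Key="checkpoints/my-clip.pt"),
        auth_required=True,
    ).dict(),
}

# Credentials arrive per request (add_documents or search); Marqo does not store them.
model_auth = ModelAuth(s3=S3Auth(
    aws_access_key_id="AKIA...", aws_secret_access_key="..."))

model = CLIP(model_type="my-custom-clip", device="cpu",
             model_properties=model_properties, model_auth=model_auth)
# load() sees `model_location` in model_properties, so it calls
# _download_from_repo(), forwarding model_auth because auth_required is True.
model.load()
```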
48 changes: 48 additions & 0 deletions src/marqo/s2_inference/model_downloading/from_hf.py
@@ -0,0 +1,48 @@
from marqo.tensor_search.models.external_apis.hf import HfAuth, HfModelLocation
from typing import Optional
from huggingface_hub import hf_hub_download
from marqo.s2_inference.logger import get_logger
from huggingface_hub.utils._errors import RepositoryNotFoundError
from marqo.s2_inference.errors import ModelDownloadError

logger = get_logger(__name__)


def download_model_from_hf(
location: HfModelLocation,
auth: Optional[HfAuth] = None,
download_dir: Optional[str] = None):
"""Downloads a pretrained model from HF, if it doesn't exist locally. The basename of the
location's filename is used as the local filename.

hf_hub_download downloads the model if it does not yet exist in the cache.

Args:
location: repo_id and filename to be downloaded.
auth: contains HF API token for model access
download_dir: [not yet implemented]. The location where the model
should be stored

Returns:
Path to the downloaded model
"""
if download_dir is not None:
logger.warning(
"Hugging Face model download was given the `download_dir` argument, "
"even though it is not yet implemented. "
"The specified model will be downloaded but the `download_dir` "
"parameter will be ignored."
)
download_kwargs = location.dict()
if auth is not None:
download_kwargs = {**download_kwargs, **auth.dict()}
try:
return hf_hub_download(**download_kwargs)
except RepositoryNotFoundError:
# TODO: add link to HF model auth/loc
raise ModelDownloadError(
"Could not find the specified Hugging Face model repository. Please ensure that the request's model_auth's "
"`hf` credentials and the index's model_location are correct. "
"If the index's model_location is not correct, please create a new index with the corrected model_location"
)
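
For context (not part of this diff), a minimal usage sketch of `download_model_from_hf`; the repo id, filename and token are placeholders, and the `HfModelLocation`/`HfAuth` field names are inferred from the docstring and from how the kwargs are forwarded to `hf_hub_download`:

```python
from marqo.tensor_search.models.external_apis.hf import HfAuth, HfModelLocation
from marqo.s2_inference.model_downloading.from_hf import download_model_from_hf

# Hypothetical private repo and token, for illustration only.
location = HfModelLocation(repo_id="my-org/my-private-clip", filename="model.pt")
auth = HfAuth(token="hf_xxxxxxxxxxxxxxxx")

# hf_hub_download caches the file locally and returns its path;
# a repeat call reuses the cached copy instead of re-downloading.
model_path = download_model_from_hf(location=location, auth=auth)
print(model_path)
```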

74 changes: 74 additions & 0 deletions src/marqo/s2_inference/model_downloading/from_s3.py
@@ -0,0 +1,74 @@
import os
from marqo.s2_inference.configs import ModelCache
from marqo.tensor_search.models.external_apis.s3 import S3Auth, S3Location
from typing import Optional
import boto3
from marqo.s2_inference.errors import ModelDownloadError
from botocore.exceptions import NoCredentialsError


def get_presigned_s3_url(location: S3Location, auth: Optional[S3Auth] = None):
"""Returns the s3 url of a request to get an S3 object

Args:
location: Bucket and key of model file to be downloaded
auth: AWS IAM access keys to a user with access to the model to be downloaded

Returns:
The presigned s3 URL

TODO: add link to proper usage in error messages
"""
if auth is None:
raise ModelDownloadError(
"Error retrieving private model. s3 authorisation information is required to "
"download a model from an s3 bucket. "
"If the model is publicly accessible, please use the model's publicly accessible URL."
)
s3_client = boto3.client('s3', **auth.dict())
try:
return s3_client.generate_presigned_url('get_object', Params=location.dict())
except NoCredentialsError:
raise ModelDownloadError(
"Error retrieving private model. AWS credentials were not accepted."
)


def get_s3_model_absolute_cache_path(location: S3Location) -> str:
"""Returns the absolute path of an s3 model if it were downloaded.

Args:
location: Bucket and key of model file to be downloaded

Returns:
The absolute path of an s3 model if it were downloaded.
"""
cache_dir = os.path.expanduser(ModelCache.clip_cache_path)
return os.path.join(cache_dir, get_s3_model_cache_filename(location))


def check_s3_model_already_exists(location: S3Location) -> bool:
"""Returns True iff an s3 model is already downloaded

Args:
location: Bucket and key of model file to be downloaded

Returns:
True if the model file already exists in the local cache, otherwise False
"""
abs_path = get_s3_model_absolute_cache_path(location)
return os.path.isfile(abs_path)


def get_s3_model_cache_filename(location: S3Location) -> str:
"""Returns the model cache filename of an s3 object

Args:
location: Bucket and key of model file to be downloaded

Returns:
The model cache filename of an s3 object
"""
return os.path.basename(location.Key)
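
A minimal usage sketch of the s3 helpers above (not part of this diff); the bucket, key and IAM keys are placeholders, and the `S3Auth`/`S3Location` field names are inferred from how they are unpacked into boto3:

```python
from marqo.tensor_search.models.external_apis.s3 import S3Auth, S3Location
from marqo.s2_inference.model_downloading.from_s3 import (
    check_s3_model_already_exists, get_presigned_s3_url)

# Hypothetical bucket, key and IAM user keys, for illustration only.
location = S3Location(Bucket="my-model-bucket", Key="checkpoints/my-clip.pt")
auth = S3Auth(aws_access_key_id="AKIA...", aws_secret_access_key="...")

if not check_s3_model_already_exists(location):
    # Presign a GET for the object; the URL can then be fetched like any
    # other pretrained-model URL and cached under the model cache path.
    url = get_presigned_s3_url(location, auth)
    print(url)
```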

