Open clip model support #116

Merged · 8 commits · Oct 10, 2022
1 change: 1 addition & 0 deletions requirements.txt
@@ -14,6 +14,7 @@ pillow
numpy
validators
sentence-transformers
open_clip_torch
clip-marqo
protobuf==3.20.1
onnx
1 change: 1 addition & 0 deletions setup.py
@@ -17,6 +17,7 @@
"more_itertools",
"nltk",
"torch",
"open_clip_torch",
"pillow",
"numpy",
"validators",
34 changes: 34 additions & 0 deletions src/marqo/s2_inference/clip_utils.py
@@ -7,6 +7,7 @@
import clip
import torch
from PIL import Image
import open_clip

from marqo.s2_inference.types import *
from marqo.s2_inference.logger import get_logger
@@ -231,4 +232,37 @@ def encode(self, inputs: Union[str, ImageType, List[Union[str, ImageType]]],
        else:
            logger.debug('text')
            return self.encode_text(inputs, normalize=normalize)


class OPEN_CLIP(CLIP):
    def __init__(self, model_type: str = "open_clip/ViT-B-32-quickgelu/laion400m_e32", device: str = 'cpu',
                 embedding_dim: int = None, truncate: bool = True, **kwargs) -> None:
        super().__init__(model_type, device, embedding_dim, truncate, **kwargs)
        # model_type follows the pattern "open_clip/<model name>/<pretrained tag>"
        self.model_name = model_type.split("/", 3)[1]
        self.pretrained = model_type.split("/", 3)[2]

    def load(self) -> None:
        # https://github.com/mlfoundations/open_clip
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            self.model_name, pretrained=self.pretrained, device=self.device, jit=False)
        self.tokenizer = open_clip.tokenize
        self.model.eval()

    def encode_text(self, sentence: Union[str, List[str]], normalize=True) -> FloatTensor:

        if self.model is None:
            self.load()

        text = self.tokenizer(sentence).to(self.device)

        with torch.no_grad():
            outputs = self.model.encode_text(text)

        if normalize:
            _shape_before = outputs.shape
            outputs /= self.normalize(outputs)
            assert outputs.shape == _shape_before

        return self._convert_output(outputs)

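For reference, a minimal usage sketch of the new class (not part of the diff; the call pattern mirrors the existing CLIP class above, and the snippet is illustrative only):

from marqo.s2_inference.clip_utils import OPEN_CLIP

# "open_clip/<model name>/<pretrained tag>" is split into model_name and pretrained
model = OPEN_CLIP(model_type="open_clip/ViT-B-32-quickgelu/laion400m_e32", device="cpu")
model.load()                             # creates the open_clip model, transforms and tokenizer
vectors = model.encode("a photo of a cat")  # string input dispatches to encode_text
# vectors holds the normalized text embedding (512-d for ViT-B-32, per the registry below)
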
227 changes: 225 additions & 2 deletions src/marqo/s2_inference/model_registry.py
@@ -2,7 +2,7 @@
from marqo.s2_inference.sbert_onnx_utils import SBERT_ONNX
from marqo.s2_inference.sbert_utils import SBERT, TEST
from marqo.s2_inference.random_utils import Random
from marqo.s2_inference.clip_utils import CLIP
from marqo.s2_inference.clip_utils import CLIP, OPEN_CLIP
from marqo.s2_inference.types import Any, Dict, List, Optional, Union, FloatTensor

# we need to keep track of the embed dim and model load functions/classes
@@ -66,6 +66,226 @@ def _get_clip_properties() -> Dict:

}
return CLIP_MODEL_PROPERTIES


def _get_open_clip_properties() -> Dict:

# Use this link to find all the model_configs:
# https://github.com/mlfoundations/open_clip/tree/main/src/open_clip/model_configs

OPEN_CLIP_MODEL_PROPERTIES = {
'open_clip/RN50/openai':
{'name': 'open_clip/RN50/openai',
'dimensions': 1024,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'openai'},

'open_clip/RN50/yfcc15m':
{'name': 'open_clip/RN50/yfcc15m',
'dimensions': 1024, 'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'yfcc15m'},

'open_clip/RN50/cc12m':
{'name': 'open_clip/RN50/cc12m',
'dimensions': 1024,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'cc12m'},

'open_clip/RN50-quickgelu/openai':
{'name': 'open_clip/RN50-quickgelu/openai',
'dimensions': 1024,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'openai'},

'open_clip/RN50-quickgelu/yfcc15m':
{'name': 'open_clip/RN50-quickgelu/yfcc15m',
'dimensions': 1024,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'yfcc15m'},

'open_clip/RN50-quickgelu/cc12m':
{'name': 'open_clip/RN50-quickgelu/cc12m',
'dimensions': 1024,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'cc12m'},

'open_clip/RN101/openai':
{'name': 'open_clip/RN101/openai',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'openai'},

'open_clip/RN101/yfcc15m':
{'name': 'open_clip/RN101/yfcc15m',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'yfcc15m'},

'open_clip/RN101-quickgelu/openai':
{'name': 'open_clip/RN101-quickgelu/openai',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'openai'},

'open_clip/RN101-quickgelu/yfcc15m':
{'name': 'open_clip/RN101-quickgelu/yfcc15m',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'yfcc15m'},

'open_clip/RN50x4/openai':
{'name': 'open_clip/RN50x4/openai',
'dimensions': 640,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'openai'},

'open_clip/RN50x16/openai':
{'name': 'open_clip/RN50x16/openai',
'dimensions': 768,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'openai'},

'open_clip/RN50x64/openai':
{'name': 'open_clip/RN50x64/openai',
'dimensions': 1024,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'openai'},

'open_clip/ViT-B-32/openai':
{'name': 'open_clip/ViT-B-32/openai',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'openai'},

'open_clip/ViT-B-32/laion400m_e31':
{'name': 'open_clip/ViT-B-32/laion400m_e31',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'laion400m_e31'},

'open_clip/ViT-B-32/laion400m_e32':
{'name': 'open_clip/ViT-B-32/laion400m_e32',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'laion400m_e32'},

'open_clip/ViT-B-32/laion2b_e16':
{'name': 'open_clip/ViT-B-32/laion2b_e16',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'laion2b_e16'},

'open_clip/ViT-B-32/laion2b_s34b_b79k':
{'name': 'open_clip/ViT-B-32/laion2b_s34b_b79k',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'laion2b_s34b_b79k'},

'open_clip/ViT-B-32-quickgelu/openai':
{'name': 'open_clip/ViT-B-32-quickgelu/openai',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'openai'},

'open_clip/ViT-B-32-quickgelu/laion400m_e31':
{'name': 'open_clip/ViT-B-32-quickgelu/laion400m_e31',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'laion400m_e31'},

'open_clip/ViT-B-32-quickgelu/laion400m_e32':
{'name': 'open_clip/ViT-B-32-quickgelu/laion400m_e32',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'laion400m_e32'},

'open_clip/ViT-B-16/openai':
{'name': 'open_clip/ViT-B-16/openai',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'openai'},

'open_clip/ViT-B-16/laion400m_e31':
{'name': 'open_clip/ViT-B-16/laion400m_e31',
'dimensions': 512,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'laion400m_e31'},

'open_clip/ViT-B-16/laion400m_e32':
{'name': 'open_clip/ViT-B-16/laion400m_e32',
'dimensions': 512, 'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'laion400m_e32'},

'open_clip/ViT-B-16-plus-240/laion400m_e31':
{'name': 'open_clip/ViT-B-16-plus-240/laion400m_e31',
'dimensions': 640,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'laion400m_e31'},

'open_clip/ViT-B-16-plus-240/laion400m_e32':
{'name': 'open_clip/ViT-B-16-plus-240/laion400m_e32',
'dimensions': 640,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'laion400m_e32'},

'open_clip/ViT-L-14/openai':
{'name': 'open_clip/ViT-L-14/openai',
'dimensions': 768,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'openai'},

'open_clip/ViT-L-14/laion400m_e31':
{'name': 'open_clip/ViT-L-14/laion400m_e31',
'dimensions': 768,
'note': 'clip model from open_clip implementation',
'type': 'open_clip', 'pretrained': 'laion400m_e31'},

'open_clip/ViT-L-14/laion400m_e32':
{'name': 'open_clip/ViT-L-14/laion400m_e32',
'dimensions': 768,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'laion400m_e32'},

'open_clip/ViT-L-14/laion2b_s32b_b82k':
{'name': 'open_clip/ViT-L-14/laion2b_s32b_b82k',
'dimensions': 768,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'laion2b_s32b_b82k'},

'open_clip/ViT-L-14-336/openai':
{'name': 'open_clip/ViT-L-14-336/openai',
'dimensions': 768,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'openai'},

'open_clip/ViT-H-14/laion2b_s32b_b79k':
{'name': 'open_clip/ViT-H-14/laion2b_s32b_b79k',
'dimensions': 1024,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'laion2b_s32b_b79k'},

'open_clip/ViT-g-14/laion2b_s12b_b42k':
{'name': 'open_clip/ViT-g-14/laion2b_s12b_b42k',
'dimensions': 1024,
'note': 'clip model from open_clip implementation',
'type': 'open_clip',
'pretrained': 'laion2b_s12b_b42k'}}

return OPEN_CLIP_MODEL_PROPERTIES


def _get_sbert_properties() -> Dict:
@@ -322,7 +542,8 @@ def _get_random_properties() -> Dict:
return RANDOM_MODEL_PROPERTIES

def _get_model_load_mappings() -> Dict:
return {'clip':CLIP,
'open_clip': OPEN_CLIP,
'sbert':SBERT,
'test':TEST,
'sbert_onnx':SBERT_ONNX,
@@ -340,6 +561,7 @@ def load_model_properties() -> Dict:
test_model_properties = _get_sbert_test_properties()
random_model_properties = _get_random_properties()
hf_model_properties = _get_hf_properties()
open_clip_model_properties = _get_open_clip_properties()

# combine the above dicts
model_properties = dict(clip_model_properties.items())
@@ -348,6 +570,7 @@ def load_model_properties() -> Dict:
model_properties.update(sbert_onnx_model_properties)
model_properties.update(random_model_properties)
model_properties.update(hf_model_properties)
model_properties.update(open_clip_model_properties)

all_properties = dict()
all_properties['models'] = model_properties
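The registry change wires the new pieces together: a model name such as "open_clip/ViT-L-14/laion400m_e32" resolves to a properties dict (dimensions, type, pretrained tag), and the 'open_clip' type maps to the OPEN_CLIP class. A rough sketch of that lookup path (illustrative only; the actual resolution happens inside marqo's model-loading code, not in user code):

from marqo.s2_inference.model_registry import load_model_properties, _get_model_load_mappings

properties = load_model_properties()["models"]["open_clip/ViT-L-14/laion400m_e32"]
loader = _get_model_load_mappings()[properties["type"]]   # 'open_clip' -> OPEN_CLIP
model = loader(model_type=properties["name"], embedding_dim=properties["dimensions"])
model.load()
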
1 change: 1 addition & 0 deletions src/marqo/s2_inference/s2_inference.py
@@ -27,6 +27,7 @@ def vectorise(model_name: str, content: Union[str, List[str]], device: str = get
Raises:
VectoriseError: if the content can't be vectorised, for some reason.
"""

model_cache_key = _create_model_cache_key(model_name, device)

if model_cache_key not in available_models:
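With the registry entries in place, an open_clip model can be selected like any other registered model. A hedged example of calling vectorise with one of the new model names (illustrative; only the parameters visible in the signature above are used, and the exact return format is handled by marqo):

from marqo.s2_inference.s2_inference import vectorise

# model_name must match a key from _get_open_clip_properties()
embeddings = vectorise(model_name="open_clip/ViT-B-32/laion400m_e32",
                       content=["a photo of a dog", "a photo of a cat"],
                       device="cpu")
# one 512-dimensional vector per input string, per the registry dimensions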