From 8eb24ddc424f2f4b8185bb80c27741d12d84753f Mon Sep 17 00:00:00 2001
From: Romain Beaumont
Date: Sun, 21 Jan 2024 23:04:17 +0100
Subject: [PATCH] Move support of model types to all_clip module.

I created the all_clip module in order to have a single place to support
all kinds of CLIP models. It is already used in clip-retrieval and I
propose to use it here too.
---
 README.md                              | 12 +-----
 clip_benchmark/models/__init__.py      | 20 +++++-----
 clip_benchmark/models/japanese_clip.py | 54 --------------------------
 clip_benchmark/models/open_clip.py     |  8 ----
 requirements.txt                       |  1 +
 tests/test_clip_benchmark.py           | 20 +++++++++-
 6 files changed, 32 insertions(+), 83 deletions(-)
 delete mode 100644 clip_benchmark/models/japanese_clip.py
 delete mode 100644 clip_benchmark/models/open_clip.py

diff --git a/README.md b/README.md
index 4175571..ec13c6c 100644
--- a/README.md
+++ b/README.md
@@ -80,16 +80,8 @@ Here is an example of use
 
 ### How to add other CLIP models
 
-Please follow these steps:
-1. Add a identity file to load model in `clip_benchmark/models`
-2. Define a loading function, that returns a tuple (model, transform, tokenizer). Please see `clip_benchmark/models/open_clip.py` as an example.
-3. Add the function into `TYPE2FUNC` in `clip_benchmark/models/__init__.py`
-
-Remarks:
-- The new tokenizer/model must enable to do the following things as https://github.com/openai/CLIP#usage
-  - `tokenizer(texts).to(device)` ... `texts` is a list of string
-  - `model.encode_text(tokenized_texts)` ... `tokenized_texts` is a output from `tokenizer(texts).to(device)`
-  - `model.encode_image(images)` ... `images` is a image tensor by the `transform`
+Please add your model to [all-clip](https://github.com/data2ml/all-clip) and it will be supported in CLIP-benchmark (and in clip-retrieval).
+See [How to add a model type](https://github.com/data2ml/all-clip?tab=readme-ov-file#how-to-add-a-model-type)
 
 ### CIFAR-10 example
 
diff --git a/clip_benchmark/models/__init__.py b/clip_benchmark/models/__init__.py
index 96a6129..ef4c6f0 100644
--- a/clip_benchmark/models/__init__.py
+++ b/clip_benchmark/models/__init__.py
@@ -1,14 +1,9 @@
 from typing import Union
 import torch
-from .open_clip import load_open_clip
-from .japanese_clip import load_japanese_clip
+import all_clip
 
-# loading function must return (model, transform, tokenizer)
-TYPE2FUNC = {
-    "open_clip": load_open_clip,
-    "ja_clip": load_japanese_clip
-}
-MODEL_TYPES = list(TYPE2FUNC.keys())
+ # see https://github.com/rom1504/all-clip?tab=readme-ov-file#supported-models
+MODEL_TYPES = ["openai_clip", "open_clip", "ja_clip", "hf_clip", "nm"]
 
 
 def load_clip(
@@ -19,5 +14,10 @@ def load_clip(
         device: Union[str, torch.device] = "cuda"
 ):
     assert model_type in MODEL_TYPES, f"model_type={model_type} is invalid!"
-    load_func = TYPE2FUNC[model_type]
-    return load_func(model_name=model_name, pretrained=pretrained, cache_dir=cache_dir, device=device)
+    return all_clip.load_clip(
+        clip_model=model_type+":"+model_name+"/"+pretrained,
+        use_jit=True,
+        warmup_batch_size=1,
+        clip_cache_path=cache_dir,
+        device=device,
+    )
diff --git a/clip_benchmark/models/japanese_clip.py b/clip_benchmark/models/japanese_clip.py
deleted file mode 100644
index 6dee3a1..0000000
--- a/clip_benchmark/models/japanese_clip.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from typing import Dict
-import torch
-
-
-class DictTensor:
-    """
-    enable to do `tokenizer(texts).to(device)`
-    """
-    def __init__(self, d: Dict[str, torch.Tensor]):
-        self.d = d
-
-    def to(self, device):
-        return {k: v.to(device) for k, v in self.d.items()}
-
-
-class JaCLIPForBenchmark:
-    """
-    enable to do model.encode_text(dict_tensor)
-    """
-    def __init__(self, model):
-        self.model = model
-
-    def encode_text(self, dict_tensor):
-        return self.model.get_text_features(**dict_tensor)
-
-    def encode_image(self, image):
-        return self.model.get_image_features(image)
-
-
-def load_japanese_clip(pretrained: str, device="cpu", **kwargs):
-    """
-    Load Japanese CLIP/CLOOB by rinna (https://github.com/rinnakk/japanese-clip)
-    Remarks:
-    - You must input not only input_ids but also attention_masks and position_ids when doing `model.encode_text()` to make it work correctly.
-    """
-    try:
-        import japanese_clip as ja_clip
-    except ImportError:
-        raise ImportError("Install `japanese_clip` by `pip install git+https://github.com/rinnakk/japanese-clip.git`")
-    cache_dir = kwargs.pop("cache_dir", None)
-    model, transform = ja_clip.load(pretrained, device=device, cache_dir=cache_dir)
-
-    class JaTokenizerForBenchmark:
-        def __init__(self, ):
-            self.tokenizer = ja_clip.load_tokenizer()
-
-        def __call__(self, texts) -> Dict[str, torch.Tensor]:
-            inputs = ja_clip.tokenize(texts, tokenizer=self.tokenizer, device="cpu")
-            return DictTensor(inputs)
-
-        def __len__(self):
-            return len(self.tokenizer)
-
-    return JaCLIPForBenchmark(model), transform, JaTokenizerForBenchmark()
diff --git a/clip_benchmark/models/open_clip.py b/clip_benchmark/models/open_clip.py
deleted file mode 100644
index 610c603..0000000
--- a/clip_benchmark/models/open_clip.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import open_clip
-
-
-def load_open_clip(model_name: str = "ViT-B-32-quickgelu", pretrained: str = "laion400m_e32", cache_dir: str = None, device="cpu"):
-    model, _, transform = open_clip.create_model_and_transforms(model_name, pretrained=pretrained, cache_dir=cache_dir)
-    model = model.to(device)
-    tokenizer = open_clip.get_tokenizer(model_name)
-    return model, transform, tokenizer
diff --git a/requirements.txt b/requirements.txt
index 78ba769..6036e16 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ open_clip_torch>=0.2.1
 pycocoevalcap
 webdataset>=0.2.31
 transformers
+all_clip>=1.0.0,<2
\ No newline at end of file
diff --git a/tests/test_clip_benchmark.py b/tests/test_clip_benchmark.py
index 6914931..3ac7c8c 100644
--- a/tests/test_clip_benchmark.py
+++ b/tests/test_clip_benchmark.py
@@ -6,6 +6,7 @@
 from clip_benchmark.cli import run
 import logging
 import torch
+import pytest
 
 class base_args:
     dataset="dummy"
@@ -109,10 +110,27 @@ class linear_probe_args:
     custom_classname_file=None
     distributed=False
 
-def test_base():
+
+def test_linear_probe():
     if torch.cuda.is_available():
         run(linear_probe_args)
     else:
         logging.warning("GPU acceleration is required for linear evaluation to ensure optimal performance and efficiency.")
+
+
+@pytest.mark.parametrize(
+    "full_model_name",
+    [
+        "openai_clip:ViT-B/32",
+        "open_clip:ViT-B-32/laion2b_s34b_b79k",
+        "hf_clip:patrickjohncyh/fashion-clip",
+    ],
+)
+def test_base(full_model_name):
+    model_type, model_name = full_model_name.split(":")
+    model, pretrained = model_name.split("/")
+    base_args.model_type = model_type
+    base_args.model = model
+    base_args.pretrained = pretrained
     os.environ["CUDA_VISIBLE_DEVICES"] = ""
     run(base_args)
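
Note (not part of the patch): with this change, clip_benchmark.models.load_clip keeps its
previous (model, transform, tokenizer) return contract but delegates to all_clip, building
the spec string model_type + ":" + model_name + "/" + pretrained. Below is a minimal usage
sketch, not a definitive example: it assumes the packages from requirements.txt are
installed and reuses the open_clip model/pretrained pair from the new parametrized test.

    import torch

    from clip_benchmark.models import load_clip

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Internally this builds the all_clip spec "open_clip:ViT-B-32/laion2b_s34b_b79k"
    # and returns the same (model, transform, tokenizer) triple as the old TYPE2FUNC loaders.
    model, transform, tokenizer = load_clip(
        model_type="open_clip",
        model_name="ViT-B-32",
        pretrained="laion2b_s34b_b79k",
        cache_dir=None,
        device=device,
    )

    # Same downstream contract as described in the README section removed above.
    with torch.no_grad():
        text_features = model.encode_text(tokenizer(["a photo of a cat"]).to(device))

The benchmark code itself is unchanged by the patch; only the place where the
(model, transform, tokenizer) triple comes from moves into the all_clip package.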