From 8eb24ddc424f2f4b8185bb80c27741d12d84753f Mon Sep 17 00:00:00 2001
From: Romain Beaumont
Date: Sun, 21 Jan 2024 23:04:17 +0100
Subject: [PATCH] Move support of model types to all_clip module.

I created the all_clip module in order to have a single place to support
all kinds of CLIP models. It is already used in clip-retrieval and I
propose to use it here too.
---
 README.md                              | 12 +-----
 clip_benchmark/models/__init__.py      | 20 +++++-----
 clip_benchmark/models/japanese_clip.py | 54 --------------------------
 clip_benchmark/models/open_clip.py     |  8 ----
 requirements.txt                       |  1 +
 tests/test_clip_benchmark.py           | 20 +++++++++-
 6 files changed, 32 insertions(+), 83 deletions(-)
 delete mode 100644 clip_benchmark/models/japanese_clip.py
 delete mode 100644 clip_benchmark/models/open_clip.py

diff --git a/README.md b/README.md
index 4175571..ec13c6c 100644
--- a/README.md
+++ b/README.md
@@ -80,16 +80,8 @@ Here is an example of use
 
 ### How to add other CLIP models
 
-Please follow these steps:
-1. Add a identity file to load model in `clip_benchmark/models`
-2. Define a loading function, that returns a tuple (model, transform, tokenizer). Please see `clip_benchmark/models/open_clip.py` as an example.
-3. Add the function into `TYPE2FUNC` in `clip_benchmark/models/__init__.py`
-
-Remarks:
-- The new tokenizer/model must enable to do the following things as https://github.com/openai/CLIP#usage
-  - `tokenizer(texts).to(device)` ... `texts` is a list of string
-  - `model.encode_text(tokenized_texts)` ... `tokenized_texts` is a output from `tokenizer(texts).to(device)`
-  - `model.encode_image(images)` ... `images` is a image tensor by the `transform`
+Please add your model to [all-clip](https://github.com/data2ml/all-clip) and it will be supported in CLIP-benchmark (and in clip-retrieval).
+See [How to add a model type](https://github.com/data2ml/all-clip?tab=readme-ov-file#how-to-add-a-model-type)
 
 ### CIFAR-10 example
 
diff --git a/clip_benchmark/models/__init__.py b/clip_benchmark/models/__init__.py
index 96a6129..ef4c6f0 100644
--- a/clip_benchmark/models/__init__.py
+++ b/clip_benchmark/models/__init__.py
@@ -1,14 +1,9 @@
 from typing import Union
 import torch
-from .open_clip import load_open_clip
-from .japanese_clip import load_japanese_clip
+import all_clip
 
-# loading function must return (model, transform, tokenizer)
-TYPE2FUNC = {
-    "open_clip": load_open_clip,
-    "ja_clip": load_japanese_clip
-}
-MODEL_TYPES = list(TYPE2FUNC.keys())
+ # see https://github.com/rom1504/all-clip?tab=readme-ov-file#supported-models
+MODEL_TYPES = ["openai_clip", "open_clip", "ja_clip", "hf_clip", "nm"]
 
 
 def load_clip(
@@ -19,5 +14,10 @@ def load_clip(
         device: Union[str, torch.device] = "cuda"
 ):
     assert model_type in MODEL_TYPES, f"model_type={model_type} is invalid!"
-    load_func = TYPE2FUNC[model_type]
-    return load_func(model_name=model_name, pretrained=pretrained, cache_dir=cache_dir, device=device)
+    return all_clip.load_clip(
+        clip_model=model_type+":"+model_name+"/"+pretrained,
+        use_jit=True,
+        warmup_batch_size=1,
+        clip_cache_path=cache_dir,
+        device=device,
+    )
diff --git a/clip_benchmark/models/japanese_clip.py b/clip_benchmark/models/japanese_clip.py
deleted file mode 100644
index 6dee3a1..0000000
--- a/clip_benchmark/models/japanese_clip.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from typing import Dict
-import torch
-
-
-class DictTensor:
-    """
-    enable to do `tokenizer(texts).to(device)`
-    """
-    def __init__(self, d: Dict[str, torch.Tensor]):
-        self.d = d
-
-    def to(self, device):
-        return {k: v.to(device) for k, v in self.d.items()}
-
-
-class JaCLIPForBenchmark:
-    """
-    enable to do model.encode_text(dict_tensor)
-    """
-    def __init__(self, model):
-        self.model = model
-
-    def encode_text(self, dict_tensor):
-        return self.model.get_text_features(**dict_tensor)
-
-    def encode_image(self, image):
-        return self.model.get_image_features(image)
-
-
-def load_japanese_clip(pretrained: str, device="cpu", **kwargs):
-    """
-    Load Japanese CLIP/CLOOB by rinna (https://github.com/rinnakk/japanese-clip)
-    Remarks:
-    - You must input not only input_ids but also attention_masks and position_ids when doing `model.encode_text()` to make it work correctly.
-    """
-    try:
-        import japanese_clip as ja_clip
-    except ImportError:
-        raise ImportError("Install `japanese_clip` by `pip install git+https://github.com/rinnakk/japanese-clip.git`")
-    cache_dir = kwargs.pop("cache_dir", None)
-    model, transform = ja_clip.load(pretrained, device=device, cache_dir=cache_dir)
-
-    class JaTokenizerForBenchmark:
-        def __init__(self, ):
-            self.tokenizer = ja_clip.load_tokenizer()
-
-        def __call__(self, texts) -> Dict[str, torch.Tensor]:
-            inputs = ja_clip.tokenize(texts, tokenizer=self.tokenizer, device="cpu")
-            return DictTensor(inputs)
-
-        def __len__(self):
-            return len(self.tokenizer)
-
-    return JaCLIPForBenchmark(model), transform, JaTokenizerForBenchmark()
diff --git a/clip_benchmark/models/open_clip.py b/clip_benchmark/models/open_clip.py
deleted file mode 100644
index 610c603..0000000
--- a/clip_benchmark/models/open_clip.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import open_clip
-
-
-def load_open_clip(model_name: str = "ViT-B-32-quickgelu", pretrained: str = "laion400m_e32", cache_dir: str = None, device="cpu"):
-    model, _, transform = open_clip.create_model_and_transforms(model_name, pretrained=pretrained, cache_dir=cache_dir)
-    model = model.to(device)
-    tokenizer = open_clip.get_tokenizer(model_name)
-    return model, transform, tokenizer
diff --git a/requirements.txt b/requirements.txt
index 78ba769..6036e16 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ open_clip_torch>=0.2.1
 pycocoevalcap
 webdataset>=0.2.31
 transformers
+all_clip>=1.0.0,<2
\ No newline at end of file
diff --git a/tests/test_clip_benchmark.py b/tests/test_clip_benchmark.py
index 6914931..3ac7c8c 100644
--- a/tests/test_clip_benchmark.py
+++ b/tests/test_clip_benchmark.py
@@ -6,6 +6,7 @@
 from clip_benchmark.cli import run
 import logging
 import torch
+import pytest
 
 class base_args:
     dataset="dummy"
@@ -109,10 +110,27 @@ class linear_probe_args:
     custom_classname_file=None
     distributed=False
 
-def test_base():
+
+def test_linear_probe():
     if torch.cuda.is_available():
         run(linear_probe_args)
     else:
         logging.warning("GPU acceleration is required for linear evaluation to ensure optimal performance and efficiency.")
+
+
+@pytest.mark.parametrize(
+    "full_model_name",
+    [
+        "openai_clip:ViT-B/32",
+        "open_clip:ViT-B-32/laion2b_s34b_b79k",
+        "hf_clip:patrickjohncyh/fashion-clip",
+    ],
+)
+def test_base(full_model_name):
+    model_type, model_name = full_model_name.split(":")
+    model, pretrained = model_name.split("/")
+    base_args.model_type = model_type
+    base_args.model = model
+    base_args.pretrained = pretrained
     os.environ["CUDA_VISIBLE_DEVICES"] = ""
     run(base_args)
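
Note (not part of the patch): with this change, clip_benchmark.models.load_clip keeps its
previous (model, transform, tokenizer) return contract but delegates to all_clip, building
the spec string model_type + ":" + model_name + "/" + pretrained. Below is a minimal usage
sketch, not a definitive example: it assumes the packages from requirements.txt are
installed and reuses the open_clip model/pretrained pair from the new parametrized test.

    import torch

    from clip_benchmark.models import load_clip

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Internally this builds the all_clip spec "open_clip:ViT-B-32/laion2b_s34b_b79k"
    # and returns the same (model, transform, tokenizer) triple as the old TYPE2FUNC loaders.
    model, transform, tokenizer = load_clip(
        model_type="open_clip",
        model_name="ViT-B-32",
        pretrained="laion2b_s34b_b79k",
        cache_dir=None,
        device=device,
    )

    # Same downstream contract as described in the README section removed above.
    with torch.no_grad():
        text_features = model.encode_text(tokenizer(["a photo of a cat"]).to(device))

The benchmark code itself is unchanged by the patch; only the place where the
(model, transform, tokenizer) triple comes from moves into the all_clip package.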