From 6288c96443b1ae430fa61e9861ade8f1714e192f Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 13 Apr 2024 16:42:47 -0400 Subject: [PATCH 01/13] fix class names ASAP!!! --- InstructorEmbedding/instructor.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index 9d4f90a..b0a7674 100644 --- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -23,7 +23,7 @@ def batch_to_device(batch, target_device: str): return batch -class InstructorPooling(nn.Module): +class INSTRUCTORPooling(nn.Module): """Performs pooling (max or mean) on the token embeddings. Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. @@ -245,7 +245,7 @@ def load(input_path): ) as config_file: config = json.load(config_file) - return InstructorPooling(**config) + return INSTRUCTORPooling(**config) def import_from_string(dotted_path): @@ -271,7 +271,7 @@ def import_from_string(dotted_path): raise ImportError(msg) -class InstructorTransformer(Transformer): +class INSTRUCTORTransformer(Transformer): def __init__( self, model_name_or_path: str, @@ -378,7 +378,7 @@ def load(input_path: str): with open(sbert_config_path, encoding="UTF-8") as config_file: config = json.load(config_file) - return InstructorTransformer(model_name_or_path=input_path, **config) + return INSTRUCTORTransformer(model_name_or_path=input_path, **config) def tokenize(self, texts): """ @@ -420,7 +420,7 @@ def tokenize(self, texts): input_features = self.tokenize(instruction_prepended_input_texts) instruction_features = self.tokenize(instructions) - input_features = Instructor.prepare_input_features( + input_features = INSTRUCTOR.prepare_input_features( input_features, instruction_features ) else: @@ -430,7 +430,7 @@ def tokenize(self, texts): return output -class Instructor(SentenceTransformer): +class INSTRUCTOR(SentenceTransformer): @staticmethod def prepare_input_features( input_features, instruction_features, return_data_type: str = "pt" @@ -510,7 +510,7 @@ def smart_batching_collate(self, batch): input_features = self.tokenize(instruction_prepended_input_texts) instruction_features = self.tokenize(instructions) - input_features = Instructor.prepare_input_features( + input_features = INSTRUCTOR.prepare_input_features( input_features, instruction_features ) batched_input_features.append(input_features) @@ -559,9 +559,9 @@ def _load_sbert_model(self, model_path, token = None, cache_folder = None, revis modules = OrderedDict() for module_config in modules_config: if module_config["idx"] == 0: - module_class = InstructorTransformer + module_class = INSTRUCTORTransformer elif module_config["idx"] == 1: - module_class = InstructorPooling + module_class = INSTRUCTORPooling else: module_class = import_from_string(module_config["type"]) module = module_class.load(os.path.join(model_path, module_config["path"])) @@ -686,4 +686,4 @@ def encode( if input_was_string: all_embeddings = all_embeddings[0] - return all_embeddings + return all_embeddings \ No newline at end of file From 30a598d3b4b52bcff8cbabf070c7ab1f276e8f71 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 13 Apr 2024 19:29:45 -0400 Subject: [PATCH 02/13] modify to fix not being able to use local model --- InstructorEmbedding/instructor.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index b0a7674..2d9fb4a 100644 
--- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -517,20 +517,24 @@ def smart_batching_collate(self, batch): return batched_input_features, labels - def _load_sbert_model(self, model_path, token = None, cache_folder = None, revision = None, trust_remote_code = False): + def _load_sbert_model(self, model_path, token=None, cache_folder=None, revision=None, trust_remote_code=False): """ Loads a full sentence-transformers model """ - # Taken mostly from: https://github.com/UKPLab/sentence-transformers/blob/66e0ee30843dd411c64f37f65447bb38c7bf857a/sentence_transformers/util.py#L544 - download_kwargs = { - "repo_id": model_path, - "revision": revision, - "library_name": "sentence-transformers", - "token": token, - "cache_dir": cache_folder, - "tqdm_class": disabled_tqdm, - } - model_path = snapshot_download(**download_kwargs) + if os.path.isdir(model_path): + # If model_path is a local directory, load the model directly + model_path = str(model_path) + else: + # If model_path is a Hugging Face repository ID, download the model + download_kwargs = { + "repo_id": model_path, + "revision": revision, + "library_name": "sentence-transformers", + "token": token, + "cache_dir": cache_folder, + "tqdm_class": disabled_tqdm, + } + model_path = snapshot_download(**download_kwargs) # Check if the config_sentence_transformers.json file exists (exists since v2 of the framework) config_sentence_transformers_json_path = os.path.join( From 188ee783ddc603acfb2ffd59806fbacbb18195d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Ra=C4=8Dinsk=C3=BD?= Date: Mon, 15 Apr 2024 10:14:46 +0200 Subject: [PATCH 03/13] refactor: correct library name and logic for local files only, updated dependency to correspond to newer feature, update target name --- InstructorEmbedding/instructor.py | 18 +++++++++++++----- requirements.txt | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index 2d9fb4a..eee1ac1 100644 --- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -521,20 +521,28 @@ def _load_sbert_model(self, model_path, token=None, cache_folder=None, revision= """ Loads a full sentence-transformers model """ + # copied from https://github.com/UKPLab/sentence-transformers/blob/66e0ee30843dd411c64f37f65447bb38c7bf857a/sentence_transformers/util.py#L559 + # because we need to get files outside of the allow_patterns too + # If file is local if os.path.isdir(model_path): - # If model_path is a local directory, load the model directly model_path = str(model_path) else: # If model_path is a Hugging Face repository ID, download the model download_kwargs = { "repo_id": model_path, "revision": revision, - "library_name": "sentence-transformers", + "library_name": "InstructorEmbedding", "token": token, "cache_dir": cache_folder, "tqdm_class": disabled_tqdm, } - model_path = snapshot_download(**download_kwargs) + # Try to download from the remote + try: + model_path = snapshot_download(**download_kwargs) + except Exception: + # Otherwise, try local (i.e. 
cache) only download_kwargs["local_files_only"] = True model_path = snapshot_download(**download_kwargs) # Check if the config_sentence_transformers.json file exists (exists since v2 of the framework) config_sentence_transformers_json_path = os.path.join( @@ -623,7 +631,7 @@ def encode( input_was_string = True if device is None: - device = self._target_device + device = self.device self.to(device) @@ -690,4 +698,4 @@ def encode( if input_was_string: all_embeddings = all_embeddings[0] - return all_embeddings \ No newline at end of file + return all_embeddings diff --git a/requirements.txt b/requirements.txt index a7fe466..410b6c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy requests>=2.26.0 scikit_learn>=1.0.2 scipy -sentence_transformers>=2.2.0 +sentence_transformers>=2.3.0 torch tqdm rich From 56c7449594ebd576d620bdadcf452bfd87e1cd08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Ra=C4=8Dinsk=C3=BD?= Date: Tue, 16 Apr 2024 23:06:53 +0200 Subject: [PATCH 04/13] refactor: returning the _ to the name --- InstructorEmbedding/instructor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index eee1ac1..0631902 100644 --- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -23,7 +23,7 @@ def batch_to_device(batch, target_device: str): return batch -class INSTRUCTORPooling(nn.Module): +class INSTRUCTOR_Pooling(nn.Module): """Performs pooling (max or mean) on the token embeddings. Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. @@ -245,7 +245,7 @@ def load(input_path): ) as config_file: config = json.load(config_file) - return INSTRUCTORPooling(**config) + return INSTRUCTOR_Pooling(**config) def import_from_string(dotted_path): @@ -573,7 +573,7 @@ def _load_sbert_model(self, model_path, token=None, cache_folder=None, revision= if module_config["idx"] == 0: module_class = INSTRUCTORTransformer elif module_config["idx"] == 1: - module_class = INSTRUCTORPooling + module_class = INSTRUCTOR_Pooling else: module_class = import_from_string(module_config["type"]) module = module_class.load(os.path.join(model_path, module_config["path"])) From 1c0286ed3099d9193dedd5198b5758391a436e3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Ra=C4=8Dinsk=C3=BD?= Date: Wed, 24 Apr 2024 12:00:55 +0200 Subject: [PATCH 05/13] chore: increased version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 311e732..6f17fb0 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='InstructorEmbedding', packages=['InstructorEmbedding'], - version='1.0.1', + version='1.0.2', license='Apache License 2.0', description='Text embedding tool', long_description=readme, From 0e14f1223a041e0a6e58b2669957393d2b6307ef Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 16:26:33 -0400 Subject: [PATCH 06/13] Update README.md --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index ddd0502..4288d09 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,14 @@ +## My Personal Fork + +This is a fork of the Instructor model because the original repository is no longer maintained. I've also made some improvements to their source code: + +1) Fixing it to work with versions of the ```sentence-transformers``` library after version 2.2.2 + > The original only supports version 2.2.2.
+2) Properly download the models from huggingface. +3) Ability to use the model downloaded to a folder on your computer - not ALWAYS having to download from Huggingface. + +## What follows is the original repository's readme file. Ignore the quantization section, however, because PyTorch has changed its API since then. + # One Embedder, Any Task: Instruction-Finetuned Text Embeddings This repository contains the code and pre-trained models for our paper [One Embedder, Any Task: Instruction-Finetuned Text Embeddings](https://arxiv.org/abs/2212.09741). Please refer to our [project page](https://instructor-embedding.github.io/) for a quick project overview. From f7d69145d2e96ab95ff4f0e9db9e0284b119ca90 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 16:29:34 -0400 Subject: [PATCH 07/13] Update README.md --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index ddd0502..4288d09 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,14 @@ +## My Personal Fork + +This is a fork of the Instructor model because the original repository is no longer maintained. I've also made some improvements to their source code: + +1) Fixing it to work with versions of the ```sentence-transformers``` library after version 2.2.2 + > The original only supports version 2.2.2. +2) Properly download the models from huggingface. +3) Ability to use the model downloaded to a folder on your computer - not ALWAYS having to download from Huggingface. + +## What follows is the original repository's readme file. Ignore the quantization section, however, because PyTorch has changed its API since then. + # One Embedder, Any Task: Instruction-Finetuned Text Embeddings This repository contains the code and pre-trained models for our paper [One Embedder, Any Task: Instruction-Finetuned Text Embeddings](https://arxiv.org/abs/2212.09741). Please refer to our [project page](https://instructor-embedding.github.io/) for a quick project overview. From 9b8bb5889c21b6a2fdfd4fbb6e3936a39e44e328 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 17:41:46 -0400 Subject: [PATCH 08/13] Update instructor.py Thanks to @racinmat for reminding me to change this.
--- InstructorEmbedding/instructor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index 2d9fb4a..f6beee0 100644 --- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -623,7 +623,7 @@ def encode( input_was_string = True if device is None: - device = self._target_device + device = self.device self.to(device) @@ -690,4 +690,4 @@ def encode( if input_was_string: all_embeddings = all_embeddings[0] - return all_embeddings \ No newline at end of file + return all_embeddings From a4f1dec3d30c7102b8d64cedf0d26352908d74e5 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 17:43:15 -0400 Subject: [PATCH 09/13] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a7fe466..8dbe7cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy requests>=2.26.0 scikit_learn>=1.0.2 scipy -sentence_transformers>=2.2.0 +sentence_transformers>=3.0.1 torch tqdm rich From 9c820eca44fac22c9a6470de88521761f7430048 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 18:03:16 -0400 Subject: [PATCH 10/13] Update requirements.txt --- requirements.txt | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8dbe7cc..2d3749d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,12 @@ -transformers==4.20.0 -datasets>=2.2.0 -pyarrow==8.0.0 -jsonlines -numpy -requests>=2.26.0 scikit_learn>=1.0.2 -scipy -sentence_transformers>=3.0.1 -torch -tqdm -rich -tensorboard -huggingface-hub>=0.19.0 +transformers>=4.20,<5.0 +datasets>=2.20,<3.0 +pyarrow>=17.0,<18.0 +numpy>=1.0,<=1.26.4 +requests>=2.26,<3.0 scikit_learn>=1.0.2 +scipy>=1.14,<2.0 +sentence-transformers>=3.0.1,<4.0 +torch>=2.0 +tqdm>=4.0,<5.0 +rich>=13.0,<14.0 +huggingface-hub==0.24.1 From 79de7a6b22b93f1998552bdadd7739e6300698a8 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 18:05:20 -0400 Subject: [PATCH 11/13] Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2d3749d..55652ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,10 +3,10 @@ datasets>=2.20,<3.0 pyarrow>=17.0,<18.0 numpy>=1.0,<=1.26.4 requests>=2.26,<3.0 -scikit_learn>=1.0.2 +scikit_learn>=1.0.2,<2.0 scipy>=1.14,<2.0 sentence-transformers>=3.0.1,<4.0 torch>=2.0 tqdm>=4.0,<5.0 rich>=13.0,<14.0 -huggingface-hub==0.24.1 +huggingface-hub>=0.24.1 From 882336b449c5df41cd84063b3daf5b563aa4d94f Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 18:09:58 -0400 Subject: [PATCH 12/13] Update README.md --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4288d09..8e3d3b3 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,9 @@ This is a fork of the Instructor model because the original repository is no longer maintained. I've also made some improvements to their source code: -1) Fixing it to work with versions of the ```sentence-transformers``` library after version 2.2.2 - > The original only supports version 2.2.2. -2) Properly download the models from huggingface. -3) Ability to use the model downloaded to a folder on your computer - not ALWAYS having to download from Huggingface. +1) Fixing it to work with the ```sentence-transformers``` library above 2.2.2.
+2) Properly download the models from huggingface using the new "snapshot download" API. +3) Ability to specify where you want the model downloaded with the "cache_dir" parameter. ## What follows is the original repository's readme file. Ignore the quantization section, however, because PyTorch has changed its API since then. # One Embedder, Any Task: Instruction-Finetuned Text Embeddings This repository contains the code and pre-trained models for our paper [One Embedder, Any Task: Instruction-Finetuned Text Embeddings](https://arxiv.org/abs/2212.09741). Please refer to our [project page](https://instructor-embedding.github.io/) for a quick project overview. From ae975c402e255e21e33339b242d3ebcd46cd5fa5 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 18:34:01 -0400 Subject: [PATCH 13/13] Update instructor.py --- InstructorEmbedding/instructor.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index 0631902..72b3df2 100644 --- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -23,7 +23,7 @@ def batch_to_device(batch, target_device: str): return batch -class INSTRUCTOR_Pooling(nn.Module): +class INSTRUCTORPooling(nn.Module): """Performs pooling (max or mean) on the token embeddings. Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. @@ -245,7 +245,7 @@ def load(input_path): ) as config_file: config = json.load(config_file) - return INSTRUCTOR_Pooling(**config) + return INSTRUCTORPooling(**config) def import_from_string(dotted_path): @@ -536,13 +536,6 @@ def _load_sbert_model(self, model_path, token=None, cache_folder=None, revision= "cache_dir": cache_folder, "tqdm_class": disabled_tqdm, } - # Try to download from the remote - try: - model_path = snapshot_download(**download_kwargs) - except Exception: - # Otherwise, try local (i.e. cache) only - download_kwargs["local_files_only"] = True - model_path = snapshot_download(**download_kwargs) # Check if the config_sentence_transformers.json file exists (exists since v2 of the framework) config_sentence_transformers_json_path = os.path.join( @@ -573,7 +566,7 @@ def _load_sbert_model(self, model_path, token=None, cache_folder=None, revision= if module_config["idx"] == 0: module_class = INSTRUCTORTransformer elif module_config["idx"] == 1: - module_class = INSTRUCTOR_Pooling + module_class = INSTRUCTORPooling else: module_class = import_from_string(module_config["type"]) module = module_class.load(os.path.join(model_path, module_config["path"]))
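Taken together, the series leaves INSTRUCTOR loadable either from a local folder or from the Hugging Face Hub. Below is a minimal usage sketch, not part of any patch above: the model ID, local path, and instruction text are illustrative, and `cache_folder` is the public `sentence-transformers` constructor argument behind the README's "cache_dir" note.

```python
# Minimal sketch of the two loading paths this series targets; the paths and
# model ID below are examples, not values taken from the patches.
import os

from InstructorEmbedding import INSTRUCTOR

local_dir = "./models/instructor-large"  # hypothetical local copy

if os.path.isdir(local_dir):
    # Local directory: _load_sbert_model sees os.path.isdir(model_path) is True
    # and loads the modules directly, without touching the Hub.
    model = INSTRUCTOR(local_dir)
else:
    # Hub ID: resolved through huggingface_hub's snapshot_download (patches
    # 02/03), with cache_folder controlling where the snapshot is stored.
    model = INSTRUCTOR("hkunlp/instructor-large", cache_folder="./models")

# INSTRUCTOR models embed [instruction, text] pairs rather than bare strings.
embeddings = model.encode(
    [["Represent the science title:", "One Embedder, Any Task"]]
)
print(embeddings.shape)  # e.g. (1, 768) for instructor-large
```

Because `encode` now resolves its device through the public `self.device` property (patches 03 and 08), the same call runs unchanged on CPU and CUDA builds of torch.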