From 6288c96443b1ae430fa61e9861ade8f1714e192f Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 13 Apr 2024 16:42:47 -0400 Subject: [PATCH 01/13] fix class names ASAP!!! --- InstructorEmbedding/instructor.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index 9d4f90a..b0a7674 100644 --- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -23,7 +23,7 @@ def batch_to_device(batch, target_device: str): return batch -class InstructorPooling(nn.Module): +class INSTRUCTORPooling(nn.Module): """Performs pooling (max or mean) on the token embeddings. Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. @@ -245,7 +245,7 @@ def load(input_path): ) as config_file: config = json.load(config_file) - return InstructorPooling(**config) + return INSTRUCTORPooling(**config) def import_from_string(dotted_path): @@ -271,7 +271,7 @@ def import_from_string(dotted_path): raise ImportError(msg) -class InstructorTransformer(Transformer): +class INSTRUCTORTransformer(Transformer): def __init__( self, model_name_or_path: str, @@ -378,7 +378,7 @@ def load(input_path: str): with open(sbert_config_path, encoding="UTF-8") as config_file: config = json.load(config_file) - return InstructorTransformer(model_name_or_path=input_path, **config) + return INSTRUCTORTransformer(model_name_or_path=input_path, **config) def tokenize(self, texts): """ @@ -420,7 +420,7 @@ def tokenize(self, texts): input_features = self.tokenize(instruction_prepended_input_texts) instruction_features = self.tokenize(instructions) - input_features = Instructor.prepare_input_features( + input_features = INSTRUCTOR.prepare_input_features( input_features, instruction_features ) else: @@ -430,7 +430,7 @@ def tokenize(self, texts): return output -class Instructor(SentenceTransformer): +class INSTRUCTOR(SentenceTransformer): @staticmethod def prepare_input_features( input_features, instruction_features, return_data_type: str = "pt" @@ -510,7 +510,7 @@ def smart_batching_collate(self, batch): input_features = self.tokenize(instruction_prepended_input_texts) instruction_features = self.tokenize(instructions) - input_features = Instructor.prepare_input_features( + input_features = INSTRUCTOR.prepare_input_features( input_features, instruction_features ) batched_input_features.append(input_features) @@ -559,9 +559,9 @@ def _load_sbert_model(self, model_path, token = None, cache_folder = None, revis modules = OrderedDict() for module_config in modules_config: if module_config["idx"] == 0: - module_class = InstructorTransformer + module_class = INSTRUCTORTransformer elif module_config["idx"] == 1: - module_class = InstructorPooling + module_class = INSTRUCTORPooling else: module_class = import_from_string(module_config["type"]) module = module_class.load(os.path.join(model_path, module_config["path"])) @@ -686,4 +686,4 @@ def encode( if input_was_string: all_embeddings = all_embeddings[0] - return all_embeddings + return all_embeddings \ No newline at end of file From 30a598d3b4b52bcff8cbabf070c7ab1f276e8f71 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 13 Apr 2024 19:29:45 -0400 Subject: [PATCH 02/13] modify to fix not being able to use local model --- InstructorEmbedding/instructor.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index b0a7674..2d9fb4a 100644 
--- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -517,20 +517,24 @@ def smart_batching_collate(self, batch): return batched_input_features, labels - def _load_sbert_model(self, model_path, token = None, cache_folder = None, revision = None, trust_remote_code = False): + def _load_sbert_model(self, model_path, token=None, cache_folder=None, revision=None, trust_remote_code=False): """ Loads a full sentence-transformers model """ - # Taken mostly from: https://github.com/UKPLab/sentence-transformers/blob/66e0ee30843dd411c64f37f65447bb38c7bf857a/sentence_transformers/util.py#L544 - download_kwargs = { - "repo_id": model_path, - "revision": revision, - "library_name": "sentence-transformers", - "token": token, - "cache_dir": cache_folder, - "tqdm_class": disabled_tqdm, - } - model_path = snapshot_download(**download_kwargs) + if os.path.isdir(model_path): + # If model_path is a local directory, load the model directly + model_path = str(model_path) + else: + # If model_path is a Hugging Face repository ID, download the model + download_kwargs = { + "repo_id": model_path, + "revision": revision, + "library_name": "sentence-transformers", + "token": token, + "cache_dir": cache_folder, + "tqdm_class": disabled_tqdm, + } + model_path = snapshot_download(**download_kwargs) # Check if the config_sentence_transformers.json file exists (exists since v2 of the framework) config_sentence_transformers_json_path = os.path.join( From 188ee783ddc603acfb2ffd59806fbacbb18195d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Ra=C4=8Dinsk=C3=BD?= Date: Mon, 15 Apr 2024 10:14:46 +0200 Subject: [PATCH 03/13] refactor: correct library name and logic for local files only, updated dependency to correspond to newer feature, update target name --- InstructorEmbedding/instructor.py | 18 +++++++++++++----- requirements.txt | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index 2d9fb4a..eee1ac1 100644 --- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -521,20 +521,28 @@ def _load_sbert_model(self, model_path, token=None, cache_folder=None, revision= """ Loads a full sentence-transformers model """ + # copied from https://github.com/UKPLab/sentence-transformers/blob/66e0ee30843dd411c64f37f65447bb38c7bf857a/sentence_transformers/util.py#L559 + # because we need to get files outside of the allow_patterns too + # If file is local if os.path.isdir(model_path): - # If model_path is a local directory, load the model directly model_path = str(model_path) else: # If model_path is a Hugging Face repository ID, download the model download_kwargs = { "repo_id": model_path, "revision": revision, - "library_name": "sentence-transformers", + "library_name": "InstructorEmbedding", "token": token, "cache_dir": cache_folder, "tqdm_class": disabled_tqdm, } - model_path = snapshot_download(**download_kwargs) + # Try to download from the remote + try: + model_path = snapshot_download(**download_kwargs) + except Exception: + # Otherwise, try local (i.e. 
cache) only download_kwargs["local_files_only"] = True model_path = snapshot_download(**download_kwargs) # Check if the config_sentence_transformers.json file exists (exists since v2 of the framework) config_sentence_transformers_json_path = os.path.join( @@ -623,7 +631,7 @@ def encode( input_was_string = True if device is None: - device = self._target_device + device = self.device self.to(device) @@ -690,4 +698,4 @@ def encode( if input_was_string: all_embeddings = all_embeddings[0] - return all_embeddings \ No newline at end of file + return all_embeddings diff --git a/requirements.txt b/requirements.txt index a7fe466..410b6c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy requests>=2.26.0 scikit_learn>=1.0.2 scipy -sentence_transformers>=2.2.0 +sentence_transformers>=2.3.0 torch tqdm rich From 56c7449594ebd576d620bdadcf452bfd87e1cd08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Ra=C4=8Dinsk=C3=BD?= Date: Tue, 16 Apr 2024 23:06:53 +0200 Subject: [PATCH 04/13] refactor: returning the _ to the name --- InstructorEmbedding/instructor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index eee1ac1..0631902 100644 --- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -23,7 +23,7 @@ def batch_to_device(batch, target_device: str): return batch -class INSTRUCTORPooling(nn.Module): +class INSTRUCTOR_Pooling(nn.Module): """Performs pooling (max or mean) on the token embeddings. Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. @@ -245,7 +245,7 @@ def load(input_path): ) as config_file: config = json.load(config_file) - return INSTRUCTORPooling(**config) + return INSTRUCTOR_Pooling(**config) def import_from_string(dotted_path): @@ -573,7 +573,7 @@ def _load_sbert_model(self, model_path, token=None, cache_folder=None, revision= if module_config["idx"] == 0: module_class = INSTRUCTORTransformer elif module_config["idx"] == 1: - module_class = INSTRUCTORPooling + module_class = INSTRUCTOR_Pooling else: module_class = import_from_string(module_config["type"]) module = module_class.load(os.path.join(model_path, module_config["path"])) From 1c0286ed3099d9193dedd5198b5758391a436e3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Ra=C4=8Dinsk=C3=BD?= Date: Wed, 24 Apr 2024 12:00:55 +0200 Subject: [PATCH 05/13] chore: increased version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 311e732..6f17fb0 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='InstructorEmbedding', packages=['InstructorEmbedding'], - version='1.0.1', + version='1.0.2', license='Apache License 2.0', description='Text embedding tool', long_description=readme, From 0e14f1223a041e0a6e58b2669957393d2b6307ef Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 16:26:33 -0400 Subject: [PATCH 06/13] Update README.md --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index ddd0502..4288d09 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,14 @@ +## My Personal Fork + +This is a fork of the Instructor model because the original repository is no longer maintained. I've also made some improvements to their source code: + +1) Fixing it to work with versions of the ```sentence-transformers``` library after version 2.2.2 + > The original only supports version 2.2.2.
+2) Properly download the models from huggingface. +3) Ability to use the model downloaded to a folder on your computer - not ALWAYS having to download from Huggingface. + +## What follows is the original repository's readme file. Ignore the quantization section, however, because PyTorch has changed its API since then. + # One Embedder, Any Task: Instruction-Finetuned Text Embeddings This repository contains the code and pre-trained models for our paper [One Embedder, Any Task: Instruction-Finetuned Text Embeddings](https://arxiv.org/abs/2212.09741). Please refer to our [project page](https://instructor-embedding.github.io/) for a quick project overview. From f7d69145d2e96ab95ff4f0e9db9e0284b119ca90 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 16:29:34 -0400 Subject: [PATCH 07/13] Update README.md --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index ddd0502..4288d09 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,14 @@ +## My Personal Fork + +This is a fork of the Instructor model because the original repository is no longer maintained. I've also made some improvements to their source code: + +1) Fixing it to work with versions of the ```sentence-transformers``` library after version 2.2.2 + > The original only supports version 2.2.2. +2) Properly download the models from huggingface. +3) Ability to use the model downloaded to a folder on your computer - not ALWAYS having to download from Huggingface. + +## What follows is the original repository's readme file. Ignore the quantization section, however, because PyTorch has changed its API since then. + # One Embedder, Any Task: Instruction-Finetuned Text Embeddings This repository contains the code and pre-trained models for our paper [One Embedder, Any Task: Instruction-Finetuned Text Embeddings](https://arxiv.org/abs/2212.09741). Please refer to our [project page](https://instructor-embedding.github.io/) for a quick project overview. From 9b8bb5889c21b6a2fdfd4fbb6e3936a39e44e328 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 17:41:46 -0400 Subject: [PATCH 08/13] Update instructor.py Thanks to @racinmat for reminding me to change this.
--- InstructorEmbedding/instructor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index 2d9fb4a..f6beee0 100644 --- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -623,7 +623,7 @@ def encode( input_was_string = True if device is None: - device = self._target_device + device = self.device self.to(device) @@ -690,4 +690,4 @@ def encode( if input_was_string: all_embeddings = all_embeddings[0] - return all_embeddings \ No newline at end of file + return all_embeddings From a4f1dec3d30c7102b8d64cedf0d26352908d74e5 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 17:43:15 -0400 Subject: [PATCH 09/13] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a7fe466..8dbe7cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ numpy requests>=2.26.0 scikit_learn>=1.0.2 scipy -sentence_transformers>=2.2.0 +sentence_transformers>=3.0.1 torch tqdm rich From 9c820eca44fac22c9a6470de88521761f7430048 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 18:03:16 -0400 Subject: [PATCH 10/13] Update requirements.txt --- requirements.txt | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8dbe7cc..2d3749d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,12 @@ -transformers==4.20.0 -datasets>=2.2.0 -pyarrow==8.0.0 -jsonlines -numpy -requests>=2.26.0 scikit_learn>=1.0.2 -scipy -sentence_transformers>=3.0.1 -torch -tqdm -rich -tensorboard -huggingface-hub>=0.19.0 +transformers>=4.20,<5.0 +datasets>=2.20,<3.0 +pyarrow>=17.0,<18.0 +numpy>=1.0,<=1.26.4 +requests>=2.26,<3.0 scikit_learn>=1.0.2 +scipy>=1.14,<2.0 +sentence-transformers>=3.0.1,<4.0 +torch>=2.0 +tqdm>=4.0,<5.0 +rich>=13.0,<14.0 +huggingface-hub==0.24.1 From 79de7a6b22b93f1998552bdadd7739e6300698a8 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 18:05:20 -0400 Subject: [PATCH 11/13] Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2d3749d..55652ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,10 +3,10 @@ datasets>=2.20,<3.0 pyarrow>=17.0,<18.0 numpy>=1.0,<=1.26.4 requests>=2.26,<3.0 -scikit_learn>=1.0.2 +scikit_learn>=1.0.2,<2.0 scipy>=1.14,<2.0 sentence-transformers>=3.0.1,<4.0 torch>=2.0 tqdm>=4.0,<5.0 rich>=13.0,<14.0 -huggingface-hub==0.24.1 +huggingface-hub>=0.24.1 From 882336b449c5df41cd84063b3daf5b563aa4d94f Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 18:09:58 -0400 Subject: [PATCH 12/13] Update README.md --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4288d09..8e3d3b3 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,9 @@ This is a fork of the Instructor model because the original repository is no longer maintained. I've also made some improvements to their source code: -1) Fixing it to work with versions of the ```sentence-transformers``` library after version 2.2.2 - > The original only supports version 2.2.2. -2) Properly download the models from huggingface. -3) Ability to use the model downloaded to a folder on your computer - not ALWAYS having to download from Huggingface. +1) Fixing it to work with the ```sentence-transformers``` library above 2.2.2.
+2) Properly download the models from huggingface using the new "snapshot download" API. +3) Ability to specify where you want the model downloaded with the "cache_dir" parameter. ## What follows is the original repository's readme file. Ignore the quantization section, however, because PyTorch has changed its API since then. # One Embedder, Any Task: Instruction-Finetuned Text Embeddings This repository contains the code and pre-trained models for our paper [One Embedder, Any Task: Instruction-Finetuned Text Embeddings](https://arxiv.org/abs/2212.09741). Please refer to our [project page](https://instructor-embedding.github.io/) for a quick project overview. From ae975c402e255e21e33339b242d3ebcd46cd5fa5 Mon Sep 17 00:00:00 2001 From: BBC-Esq Date: Sat, 24 Aug 2024 18:34:01 -0400 Subject: [PATCH 13/13] Update instructor.py --- InstructorEmbedding/instructor.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py index 0631902..72b3df2 100644 --- a/InstructorEmbedding/instructor.py +++ b/InstructorEmbedding/instructor.py @@ -23,7 +23,7 @@ def batch_to_device(batch, target_device: str): return batch -class INSTRUCTOR_Pooling(nn.Module): +class INSTRUCTORPooling(nn.Module): """Performs pooling (max or mean) on the token embeddings. Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. @@ -245,7 +245,7 @@ def load(input_path): ) as config_file: config = json.load(config_file) - return INSTRUCTOR_Pooling(**config) + return INSTRUCTORPooling(**config) def import_from_string(dotted_path): @@ -536,13 +536,6 @@ def _load_sbert_model(self, model_path, token=None, cache_folder=None, revision= "cache_dir": cache_folder, "tqdm_class": disabled_tqdm, } - # Try to download from the remote - try: - model_path = snapshot_download(**download_kwargs) - except Exception: - # Otherwise, try local (i.e. cache) only - download_kwargs["local_files_only"] = True - model_path = snapshot_download(**download_kwargs) # Check if the config_sentence_transformers.json file exists (exists since v2 of the framework) config_sentence_transformers_json_path = os.path.join( @@ -573,7 +566,7 @@ def _load_sbert_model(self, model_path, token=None, cache_folder=None, revision= if module_config["idx"] == 0: module_class = INSTRUCTORTransformer elif module_config["idx"] == 1: - module_class = INSTRUCTOR_Pooling + module_class = INSTRUCTORPooling else: module_class = import_from_string(module_config["type"]) module = module_class.load(os.path.join(model_path, module_config["path"]))
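Taken together, the series leaves INSTRUCTOR loadable either from a local folder or from the Hugging Face Hub. Below is a minimal usage sketch, not part of any patch above: the model ID, local path, and instruction text are illustrative, and `cache_folder` is the public `sentence-transformers` constructor argument behind the README's "cache_dir" note.

```python
# Minimal sketch of the two loading paths this series targets; the paths and
# model ID below are examples, not values taken from the patches.
import os

from InstructorEmbedding import INSTRUCTOR

local_dir = "./models/instructor-large"  # hypothetical local copy

if os.path.isdir(local_dir):
    # Local directory: _load_sbert_model sees os.path.isdir(model_path) is True
    # and loads the modules directly, without touching the Hub.
    model = INSTRUCTOR(local_dir)
else:
    # Hub ID: resolved through huggingface_hub's snapshot_download (patches
    # 02/03), with cache_folder controlling where the snapshot is stored.
    model = INSTRUCTOR("hkunlp/instructor-large", cache_folder="./models")

# INSTRUCTOR models embed [instruction, text] pairs rather than bare strings.
embeddings = model.encode(
    [["Represent the science title:", "One Embedder, Any Task"]]
)
print(embeddings.shape)  # e.g. (1, 768) for instructor-large
```

Because `encode` now resolves its device through the public `self.device` property (patches 03 and 08), the same call runs unchanged on CPU and CUDA builds of torch.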