diff --git a/src/seb/cache/BAAI__bge-m3/Angry_Tweets.json b/src/seb/cache/BAAI__bge-m3/Angry_Tweets.json new file mode 100644 index 00000000..c2425434 --- /dev/null +++ b/src/seb/cache/BAAI__bge-m3/Angry_Tweets.json @@ -0,0 +1,16 @@ +{ + "task_name": "Angry Tweets", + "task_description": "A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets", + "task_version": "1.1.1", + "time_of_run": "2024-07-18T14:17:30.210659", + "scores": { + "da": { + "accuracy": 0.5744030563514804, + "f1": 0.5639609605319712, + "accuracy_stderr": 0.0232317583970707, + "f1_stderr": 0.02040731181518541, + "main_score": 0.5744030563514804 + } + }, + "main_score": "accuracy" +} \ No newline at end of file diff --git a/src/seb/cache/bge-m3/Bornholm_Parallel.json b/src/seb/cache/BAAI__bge-m3/Bornholm_Parallel.json similarity index 100% rename from src/seb/cache/bge-m3/Bornholm_Parallel.json rename to src/seb/cache/BAAI__bge-m3/Bornholm_Parallel.json diff --git a/src/seb/cache/bge-m3/DKHate.json b/src/seb/cache/BAAI__bge-m3/DKHate.json similarity index 100% rename from src/seb/cache/bge-m3/DKHate.json rename to src/seb/cache/BAAI__bge-m3/DKHate.json diff --git a/src/seb/cache/bge-m3/Da_Political_Comments.json b/src/seb/cache/BAAI__bge-m3/Da_Political_Comments.json similarity index 100% rename from src/seb/cache/bge-m3/Da_Political_Comments.json rename to src/seb/cache/BAAI__bge-m3/Da_Political_Comments.json diff --git a/src/seb/cache/bge-m3/DanFEVER.json b/src/seb/cache/BAAI__bge-m3/DanFEVER.json similarity index 100% rename from src/seb/cache/bge-m3/DanFEVER.json rename to src/seb/cache/BAAI__bge-m3/DanFEVER.json diff --git a/src/seb/cache/bge-m3/LCC.json b/src/seb/cache/BAAI__bge-m3/LCC.json similarity index 100% rename from src/seb/cache/bge-m3/LCC.json rename to src/seb/cache/BAAI__bge-m3/LCC.json diff --git a/src/seb/cache/bge-m3/Language_Identification.json b/src/seb/cache/BAAI__bge-m3/Language_Identification.json similarity index 100% rename from src/seb/cache/bge-m3/Language_Identification.json rename to src/seb/cache/BAAI__bge-m3/Language_Identification.json diff --git a/src/seb/cache/bge-m3/Massive_Intent.json b/src/seb/cache/BAAI__bge-m3/Massive_Intent.json similarity index 100% rename from src/seb/cache/bge-m3/Massive_Intent.json rename to src/seb/cache/BAAI__bge-m3/Massive_Intent.json diff --git a/src/seb/cache/bge-m3/Massive_Scenario.json b/src/seb/cache/BAAI__bge-m3/Massive_Scenario.json similarity index 100% rename from src/seb/cache/bge-m3/Massive_Scenario.json rename to src/seb/cache/BAAI__bge-m3/Massive_Scenario.json diff --git a/src/seb/cache/bge-m3/NoReC.json b/src/seb/cache/BAAI__bge-m3/NoReC.json similarity index 100% rename from src/seb/cache/bge-m3/NoReC.json rename to src/seb/cache/BAAI__bge-m3/NoReC.json diff --git a/src/seb/cache/bge-m3/Norwegian_courts.json b/src/seb/cache/BAAI__bge-m3/Norwegian_courts.json similarity index 100% rename from src/seb/cache/bge-m3/Norwegian_courts.json rename to src/seb/cache/BAAI__bge-m3/Norwegian_courts.json diff --git a/src/seb/cache/bge-m3/Norwegian_parliament.json b/src/seb/cache/BAAI__bge-m3/Norwegian_parliament.json similarity index 100% rename from src/seb/cache/bge-m3/Norwegian_parliament.json rename to src/seb/cache/BAAI__bge-m3/Norwegian_parliament.json diff --git a/src/seb/cache/bge-m3/SNL_Clustering.json b/src/seb/cache/BAAI__bge-m3/SNL_Clustering.json similarity index 100% rename from src/seb/cache/bge-m3/SNL_Clustering.json rename to src/seb/cache/BAAI__bge-m3/SNL_Clustering.json diff --git a/src/seb/cache/bge-m3/ScaLA.json b/src/seb/cache/BAAI__bge-m3/ScaLA.json similarity index 100% rename from src/seb/cache/bge-m3/ScaLA.json rename to src/seb/cache/BAAI__bge-m3/ScaLA.json diff --git a/src/seb/cache/bge-m3/TV2Nord_Retrieval.json b/src/seb/cache/BAAI__bge-m3/TV2Nord_Retrieval.json similarity index 100% rename from src/seb/cache/bge-m3/TV2Nord_Retrieval.json rename to src/seb/cache/BAAI__bge-m3/TV2Nord_Retrieval.json diff --git a/src/seb/cache/bge-m3/Twitterhjerne.json b/src/seb/cache/BAAI__bge-m3/Twitterhjerne.json similarity index 100% rename from src/seb/cache/bge-m3/Twitterhjerne.json rename to src/seb/cache/BAAI__bge-m3/Twitterhjerne.json diff --git a/src/seb/cache/bge-m3/VG_Clustering.json b/src/seb/cache/BAAI__bge-m3/VG_Clustering.json similarity index 100% rename from src/seb/cache/bge-m3/VG_Clustering.json rename to src/seb/cache/BAAI__bge-m3/VG_Clustering.json diff --git a/src/seb/cache/bge-m3/Angry_Tweets.json b/src/seb/cache/bge-m3/Angry_Tweets.json deleted file mode 100644 index 6fff2d9a..00000000 --- a/src/seb/cache/bge-m3/Angry_Tweets.json +++ /dev/null @@ -1 +0,0 @@ -{"task_name":"Angry Tweets","task_description":"A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets","task_version":"1.1.1","time_of_run":"2024-07-18T14:17:30.210659","scores":{"da":{"accuracy":0.5744030563514804,"f1":0.5639609605319712,"accuracy_stderr":0.0232317583970707,"f1_stderr":0.02040731181518541,"main_score":0.5744030563514804}},"main_score":"accuracy"} \ No newline at end of file diff --git a/src/seb/registered_models/__init__.py b/src/seb/registered_models/__init__.py index e6ab49c8..d5dcdc4f 100644 --- a/src/seb/registered_models/__init__.py +++ b/src/seb/registered_models/__init__.py @@ -7,3 +7,4 @@ from .openai_models import * from .translate_e5_models import * from .voyage_models import * +from .bge_models import * diff --git a/src/seb/registered_models/bge_models.py b/src/seb/registered_models/bge_models.py index 2b4da7d3..5a7566ad 100644 --- a/src/seb/registered_models/bge_models.py +++ b/src/seb/registered_models/bge_models.py @@ -33,7 +33,10 @@ def encode( # type: ignore if "task" in kwargs: kwargs.pop("task") - return np.asarray(self.mdl.encode(sentences, batch_size=batch_size, **kwargs)) + if "convert_to_tensor" in kwargs: + kwargs.pop("convert_to_tensor") + + return np.asarray(self.mdl.encode(sentences, batch_size=batch_size, convert_to_numpy=True, **kwargs)) def encode_queries(self, queries: list[str], batch_size: int = 32, **kwargs: Any) -> np.ndarray: if "task" in kwargs: @@ -61,17 +64,17 @@ def encode_corpus( return emb # type: ignore -@models.register("BAAI/bge-m3") +@models.register("bge-m3") def create_bge_m3() -> SebModel: hf_name = "BAAI/bge-m3" meta = ModelMeta( - name=hf_name, + name="bge-m3", huggingface_name=hf_name, reference=f"https://huggingface.co/{hf_name}", languages=[], - open_source=False, + open_source=True, embedding_size=1024, - architecture="API", + architecture="XLM-R", release_date=date(2024, 5, 28), ) return SebModel( diff --git a/src/seb/registered_models/sentence_transformer_models.py b/src/seb/registered_models/sentence_transformer_models.py index 803c5217..44969afd 100644 --- a/src/seb/registered_models/sentence_transformer_models.py +++ b/src/seb/registered_models/sentence_transformer_models.py @@ -494,9 +494,8 @@ def create_use_cmlm_multilingual() -> SebModel: name=hf_name.split("/")[-1], huggingface_name=hf_name, reference=f"https://huggingface.co/{hf_name}", - languages=["da"], open_source=True, - embedding_size=768, + embedding_size=768, architecture="BERT", release_date=date(2022, 4, 14), )