Skip to content

Commit

Permalink
Merge branch 'add-models-and-muni-code' of https://github.com/Kenneth…
Browse files Browse the repository at this point in the history
  • Loading branch information
Your Name committed Jul 18, 2024
2 parents 3703bac + 6d03507 commit 8f523f1
Show file tree
Hide file tree
Showing 13 changed files with 18 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ The following tables contain descriptions of all the datasets in the benchmark al
| [SwednClustering](https://spraakbanken.gu.se/en/resources/swedn) | CC-BY-4.0 |
| [SwednRetrieval](https://spraakbanken.gu.se/en/resources/swedn) | CC-BY-4.0 |
| [TV2Nord Retrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) | Apache 2.0 |
| [Twitterhjerne](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) | Upcoming |
| [Twitterhjerne](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) | CC-BY-4.0 |
| [VG Clustering](https://huggingface.co/datasets/navjordj/VG_summarization) | CC-BY-NC |

## Dataset Disclaimer
Expand Down
2 changes: 1 addition & 1 deletion src/seb/cache/bge-m3/DKHate.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"task_name":"DKHate","task_description":"Danish Tweets annotated for Hate Speech either being Offensive or not","task_version":"1.1.1","time_of_run":"2024-07-18T16:37:32.748464","scores":{"da":{"accuracy":0.6860182370820669,"f1":0.5632807409542002,"ap":0.19409679978286035,"accuracy_stderr":0.06900499397550017,"f1_stderr":0.055428928758052075,"ap_stderr":0.03746255122643311,"main_score":0.6860182370820669}},"main_score":"accuracy"}
{"task_name":"DKHate","task_description":"Danish Tweets annotated for Hate Speech either being Offensive or not","task_version":"1.1.1","time_of_run":"2024-07-18T16:37:32.748464","scores":{"da":{"accuracy":0.6860182370820669,"f1":0.5632807409542002,"ap":0.19409679978286035,"accuracy_stderr":0.06900499397550017,"f1_stderr":0.055428928758052075,"ap_stderr":0.03746255122643311,"main_score":0.6860182370820669}},"main_score":"accuracy"}
2 changes: 1 addition & 1 deletion src/seb/cache/bge-m3/Da_Political_Comments.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"task_name":"Da Political Comments","task_description":"A dataset of Danish political comments rated for sentiment","task_version":"1.1.1","time_of_run":"2024-07-18T16:38:33.297933","scores":{"da":{"accuracy":0.41409544950055494,"f1":0.39094114617581444,"accuracy_stderr":0.030433532391366303,"f1_stderr":0.02461312406649866,"main_score":0.41409544950055494}},"main_score":"accuracy"}
{"task_name":"Da Political Comments","task_description":"A dataset of Danish political comments rated for sentiment","task_version":"1.1.1","time_of_run":"2024-07-18T16:38:33.297933","scores":{"da":{"accuracy":0.41409544950055494,"f1":0.39094114617581444,"accuracy_stderr":0.030433532391366303,"f1_stderr":0.02461312406649866,"main_score":0.41409544950055494}},"main_score":"accuracy"}
2 changes: 1 addition & 1 deletion src/seb/cache/bge-m3/DanFEVER.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"task_name":"DanFEVER","task_description":"A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.","task_version":"1.1.1","time_of_run":"2024-07-18T16:42:51.817564","scores":{"da":{"ndcg_at_1":0.31524,"ndcg_at_3":0.39278,"ndcg_at_5":0.40088,"ndcg_at_10":0.40437,"ndcg_at_100":0.40604,"ndcg_at_1000":0.40609,"map_at_1":0.31508,"map_at_3":0.37438,"map_at_5":0.37893,"map_at_10":0.38042,"map_at_100":0.38081,"map_at_1000":0.38081,"recall_at_1":0.31508,"recall_at_3":0.44563,"recall_at_5":0.46509,"recall_at_10":0.4756,"recall_at_100":0.48298,"recall_at_1000":0.48329,"precision_at_1":0.31524,"precision_at_3":0.14865,"precision_at_5":0.09308,"precision_at_10":0.04759,"precision_at_100":0.00483,"precision_at_1000":0.00048,"mrr_at_1":0.31524,"mrr_at_3":0.3745,"mrr_at_5":0.37905,"mrr_at_10":0.38054,"mrr_at_100":0.3809,"mrr_at_1000":0.38091}},"main_score":"ndcg_at_10"}
{"task_name":"DanFEVER","task_description":"A Danish dataset intended for misinformation research. It follows the same format as the English FEVER dataset.","task_version":"1.1.1","time_of_run":"2024-07-18T16:42:51.817564","scores":{"da":{"ndcg_at_1":0.31524,"ndcg_at_3":0.39278,"ndcg_at_5":0.40088,"ndcg_at_10":0.40437,"ndcg_at_100":0.40604,"ndcg_at_1000":0.40609,"map_at_1":0.31508,"map_at_3":0.37438,"map_at_5":0.37893,"map_at_10":0.38042,"map_at_100":0.38081,"map_at_1000":0.38081,"recall_at_1":0.31508,"recall_at_3":0.44563,"recall_at_5":0.46509,"recall_at_10":0.4756,"recall_at_100":0.48298,"recall_at_1000":0.48329,"precision_at_1":0.31524,"precision_at_3":0.14865,"precision_at_5":0.09308,"precision_at_10":0.04759,"precision_at_100":0.00483,"precision_at_1000":0.00048,"mrr_at_1":0.31524,"mrr_at_3":0.3745,"mrr_at_5":0.37905,"mrr_at_10":0.38054,"mrr_at_100":0.3809,"mrr_at_1000":0.38091}},"main_score":"ndcg_at_10"}
1 change: 1 addition & 0 deletions src/seb/cache/voyage-multilingual-2/DaLAJ.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"DaLAJ","task_description":"A Swedish dataset for linguistic acceptability. Available as a part of Superlim.","task_version":"1.1.1","time_of_run":"2024-07-18T16:16:38.549434","scores":{"sv":{"accuracy":0.49887387387387383,"f1":0.4921083698393588,"ap":0.4994764896602698,"accuracy_stderr":0.005851522998543496,"f1_stderr":0.0066776318244648365,"ap_stderr":0.0029215131489481816,"main_score":0.49887387387387383}},"main_score":"accuracy"}
1 change: 1 addition & 0 deletions src/seb/cache/voyage-multilingual-2/NorQuad.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"NorQuad","task_description":"Human-created question for Norwegian wikipedia passages.","task_version":"0.0.1","time_of_run":"2024-07-18T16:13:51.731279","scores":{"nb":{"ndcg_at_1":0.37598,"ndcg_at_3":0.30162,"ndcg_at_5":0.3211,"ndcg_at_10":0.3391,"ndcg_at_100":0.37974,"ndcg_at_1000":0.41129,"map_at_1":0.18799,"map_at_3":0.24495,"map_at_5":0.25582,"map_at_10":0.26312,"map_at_100":0.27083,"map_at_1000":0.27195,"recall_at_1":0.18799,"recall_at_3":0.28564,"recall_at_5":0.32422,"recall_at_10":0.36914,"recall_at_100":0.53027,"recall_at_1000":0.75049,"precision_at_1":0.37598,"precision_at_3":0.19043,"precision_at_5":0.12969,"precision_at_10":0.07383,"precision_at_100":0.01061,"precision_at_1000":0.0015,"mrr_at_1":0.37598,"mrr_at_3":0.4388,"mrr_at_5":0.45233,"mrr_at_10":0.46234,"mrr_at_100":0.4721,"mrr_at_1000":0.4726}},"main_score":"ndcg_at_10"}
1 change: 1 addition & 0 deletions src/seb/cache/voyage-multilingual-2/SNL_Retrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"SNL Retrieval","task_description":"Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.","task_version":"0.0.1","time_of_run":"2024-07-18T15:55:35.028913","scores":{"nb":{"ndcg_at_1":0.93385,"ndcg_at_3":0.95943,"ndcg_at_5":0.96427,"ndcg_at_10":0.96549,"ndcg_at_100":0.96689,"ndcg_at_1000":0.96708,"map_at_1":0.93385,"map_at_3":0.95359,"map_at_5":0.95632,"map_at_10":0.95682,"map_at_100":0.95715,"map_at_1000":0.95715,"recall_at_1":0.93385,"recall_at_3":0.97615,"recall_at_5":0.98769,"recall_at_10":0.99154,"recall_at_100":0.99769,"recall_at_1000":0.99923,"precision_at_1":0.93385,"precision_at_3":0.32538,"precision_at_5":0.19754,"precision_at_10":0.09915,"precision_at_100":0.00998,"precision_at_1000":0.001,"mrr_at_1":0.93385,"mrr_at_3":0.95359,"mrr_at_5":0.95632,"mrr_at_10":0.95682,"mrr_at_100":0.95715,"mrr_at_1000":0.95715}},"main_score":"ndcg_at_10"}
1 change: 1 addition & 0 deletions src/seb/cache/voyage-multilingual-2/SweFAQ.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"SweFAQ","task_description":"A Swedish QA dataset derived from FAQ","task_version":"0.0.1","time_of_run":"2024-07-18T16:17:08.666028","scores":{"sv":{"ndcg_at_1":0.69006,"ndcg_at_3":0.80374,"ndcg_at_5":0.82379,"ndcg_at_10":0.83728,"ndcg_at_100":0.84476,"ndcg_at_1000":0.84476,"map_at_1":0.69006,"map_at_3":0.77745,"map_at_5":0.78856,"map_at_10":0.79428,"map_at_100":0.79617,"map_at_1000":0.79617,"recall_at_1":0.69006,"recall_at_3":0.87914,"recall_at_5":0.92788,"recall_at_10":0.96881,"recall_at_100":1.0,"recall_at_1000":1.0,"precision_at_1":0.69006,"precision_at_3":0.29305,"precision_at_5":0.18558,"precision_at_10":0.09688,"precision_at_100":0.01,"precision_at_1000":0.001,"mrr_at_1":0.69006,"mrr_at_3":0.77745,"mrr_at_5":0.78856,"mrr_at_10":0.79428,"mrr_at_100":0.79617,"mrr_at_1000":0.79617}},"main_score":"ndcg_at_10"}
1 change: 1 addition & 0 deletions src/seb/cache/voyage-multilingual-2/SweReC.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"SweReC","task_description":"A Swedish dataset for sentiment classification on review","task_version":"1.1.1","time_of_run":"2024-07-18T16:16:06.89844","scores":{"sv":{"accuracy":0.802880859375,"f1":0.7221135411440635,"accuracy_stderr":0.025201226065343193,"f1_stderr":0.019726276389694458,"main_score":0.802880859375}},"main_score":"accuracy"}
1 change: 1 addition & 0 deletions src/seb/cache/voyage-multilingual-2/SwednClustering.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"SwednClustering","task_description":"The SWE-DN corpus is based on 1,963,576 news articles from the Swedish newspaper Dagens Nyheter (DN) during the years 2000--2020. The articles are filtered to resemble the CNN/DailyMail dataset both regarding textual structure. This dataset uses the category labels as clusters.","task_version":"0.0.1","time_of_run":"2024-07-18T16:34:49.974055","scores":{"sv":{"v_measure":0.08902877284136113,"v_measure_std":0.09078921228238793}},"main_score":"v_measure"}
1 change: 1 addition & 0 deletions src/seb/cache/voyage-multilingual-2/SwednRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"SwednRetrieval","task_description":"News Article Summary Semantic Similarity Estimation.","task_version":"0.0.1","time_of_run":"2024-07-18T16:27:46.291172","scores":{"sv":{"ndcg_at_1":0.78125,"ndcg_at_3":0.76346,"ndcg_at_5":0.79418,"ndcg_at_10":0.81572,"ndcg_at_100":0.83475,"ndcg_at_1000":0.84056,"map_at_1":0.39062,"map_at_3":0.72778,"map_at_5":0.75259,"map_at_10":0.76568,"map_at_100":0.77138,"map_at_1000":0.77167,"recall_at_1":0.39062,"recall_at_3":0.77295,"recall_at_5":0.83301,"recall_at_10":0.88672,"recall_at_100":0.95752,"recall_at_1000":0.99512,"precision_at_1":0.78125,"precision_at_3":0.5153,"precision_at_5":0.3332,"precision_at_10":0.17734,"precision_at_100":0.01915,"precision_at_1000":0.00199,"mrr_at_1":0.78125,"mrr_at_3":0.82438,"mrr_at_5":0.83112,"mrr_at_10":0.83441,"mrr_at_100":0.83651,"mrr_at_1000":0.83662}},"main_score":"ndcg_at_10"}
1 change: 1 addition & 0 deletions src/seb/registered_models/sentence_transformer_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,7 @@ def create_use_cmlm_multilingual() -> SebModel:

if __name__ == "__main__":
import seb

model = seb.get_model("mxbai-embed-large-v1")
test = model.encoder.encode(["Hello world", "test"])
test.shape
9 changes: 6 additions & 3 deletions src/seb/registered_models/voyage_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def __init__(
**kwargs: Any, # noqa: ARG002
) -> None:
try:
import voyageai
import voyageai # type: ignore
except ImportError as e:
raise ImportError("Please install voyageai to use this model using `pip install 'seb[voyageai]'`") from e

Expand Down Expand Up @@ -115,13 +115,15 @@ def _batched_encode(
batch_size: int,
input_type: Literal["query", "document"],
) -> np.ndarray:
batch_size = 32 # The used version on MTEB accidentally overwrites the batch_size parameter
embeddings, index = [], 0

while index <= len(sentences) - 1:
batch, batch_tokens = [], 0
while index < len(sentences) and len(batch) < batch_size and batch_tokens < self._max_tpm:
batch_tokens += len(self._client.tokenize([sentences[index]], model=self._model_name))
n_tokens = len(self._client.tokenize([sentences[index]], model=self._model_name)[0])
if batch_tokens + n_tokens > self._max_tpm:
break
batch_tokens += n_tokens
batch.append(sentences[index])
index += 1

Expand All @@ -130,6 +132,7 @@ def _batched_encode(
texts=batch,
model=self._model_name,
input_type=input_type,
truncation=True,
).embeddings
)

Expand Down

0 comments on commit 8f523f1

Please sign in to comment.