From 8a883c4255864c9ba8d567b8e3fa5e42236642e0 Mon Sep 17 00:00:00 2001
From: Lasse Hansen
Date: Mon, 31 Jul 2023 10:29:17 +0200
Subject: [PATCH 1/2] docs: minor updates to index page

---
 docs/index.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index f5394edc..b461e66e 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -6,13 +6,13 @@ hide:
 
 # Scandinavian Embedding Benchmark
 
-This is the documentation for the Scandinavian Embedding Benchmark. This benchmark is intended to evaluate the sentence/documents embeddings of large language models.
+This is the documentation for the Scandinavian Embedding Benchmark. This benchmark is intended to evaluate the sentence/document embeddings of large language models.
 
 Intended uses for this benchmark:
 
 - Evaluating document embeddings of Scandinavian language models
 - Evaluating document embeddings for multilingual models on Scandinavian languages
-- Allow ranking of competing Scandinavian and multilingual models using no more compute that what a consumer laptop can provide
+- Allow ranking of competing Scandinavian and multilingual models using no more compute than what a consumer laptop can provide
 
 === "All"
 
@@ -34,9 +34,9 @@ Intended uses for this benchmark:
 
 ## Comparison to other benchmarks
 
-If you use this benchmark for a relative ranking of language models you should also take a look at [ScandEval](https://scandeval.github.io), which as opposed the this benchmark fully fine-tunes the models. It also includes structured predictions tasks such as named entity recognition. Many of the tasks in this embeddings benchmark is also included in ScandEval. A notable difference between the ScandEval and this benchmark is that it does not include machine translated tasks.
+If you use this benchmark for a relative ranking of language models you should also look at [ScandEval](https://scandeval.github.io), which, as opposed to this benchmark, fully fine-tunes the models. It also includes structured prediction tasks such as named entity recognition. Many of the tasks in this embedding benchmark are also included in ScandEval. A notable difference between ScandEval and this benchmark is that this one does not include machine-translated tasks.
 
-The tasks within this benchmark is also included in the [MTEB](https://huggingface.co/spaces/mteb/leaderboard) leaderboard, though the aggregations methods very slightly. The MTEB is primarily an English embedding benchmark, with a few multilingual tasks along with a few additional languages. As a part of this project the tasks was also added to the MTEB leaderboard.
+The tasks within this benchmark are also included in the [MTEB](https://huggingface.co/spaces/mteb/leaderboard) leaderboard, though the aggregation methods vary slightly. MTEB is primarily an English embedding benchmark, with a few multilingual tasks and additional languages. As a part of this project, the tasks were also added to the MTEB leaderboard.
 
From 6e4ae7afb9829984d62798d2e5657b0d88c93fb8 Mon Sep 17 00:00:00 2001
From: Lasse Hansen
Date: Mon, 31 Jul 2023 10:37:55 +0200
Subject: [PATCH 2/2] feat: add multilingual sentence transformer

---
 src/seb/seb_models.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/seb/seb_models.py b/src/seb/seb_models.py
index c6dab544..66de289e 100644
--- a/src/seb/seb_models.py
+++ b/src/seb/seb_models.py
@@ -41,6 +41,19 @@ def create_all_mini_lm_l6_v2() -> SebModel:
         meta=meta,
     )
 
+@models.register("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+def create_multilingual_mini_lm_l12_v2() -> SebModel:
+    hf_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+    meta = ModelMeta(
+        name=hf_name.split("/")[-1],
+        huggingface_name=hf_name,
+        reference=f"https://huggingface.co/{hf_name}",
+        languages=[],
+    )
+    return SebModel(
+        loader=partial(get_sentence_transformer, model_name=hf_name),  # type: ignore
+        meta=meta,
+    )
 
 @models.register("KBLab/sentence-bert-swedish-cased")
 def create_sentence_swedish_cased() -> SebModel:
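Note (not part of the patch): a minimal usage sketch of what the loader registered above ultimately wraps, assuming the sentence-transformers package is installed. Only the model name is taken from the patch; the rest is illustrative and does not go through seb's registry.

# Minimal sketch, assuming sentence-transformers is installed; illustrative only.
from sentence_transformers import SentenceTransformer

# Same model name as registered in the patch above.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Encode a few Scandinavian sentences into dense vectors.
sentences = [
    "Dette er en dansk sætning.",
    "Detta är en svensk mening.",
    "Dette er en norsk setning.",
]
embeddings = model.encode(sentences)
print(embeddings.shape)  # (3, 384) for this MiniLM-L12 model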