Merge pull request #82 from KennethEnevoldsen/add-retrieval-swedn
feat: Added SwednRetrieval task
x-tabdeveloping authored Jan 23, 2024
2 parents 91e64c6 + 7fe3371 commit d5f959d
Showing 6 changed files with 96 additions and 7 deletions.
@@ -0,0 +1 @@
{"task_name":"SwednRetrieval","task_description":"News Article Summary Semantic Similarity Estimation.","task_version":"0.0.1","time_of_run":"2024-01-22T21:05:10.182137","scores":{"sv":{"ndcg_at_1":0.549,"ndcg_at_3":0.52531,"ndcg_at_5":0.57173,"ndcg_at_10":0.60683,"ndcg_at_100":0.65156,"ndcg_at_1000":0.66375,"map_at_1":0.2745,"map_at_3":0.46698,"map_at_5":0.50327,"map_at_10":0.5235,"map_at_100":0.5362,"map_at_1000":0.5368,"recall_at_1":0.2745,"recall_at_3":0.53445,"recall_at_5":0.62563,"recall_at_10":0.71389,"recall_at_100":0.88598,"recall_at_1000":0.96569,"precision_at_1":0.549,"precision_at_3":0.3563,"precision_at_5":0.25025,"precision_at_10":0.14278,"precision_at_100":0.01772,"precision_at_1000":0.00193,"mrr_at_1":0.54873,"mrr_at_3":0.63218,"mrr_at_5":0.64433,"mrr_at_10":0.65131,"mrr_at_100":0.65544,"mrr_at_1000":0.65564}},"main_score":"ndcg_at_10"}
@@ -0,0 +1 @@
{"task_name":"SwednRetrieval","task_description":"News Article Summary Semantic Similarity Estimation.","task_version":"0.0.1","time_of_run":"2024-01-22T21:13:25.659702","scores":{"sv":{"ndcg_at_1":0.5223,"ndcg_at_3":0.50655,"ndcg_at_5":0.54827,"ndcg_at_10":0.58265,"ndcg_at_100":0.62707,"ndcg_at_1000":0.64137,"map_at_1":0.26115,"map_at_3":0.45145,"map_at_5":0.48354,"map_at_10":0.50308,"map_at_100":0.51484,"map_at_1000":0.51554,"recall_at_1":0.26115,"recall_at_3":0.51682,"recall_at_5":0.59907,"recall_at_10":0.68478,"recall_at_100":0.85808,"recall_at_1000":0.9518,"precision_at_1":0.5223,"precision_at_3":0.34455,"precision_at_5":0.23963,"precision_at_10":0.13696,"precision_at_100":0.01716,"precision_at_1000":0.0019,"mrr_at_1":0.5223,"mrr_at_3":0.60516,"mrr_at_5":0.61691,"mrr_at_10":0.62487,"mrr_at_100":0.62973,"mrr_at_1000":0.62996}},"main_score":"ndcg_at_10"}
@@ -0,0 +1 @@
{"task_name":"SwednRetrieval","task_description":"News Article Summary Semantic Similarity Estimation.","task_version":"0.0.1","time_of_run":"2024-01-22T20:47:16.683158","scores":{"sv":{"ndcg_at_1":0.18425,"ndcg_at_3":0.15957,"ndcg_at_5":0.17453,"ndcg_at_10":0.19174,"ndcg_at_100":0.23454,"ndcg_at_1000":0.27441,"map_at_1":0.09212,"map_at_3":0.12915,"map_at_5":0.13795,"map_at_10":0.14589,"map_at_100":0.15436,"map_at_1000":0.15585,"recall_at_1":0.09212,"recall_at_3":0.15714,"recall_at_5":0.18678,"recall_at_10":0.23004,"recall_at_100":0.40427,"recall_at_1000":0.6741,"precision_at_1":0.18425,"precision_at_3":0.10476,"precision_at_5":0.07471,"precision_at_10":0.04601,"precision_at_100":0.00809,"precision_at_1000":0.00135,"mrr_at_1":0.18398,"mrr_at_3":0.22474,"mrr_at_5":0.2342,"mrr_at_10":0.24161,"mrr_at_100":0.24946,"mrr_at_1000":0.25047}},"main_score":"ndcg_at_10"}
4 changes: 2 additions & 2 deletions src/seb/registered_models/e5_models.py
@@ -26,12 +26,12 @@ def encode(
        self,
        sentences: list[str],
        *,
        task: Task,  # noqa: ARG002
        task: Task,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> ArrayLike:
        sentences = self.preprocess(sentences)
        return self.mdl.encode(sentences, batch_size=batch_size, **kwargs)  # type: ignore
        return self.mdl.encode(sentences, batch_size=batch_size, task=task, **kwargs)  # type: ignore


# English
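For context on the e5_models.py change above: the encoder now forwards the task object to the wrapped model instead of ignoring it (the noqa: ARG002 marker is gone). One plausible use for such an argument in an E5-style setup is choosing between the "query: " and "passage: " prefixes that E5 models are trained with. The sketch below is an illustration under that assumption, not the repository's implementation; the class name and the task_type attribute are hypothetical.

from typing import Any


class PrefixedE5Encoder:
    """Hypothetical wrapper: prepends an E5-style prefix chosen from the task."""

    def __init__(self, model: Any) -> None:
        # `model` is any encoder exposing .encode(sentences, batch_size=...),
        # e.g. a sentence-transformers model.
        self.model = model

    def encode(self, sentences: list[str], *, task: Any = None, batch_size: int = 32, **kwargs: Any):
        # E5 models expect "query: "/"passage: " prefixes; a retrieval corpus would
        # typically get "passage: ", everything else "query: ".
        # `task_type` is an assumed attribute name, not taken from the repository.
        prefix = "passage: " if getattr(task, "task_type", None) == "Retrieval" else "query: "
        return self.model.encode([prefix + s for s in sentences], batch_size=batch_size, **kwargs)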
79 changes: 77 additions & 2 deletions src/seb/registered_tasks/mteb_tasks.py
@@ -155,12 +155,11 @@ def description(self) -> dict[str, Any]:
            "hf_hub_name": "sbx/superlim-2",
            "description": "News Article Summary Semantic Similarity Estimation.",
            "reference": "https://spraakbanken.gu.se/en/resources/swedn",
            "type": "STS",
            "type": "Retrieval",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["sv"],
            "main_score": "spearman",
            # All pairs are considered to be semantically similar (score=1)
            "min_score": 0,
            "max_score": 1,
            "revision": "ef1661775d746e0844b299164773db733bdc0bf6",
@@ -179,6 +178,81 @@ def sattolo_cycle(items: list[T]) -> list[T]:
    return items


class SwednRetrieval(AbsTaskRetrieval):
    @property
    def description(self) -> dict[str, Any]:
        return {
            "name": "Swedn",
            "hf_hub_name": "sbx/superlim-2",
            "description": "News Article Summary Semantic Similarity Estimation.",
            "reference": "https://spraakbanken.gu.se/en/resources/swedn",
            "type": "STS",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["sv"],
            "main_score": "ndcg_at_10",
            "revision": "ef1661775d746e0844b299164773db733bdc0bf6",
        }

    def load_data(self, **kwargs: dict):  # noqa: ARG002
        """
        Load the dataset from the Hugging Face hub.
        """
        if self.data_loaded:
            return

        self.dataset: datasets.DatasetDict = datasets.load_dataset(
            self.description["hf_hub_name"],
            "swedn",  # choose the relevant subset
            revision=self.description.get("revision"),
        )  # type: ignore

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """
        Transform the dataset into a retrieval dataset with the following attributes:

        self.corpus = Dict[doc_id, Dict[str, str]]  # id => dict with document data such as title and text
        self.queries = Dict[query_id, str]  # id => query
        self.relevant_docs = Dict[query_id, Dict[doc_id, score]]
        """
        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}
        text2id = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            self.queries[split] = {}
            self.relevant_docs[split] = {}
            self.corpus[split] = {}

            headline = ds["headline"]
            summary = ds["summary"]
            article = ds["article"]

            n = 0
            for headl, summ, art in zip(headline, summary, article):
                self.queries[split][str(n)] = headl
                q_n = n
                n += 1
                if summ not in text2id:
                    text2id[summ] = n
                    self.corpus[split][str(n)] = {"title": "", "text": summ}
                    n += 1
                if art not in text2id:
                    text2id[art] = n
                    self.corpus[split][str(n)] = {"title": "", "text": art}
                    n += 1
                cor_n = text2id[art]

                self.relevant_docs[split][str(q_n)] = {
                    str(text2id[art]): 1, str(text2id[summ]): 1
                }  # only two correct matches


class NorwegianCourtsBitextMining(AbsTaskBitextMining):
    @property
    def description(self) -> dict[str, Any]:
@@ -216,3 +290,4 @@ def dataset_transform(self) -> None:
        # Convert to standard format
        self.dataset = self.dataset.rename_column("nb", "sentence1")
        self.dataset = self.dataset.rename_column("nn", "sentence2")
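To make the mapping in SwednRetrieval.dataset_transform above concrete: each headline becomes a query, and both the corresponding summary and the full article are added to the corpus and marked relevant to it. A hand-built miniature of the structures it produces for one split, with invented strings and ids following the running counter n:

# Miniature of the structures dataset_transform builds (illustrative values only).
queries = {"test": {"0": "Regeringen presenterar ny budget"}}

corpus = {
    "test": {
        "1": {"title": "", "text": "Budgeten innehåller satsningar på skolan ..."},  # summary
        "2": {"title": "", "text": "Regeringen presenterade på onsdagen hela ..."},  # article
    }
}

# The headline (query "0") has exactly two relevant documents: its article and summary.
relevant_docs = {"test": {"0": {"2": 1, "1": 1}}}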

17 changes: 14 additions & 3 deletions src/seb/registered_tasks/swedish.py
@@ -36,12 +36,23 @@ def create_swefaq() -> Task:


# temporarily disabled - will be added back in the future (along with the new datasets)
# @tasks.register("Swedn")
def create_swedn() -> Task:
# @tasks.register("SwednSTS")
def create_swedn_sts() -> Task:
    from seb.registered_tasks.mteb_tasks import SwednSummarizationSTS

    task = MTEBTask(SwednSummarizationSTS())
    task.name = "Swedn"
    task.name = "SwednSTS"
    task.version = "0.0.1"
    task.domain = ["non-fiction", "news"]
    return task


# @tasks.register("SwednRetrieval")
def create_swedn_retrieval() -> Task:
    from seb.registered_tasks.mteb_tasks import SwednRetrieval

    task = MTEBTask(SwednRetrieval())
    task.name = "SwednRetrieval"
    task.version = "0.0.1"
    task.domain = ["non-fiction", "news"]
    return task
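Both factory functions above are still commented out of the task registry. A rough usage sketch, assuming they are later re-enabled or called directly, and that the import path matches the file shown:

# Assumes the module path matches src/seb/registered_tasks/swedish.py; the
# @tasks.register decorator is still disabled, so the factory is called directly.
from seb.registered_tasks.swedish import create_swedn_retrieval

task = create_swedn_retrieval()
print(task.name, task.version)  # SwednRetrieval 0.0.1
print(task.domain)              # ['non-fiction', 'news']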
