Merge pull request #82 from KennethEnevoldsen/add-retrieval-swedn
feat: Added SwednRetrieval task
x-tabdeveloping authored Jan 23, 2024
2 parents 91e64c6 + 7fe3371 commit d5f959d
Showing 6 changed files with 96 additions and 7 deletions.
@@ -0,0 +1 @@
{"task_name":"SwednRetrieval","task_description":"News Article Summary Semantic Similarity Estimation.","task_version":"0.0.1","time_of_run":"2024-01-22T21:05:10.182137","scores":{"sv":{"ndcg_at_1":0.549,"ndcg_at_3":0.52531,"ndcg_at_5":0.57173,"ndcg_at_10":0.60683,"ndcg_at_100":0.65156,"ndcg_at_1000":0.66375,"map_at_1":0.2745,"map_at_3":0.46698,"map_at_5":0.50327,"map_at_10":0.5235,"map_at_100":0.5362,"map_at_1000":0.5368,"recall_at_1":0.2745,"recall_at_3":0.53445,"recall_at_5":0.62563,"recall_at_10":0.71389,"recall_at_100":0.88598,"recall_at_1000":0.96569,"precision_at_1":0.549,"precision_at_3":0.3563,"precision_at_5":0.25025,"precision_at_10":0.14278,"precision_at_100":0.01772,"precision_at_1000":0.00193,"mrr_at_1":0.54873,"mrr_at_3":0.63218,"mrr_at_5":0.64433,"mrr_at_10":0.65131,"mrr_at_100":0.65544,"mrr_at_1000":0.65564}},"main_score":"ndcg_at_10"}
@@ -0,0 +1 @@
{"task_name":"SwednRetrieval","task_description":"News Article Summary Semantic Similarity Estimation.","task_version":"0.0.1","time_of_run":"2024-01-22T21:13:25.659702","scores":{"sv":{"ndcg_at_1":0.5223,"ndcg_at_3":0.50655,"ndcg_at_5":0.54827,"ndcg_at_10":0.58265,"ndcg_at_100":0.62707,"ndcg_at_1000":0.64137,"map_at_1":0.26115,"map_at_3":0.45145,"map_at_5":0.48354,"map_at_10":0.50308,"map_at_100":0.51484,"map_at_1000":0.51554,"recall_at_1":0.26115,"recall_at_3":0.51682,"recall_at_5":0.59907,"recall_at_10":0.68478,"recall_at_100":0.85808,"recall_at_1000":0.9518,"precision_at_1":0.5223,"precision_at_3":0.34455,"precision_at_5":0.23963,"precision_at_10":0.13696,"precision_at_100":0.01716,"precision_at_1000":0.0019,"mrr_at_1":0.5223,"mrr_at_3":0.60516,"mrr_at_5":0.61691,"mrr_at_10":0.62487,"mrr_at_100":0.62973,"mrr_at_1000":0.62996}},"main_score":"ndcg_at_10"}
@@ -0,0 +1 @@
{"task_name":"SwednRetrieval","task_description":"News Article Summary Semantic Similarity Estimation.","task_version":"0.0.1","time_of_run":"2024-01-22T20:47:16.683158","scores":{"sv":{"ndcg_at_1":0.18425,"ndcg_at_3":0.15957,"ndcg_at_5":0.17453,"ndcg_at_10":0.19174,"ndcg_at_100":0.23454,"ndcg_at_1000":0.27441,"map_at_1":0.09212,"map_at_3":0.12915,"map_at_5":0.13795,"map_at_10":0.14589,"map_at_100":0.15436,"map_at_1000":0.15585,"recall_at_1":0.09212,"recall_at_3":0.15714,"recall_at_5":0.18678,"recall_at_10":0.23004,"recall_at_100":0.40427,"recall_at_1000":0.6741,"precision_at_1":0.18425,"precision_at_3":0.10476,"precision_at_5":0.07471,"precision_at_10":0.04601,"precision_at_100":0.00809,"precision_at_1000":0.00135,"mrr_at_1":0.18398,"mrr_at_3":0.22474,"mrr_at_5":0.2342,"mrr_at_10":0.24161,"mrr_at_100":0.24946,"mrr_at_1000":0.25047}},"main_score":"ndcg_at_10"}
4 changes: 2 additions & 2 deletions src/seb/registered_models/e5_models.py
@@ -26,12 +26,12 @@ def encode(
        self,
        sentences: list[str],
        *,
        task: Task,  # noqa: ARG002
        task: Task,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> ArrayLike:
        sentences = self.preprocess(sentences)
        return self.mdl.encode(sentences, batch_size=batch_size, **kwargs)  # type: ignore
        return self.mdl.encode(sentences, batch_size=batch_size, task=task, **kwargs)  # type: ignore


# English
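For context on the e5_models.py change above: the encoder now forwards the task object to the wrapped model instead of ignoring it (the noqa: ARG002 marker is gone). One plausible use for such an argument in an E5-style setup is choosing between the "query: " and "passage: " prefixes that E5 models are trained with. The sketch below is an illustration under that assumption, not the repository's implementation; the class name and the task_type attribute are hypothetical.

from typing import Any


class PrefixedE5Encoder:
    """Hypothetical wrapper: prepends an E5-style prefix chosen from the task."""

    def __init__(self, model: Any) -> None:
        # `model` is any encoder exposing .encode(sentences, batch_size=...),
        # e.g. a sentence-transformers model.
        self.model = model

    def encode(self, sentences: list[str], *, task: Any = None, batch_size: int = 32, **kwargs: Any):
        # E5 models expect "query: "/"passage: " prefixes; a retrieval corpus would
        # typically get "passage: ", everything else "query: ".
        # `task_type` is an assumed attribute name, not taken from the repository.
        prefix = "passage: " if getattr(task, "task_type", None) == "Retrieval" else "query: "
        return self.model.encode([prefix + s for s in sentences], batch_size=batch_size, **kwargs)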
79 changes: 77 additions & 2 deletions src/seb/registered_tasks/mteb_tasks.py
@@ -155,12 +155,11 @@ def description(self) -> dict[str, Any]:
            "hf_hub_name": "sbx/superlim-2",
            "description": "News Article Summary Semantic Similarity Estimation.",
            "reference": "https://spraakbanken.gu.se/en/resources/swedn",
            "type": "STS",
            "type": "Retrieval",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["sv"],
            "main_score": "spearman",
            # All pairs are considered to be semantically similar (score=1)
            "min_score": 0,
            "max_score": 1,
            "revision": "ef1661775d746e0844b299164773db733bdc0bf6",
@@ -179,6 +178,81 @@ def sattolo_cycle(items: list[T]) -> list[T]:
    return items


class SwednRetrieval(AbsTaskRetrieval):
    @property
    def description(self) -> dict[str, Any]:
        return {
            "name": "Swedn",
            "hf_hub_name": "sbx/superlim-2",
            "description": "News Article Summary Semantic Similarity Estimation.",
            "reference": "https://spraakbanken.gu.se/en/resources/swedn",
            "type": "STS",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["sv"],
            "main_score": "ndcg_at_10",
            "revision": "ef1661775d746e0844b299164773db733bdc0bf6",
        }

    def load_data(self, **kwargs: dict):  # noqa: ARG002
        """
        Load the dataset from the Hugging Face hub.
        """
        if self.data_loaded:
            return

        self.dataset: datasets.DatasetDict = datasets.load_dataset(
            self.description["hf_hub_name"],
            "swedn",  # choose the relevant subset
            revision=self.description.get("revision"),
        )  # type: ignore

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """
        Transform the dataset into a retrieval dataset with the following attributes:

        self.corpus = Dict[doc_id, Dict[str, str]]  # id => dict with document data such as title and text
        self.queries = Dict[query_id, str]  # id => query
        self.relevant_docs = Dict[query_id, Dict[doc_id, score]]
        """
        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}
        text2id = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            self.queries[split] = {}
            self.relevant_docs[split] = {}
            self.corpus[split] = {}

            headline = ds["headline"]
            summary = ds["summary"]
            article = ds["article"]

            n = 0
            for headl, summ, art in zip(headline, summary, article):
                self.queries[split][str(n)] = headl
                q_n = n
                n += 1
                if summ not in text2id:
                    text2id[summ] = n
                    self.corpus[split][str(n)] = {"title": "", "text": summ}
                    n += 1
                if art not in text2id:
                    text2id[art] = n
                    self.corpus[split][str(n)] = {"title": "", "text": art}
                    n += 1
                cor_n = text2id[art]

                self.relevant_docs[split][str(q_n)] = {
                    str(text2id[art]): 1, str(text2id[summ]): 1
                }  # only two correct matches


class NorwegianCourtsBitextMining(AbsTaskBitextMining):
    @property
    def description(self) -> dict[str, Any]:
@@ -216,3 +290,4 @@ def dataset_transform(self) -> None:
        # Convert to standard format
        self.dataset = self.dataset.rename_column("nb", "sentence1")
        self.dataset = self.dataset.rename_column("nn", "sentence2")
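To make the mapping in SwednRetrieval.dataset_transform above concrete: each headline becomes a query, and both the corresponding summary and the full article are added to the corpus and marked relevant to it. A hand-built miniature of the structures it produces for one split, with invented strings and ids following the running counter n:

# Miniature of the structures dataset_transform builds (illustrative values only).
queries = {"test": {"0": "Regeringen presenterar ny budget"}}

corpus = {
    "test": {
        "1": {"title": "", "text": "Budgeten innehåller satsningar på skolan ..."},  # summary
        "2": {"title": "", "text": "Regeringen presenterade på onsdagen hela ..."},  # article
    }
}

# The headline (query "0") has exactly two relevant documents: its article and summary.
relevant_docs = {"test": {"0": {"2": 1, "1": 1}}}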

17 changes: 14 additions & 3 deletions src/seb/registered_tasks/swedish.py
@@ -36,12 +36,23 @@ def create_swefaq() -> Task:


# temporarily disabled - will be added back in the future (along with the new datasets)
# @tasks.register("Swedn")
def create_swedn() -> Task:
# @tasks.register("SwednSTS")
def create_swedn_sts() -> Task:
    from seb.registered_tasks.mteb_tasks import SwednSummarizationSTS

    task = MTEBTask(SwednSummarizationSTS())
    task.name = "Swedn"
    task.name = "SwednSTS"
    task.version = "0.0.1"
    task.domain = ["non-fiction", "news"]
    return task


# @tasks.register("SwednRetrieval")
def create_swedn_retrieval() -> Task:
    from seb.registered_tasks.mteb_tasks import SwednRetrieval

    task = MTEBTask(SwednRetrieval())
    task.name = "SwednRetrieval"
    task.version = "0.0.1"
    task.domain = ["non-fiction", "news"]
    return task
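Both factory functions above are still commented out of the task registry. A rough usage sketch, assuming they are later re-enabled or called directly, and that the import path matches the file shown:

# Assumes the module path matches src/seb/registered_tasks/swedish.py; the
# @tasks.register decorator is still disabled, so the factory is called directly.
from seb.registered_tasks.swedish import create_swedn_retrieval

task = create_swedn_retrieval()
print(task.name, task.version)  # SwednRetrieval 0.0.1
print(task.domain)              # ['non-fiction', 'news']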
