Skip to content

Commit

Permalink
Merge pull request #110 from KennethEnevoldsen/sts-retrieval
Browse files Browse the repository at this point in the history
Add SNL retrieval
  • Loading branch information
x-tabdeveloping authored Jan 29, 2024
2 parents a32dea2 + a1216bf commit 9412d71
Show file tree
Hide file tree
Showing 13 changed files with 283 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"NorQuad","task_description":"Human-created question for Norwegian wikipedia passages.","task_version":"0.0.1","time_of_run":"2024-01-28T16:44:43.096933","scores":{"nb":{"ndcg_at_1":0.16895,"ndcg_at_3":0.13891,"ndcg_at_5":0.15254,"ndcg_at_10":0.16457,"ndcg_at_100":0.20823,"ndcg_at_1000":0.26722,"map_at_1":0.08447,"map_at_3":0.1071,"map_at_5":0.11418,"map_at_10":0.11869,"map_at_100":0.1259,"map_at_1000":0.12785,"recall_at_1":0.08447,"recall_at_3":0.13428,"recall_at_5":0.16113,"recall_at_10":0.19141,"recall_at_100":0.37354,"recall_at_1000":0.78027,"precision_at_1":0.16895,"precision_at_3":0.08952,"precision_at_5":0.06445,"precision_at_10":0.03828,"precision_at_100":0.00747,"precision_at_1000":0.00156,"mrr_at_1":0.16895,"mrr_at_3":0.20996,"mrr_at_5":0.22061,"mrr_at_10":0.22783,"mrr_at_100":0.2372,"mrr_at_1000":0.23868}},"main_score":"ndcg_at_10"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"SNL Retrieval","task_description":"Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.","task_version":"0.0.1","time_of_run":"2024-01-28T14:02:53.208017","scores":{"sv":{"ndcg_at_1":0.41211,"ndcg_at_3":0.47736,"ndcg_at_5":0.49615,"ndcg_at_10":0.51614,"ndcg_at_100":0.55546,"ndcg_at_1000":0.57716,"map_at_1":0.41211,"map_at_3":0.46143,"map_at_5":0.47178,"map_at_10":0.48009,"map_at_100":0.48722,"map_at_1000":0.48795,"recall_at_1":0.41211,"recall_at_3":0.52344,"recall_at_5":0.56934,"recall_at_10":0.63086,"recall_at_100":0.82422,"recall_at_1000":0.99902,"precision_at_1":0.41211,"precision_at_3":0.17448,"precision_at_5":0.11387,"precision_at_10":0.06309,"precision_at_100":0.00824,"precision_at_1000":0.001,"mrr_at_1":0.41211,"mrr_at_3":0.46143,"mrr_at_5":0.47178,"mrr_at_10":0.48009,"mrr_at_100":0.48722,"mrr_at_1000":0.48795}},"main_score":"ndcg_at_10"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"TV2Nord Retrieval","task_description":"News Article and corresponding summaries extracted from the Danish newspaper TV2 Nord.","task_version":"1.1.1","time_of_run":"2024-01-28T16:14:28.36046","scores":{"sv":{"ndcg_at_1":0.2373,"ndcg_at_3":0.29658,"ndcg_at_5":0.31986,"ndcg_at_10":0.3411,"ndcg_at_100":0.39784,"ndcg_at_1000":0.42564,"map_at_1":0.2373,"map_at_3":0.28182,"map_at_5":0.29483,"map_at_10":0.30346,"map_at_100":0.31382,"map_at_1000":0.31477,"recall_at_1":0.2373,"recall_at_3":0.33936,"recall_at_5":0.39551,"recall_at_10":0.46191,"recall_at_100":0.74219,"recall_at_1000":0.9668,"precision_at_1":0.2373,"precision_at_3":0.11312,"precision_at_5":0.0791,"precision_at_10":0.04619,"precision_at_100":0.00742,"precision_at_1000":0.00097,"mrr_at_1":0.2373,"mrr_at_3":0.28182,"mrr_at_5":0.29483,"mrr_at_10":0.30346,"mrr_at_100":0.31382,"mrr_at_1000":0.31477}},"main_score":"ndcg_at_10"}
1 change: 1 addition & 0 deletions src/seb/cache/text-embedding-3-small/NorQuad.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"NorQuad","task_description":"Human-created question for Norwegian wikipedia passages.","task_version":"0.0.1","time_of_run":"2024-01-28T16:44:25.563028","scores":{"nb":{"ndcg_at_1":0.70312,"ndcg_at_3":0.5303,"ndcg_at_5":0.55545,"ndcg_at_10":0.57574,"ndcg_at_100":0.62014,"ndcg_at_1000":0.64568,"map_at_1":0.35156,"map_at_3":0.44523,"map_at_5":0.46318,"map_at_10":0.47436,"map_at_100":0.48588,"map_at_1000":0.48715,"recall_at_1":0.35156,"recall_at_3":0.48877,"recall_at_5":0.53857,"recall_at_10":0.58984,"recall_at_100":0.76904,"recall_at_1000":0.94043,"precision_at_1":0.70312,"precision_at_3":0.32585,"precision_at_5":0.21543,"precision_at_10":0.11797,"precision_at_100":0.01538,"precision_at_1000":0.00188,"mrr_at_1":0.70312,"mrr_at_3":0.76123,"mrr_at_5":0.77085,"mrr_at_10":0.77574,"mrr_at_100":0.7791,"mrr_at_1000":0.77919}},"main_score":"ndcg_at_10"}
1 change: 1 addition & 0 deletions src/seb/cache/text-embedding-3-small/SNL_Retrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"SNL Retrieval","task_description":"Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.","task_version":"0.0.1","time_of_run":"2024-01-28T14:02:29.093561","scores":{"sv":{"ndcg_at_1":0.88672,"ndcg_at_3":0.91563,"ndcg_at_5":0.92131,"ndcg_at_10":0.9271,"ndcg_at_100":0.93219,"ndcg_at_1000":0.93347,"map_at_1":0.88672,"map_at_3":0.90902,"map_at_5":0.91219,"map_at_10":0.91465,"map_at_100":0.91562,"map_at_1000":0.91567,"recall_at_1":0.88672,"recall_at_3":0.93457,"recall_at_5":0.94824,"recall_at_10":0.96582,"recall_at_100":0.99023,"recall_at_1000":1.0,"precision_at_1":0.88672,"precision_at_3":0.31152,"precision_at_5":0.18965,"precision_at_10":0.09658,"precision_at_100":0.0099,"precision_at_1000":0.001,"mrr_at_1":0.88672,"mrr_at_3":0.90902,"mrr_at_5":0.91219,"mrr_at_10":0.91465,"mrr_at_100":0.91562,"mrr_at_1000":0.91567}},"main_score":"ndcg_at_10"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"TV2Nord Retrieval","task_description":"News Article and corresponding summaries extracted from the Danish newspaper TV2 Nord.","task_version":"1.1.1","time_of_run":"2024-01-28T16:16:36.156505","scores":{"sv":{"ndcg_at_1":0.86475,"ndcg_at_3":0.90507,"ndcg_at_5":0.91428,"ndcg_at_10":0.92011,"ndcg_at_100":0.9249,"ndcg_at_1000":0.9256,"map_at_1":0.86475,"map_at_3":0.89551,"map_at_5":0.90059,"map_at_10":0.90299,"map_at_100":0.90405,"map_at_1000":0.90408,"recall_at_1":0.86475,"recall_at_3":0.93262,"recall_at_5":0.95508,"recall_at_10":0.97314,"recall_at_100":0.99463,"recall_at_1000":1.0,"precision_at_1":0.86475,"precision_at_3":0.31087,"precision_at_5":0.19102,"precision_at_10":0.09731,"precision_at_100":0.00995,"precision_at_1000":0.001,"mrr_at_1":0.86475,"mrr_at_3":0.89551,"mrr_at_5":0.90059,"mrr_at_10":0.90299,"mrr_at_100":0.90405,"mrr_at_1000":0.90408}},"main_score":"ndcg_at_10"}
6 changes: 4 additions & 2 deletions src/seb/mteb_tasks/retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,7 @@
from .swefaq_retrieval import SweFaqRetrieval
from .swedn_retrieval import SwednRetrieval
from .swefaq_retrieval import SweFaqRetrieval
from .twitterhjerne import TwitterHjerneRetrieval

from .snl_retrieval import SNLRetrieval
from .t2nord_retrieval import TV2Nordretrieval
from .norquad import NorQuadRetrieval
from .twitterhjerne import TwitterHjerneRetrieval
83 changes: 83 additions & 0 deletions src/seb/mteb_tasks/retrieval/norquad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
The dataset is quite similar to the SNL dataset (both are wiki-style); however, NorQuad uses actual questions as queries, while SNL consists only of
(headline, article) pairs.
"""

from typing import Any

import datasets
from mteb.abstasks import AbsTaskRetrieval


class NorQuadRetrieval(AbsTaskRetrieval):
    """Retrieval task built from NorQuAD: questions are queries, contexts and answers are documents."""

    @property
    def description(self) -> dict[str, Any]:
        """Return the task metadata: hub dataset and revision, evaluated splits/languages and main score."""
        return {
            "name": "NorQuadRetrieval",
            "hf_hub_name": "ScandEval/norquad-mini",
            "description": "Human-created question for Norwegian wikipedia passages.",
            "reference": "https://aclanthology.org/2023.nodalida-1.17/",
            "type": "Retrieval",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["nb"],
            "main_score": "ndcg_at_10",
            "revision": "a47881440ce4b18ef61a99be66dc4badbf5aac6e",
        }

    def load_data(self, **kwargs: dict):  # noqa: ARG002
        """
        Load the dataset from the HuggingFace hub and convert it to the retrieval format.

        Returns immediately if the data has already been loaded. ``kwargs`` is accepted but not used.
        """
        if self.data_loaded:
            return

        self.dataset: datasets.DatasetDict = datasets.load_dataset(
            self.description["hf_hub_name"],
            revision=self.description.get("revision"),
        )  # type: ignore

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """
        Transform the loaded dataset into a retrieval dataset with the following attributes, each keyed by split:

        self.corpus[split] = Dict[doc_id, Dict[str, str]]  # id => document dict with "title" and "text"
        self.queries[split] = Dict[query_id, str]  # id => query
        self.relevant_docs[split] = Dict[query_id, Dict[doc_id, score]]

        Every question is a query; its context passage and its first answer text are the two relevant
        documents. Identical texts within a split share a single document id.
        """
        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            ds = ds.shuffle(seed=42)
            max_samples = min(1024, len(ds))
            ds = ds.select(range(max_samples))  # limit the dataset size to make sure the task does not take too long to run
            self.queries[split] = {}
            self.relevant_docs[split] = {}
            self.corpus[split] = {}
            # Ids restart at 0 for every split, so the text -> id lookup has to be per split as well.
            # Sharing it across splits would skip adding a text already seen in another split and
            # make relevant_docs point at an id that belongs to that other split's numbering.
            text2id: dict[str, int] = {}

            question = ds["question"]
            context = ds["context"]
            answer = [a["text"][0] for a in ds["answers"]]  # only the first answer text is used

            n = 0  # single running counter shared by query ids and document ids
            for q, cont, ans in zip(question, context, answer):
                self.queries[split][str(n)] = q
                q_n = n
                n += 1
                if cont not in text2id:
                    text2id[cont] = n
                    self.corpus[split][str(n)] = {"title": "", "text": cont}
                    n += 1
                if ans not in text2id:
                    text2id[ans] = n
                    self.corpus[split][str(n)] = {"title": "", "text": ans}
                    n += 1

                self.relevant_docs[split][str(q_n)] = {str(text2id[ans]): 1, str(text2id[cont]): 1}  # only two correct matches
85 changes: 85 additions & 0 deletions src/seb/mteb_tasks/retrieval/snl_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""
Experiments:
Using two sample models (text-embedding-3-small, all-MiniLM-L6-v2) we get the following results:
96.07, 64.67
if we then remove the ingress from the corpus we get:
92.71, 51.61
The reason we might want to remove the ingress is that it almost always starts with the headline.
As the scores are indeed lower without it, we ignore the ingress, since including it makes the task too easy.
"""

from typing import Any

import datasets
from mteb.abstasks import AbsTaskRetrieval


class SNLRetrieval(AbsTaskRetrieval):
    """Retrieval task built from SNL: headlines are queries and the corresponding articles are documents."""

    @property
    def description(self) -> dict[str, Any]:
        """Return the task metadata: hub dataset and revision, evaluated splits/languages and main score."""
        return {
            # Was "SNLClustering" (copy-paste from the clustering task); a retrieval task needs its own name.
            "name": "SNLRetrieval",
            "hf_hub_name": "navjordj/SNL_summarization",
            "description": "Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.",
            "reference": "https://huggingface.co/datasets/navjordj/SNL_summarization",
            "type": "Retrieval",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["nb"],
            "main_score": "ndcg_at_10",
            "revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1",
        }

    def load_data(self, **kwargs: dict):  # noqa: ARG002
        """
        Load the dataset from the HuggingFace hub and convert it to the retrieval format.

        Returns immediately if the data has already been loaded. ``kwargs`` is accepted but not used.
        """
        if self.data_loaded:
            return

        self.dataset: datasets.DatasetDict = datasets.load_dataset(
            self.description["hf_hub_name"],
            revision=self.description.get("revision"),
        )  # type: ignore

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """
        Transform the loaded dataset into a retrieval dataset with the following attributes, each keyed by split:

        self.corpus[split] = Dict[doc_id, Dict[str, str]]  # id => document dict with "title" and "text"
        self.queries[split] = Dict[query_id, str]  # id => query
        self.relevant_docs[split] = Dict[query_id, Dict[doc_id, score]]

        Every headline is a query and its article is the single relevant document. Identical articles
        within a split share a single document id. The ingress column is not used.
        """
        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            ds = ds.shuffle(seed=42)

            self.queries[split] = {}
            self.relevant_docs[split] = {}
            self.corpus[split] = {}
            # Ids restart at 0 for every split, so the text -> id lookup has to be per split as well.
            # Sharing it across splits would skip adding an article already seen in another split and
            # make relevant_docs point at an id that belongs to that other split's numbering.
            text2id: dict[str, int] = {}

            headline = ds["headline"]
            article = ds["article"]

            n = 0  # single running counter shared by query ids and document ids
            for headl, art in zip(headline, article):
                self.queries[split][str(n)] = headl
                q_n = n
                n += 1
                if art not in text2id:
                    text2id[art] = n
                    self.corpus[split][str(n)] = {"title": "", "text": art}
                    n += 1
                self.relevant_docs[split][str(q_n)] = {str(text2id[art]): 1}  # only one correct match
2 changes: 1 addition & 1 deletion src/seb/mteb_tasks/retrieval/swefaq_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def description(self) -> dict[str, Any]:
"type": "Retrieval",
"category": "s2s",
"eval_splits": ["test"],
"eval_langs": ["sv"],
"eval_langs": ["da"],
"main_score": "ndcg_at_10",
"revision": "7ebf0b4caa7b2ae39698a889de782c09e6f5ee56",
}
Expand Down
73 changes: 73 additions & 0 deletions src/seb/mteb_tasks/retrieval/t2nord_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from typing import Any

import datasets
from mteb.abstasks import AbsTaskRetrieval


class TV2Nordretrieval(AbsTaskRetrieval):
    """Retrieval task built from TV2 Nord news: summaries are queries and the articles are documents."""

    @property
    def description(self) -> dict[str, Any]:
        """Return the task metadata: hub dataset and revision, evaluated splits/languages and main score."""
        return {
            "name": "TV2Nordretrieval",
            "hf_hub_name": "alexandrainst/nordjylland-news-summarization",
            "description": "News Article and corresponding summaries extracted from the Danish newspaper TV2 Nord.",
            "reference": "https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization",
            "type": "Retrieval",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["da"],
            "main_score": "ndcg_at_10",
            "revision": "80cdb115ec2ef46d4e926b252f2b59af62d6c070",
        }

    def load_data(self, **kwargs: dict):  # noqa: ARG002
        """
        Load the dataset from the HuggingFace hub and convert it to the retrieval format.

        Returns immediately if the data has already been loaded. ``kwargs`` is accepted but not used.
        """
        if self.data_loaded:
            return

        self.dataset: datasets.DatasetDict = datasets.load_dataset(
            self.description["hf_hub_name"],
            revision=self.description.get("revision"),
        )  # type: ignore

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """
        Transform the loaded dataset into a retrieval dataset with the following attributes, each keyed by split:

        self.corpus[split] = Dict[doc_id, Dict[str, str]]  # id => document dict with "title" and "text"
        self.queries[split] = Dict[query_id, str]  # id => query
        self.relevant_docs[split] = Dict[query_id, Dict[doc_id, score]]

        Every summary is a query and its article is the single relevant document. Identical articles
        within a split share a single document id.
        """
        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            ds = ds.shuffle(seed=42)
            # Clamp to the split size: selecting range(2048) from a smaller split raises an error.
            max_samples = min(2048, len(ds))
            ds = ds.select(range(max_samples))  # limit the dataset size to make sure the task does not take too long to run
            self.queries[split] = {}
            self.relevant_docs[split] = {}
            self.corpus[split] = {}
            # Ids restart at 0 for every split, so the text -> id lookup has to be per split as well.
            # Sharing it across splits would skip adding an article already seen in another split and
            # make relevant_docs point at an id that belongs to that other split's numbering.
            text2id: dict[str, int] = {}

            summary = ds["summary"]
            article = ds["text"]

            n = 0  # single running counter shared by query ids and document ids
            for summ, art in zip(summary, article):
                self.queries[split][str(n)] = summ
                q_n = n
                n += 1
                if art not in text2id:
                    text2id[art] = n
                    self.corpus[split][str(n)] = {"title": "", "text": art}
                    n += 1

                self.relevant_docs[split][str(q_n)] = {str(text2id[art]): 1}  # only one correct match
9 changes: 9 additions & 0 deletions src/seb/registered_tasks/danish.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ def create_dan_fever() -> Task:
return task


@tasks.register("TV2Nord Retrieval")
def create_tv2nord_retrieval() -> Task:
    """Create the "TV2Nord Retrieval" task by wrapping the MTEB-style ``TV2Nordretrieval`` task."""
    from seb.mteb_tasks import TV2Nordretrieval

    tv2nord_task = MTEBTask(TV2Nordretrieval())
    # Override the wrapped task's metadata with the registered name and domains.
    tv2nord_task.name = "TV2Nord Retrieval"
    tv2nord_task.domain = ["news", "non-fiction"]
    return tv2nord_task

@tasks.register("Twitterhjerne")
def create_twitterhjerne() -> Task:
from seb.mteb_tasks import TwitterHjerneRetrieval
Expand Down
22 changes: 22 additions & 0 deletions src/seb/registered_tasks/norwegian.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,25 @@ def create_sts_clustering() -> Task:
task.version = "0.0.1"
task.domain = ["non-fiction", "wiki"]
return task


@tasks.register("SNL Retrieval")
def create_sts_retrieval() -> Task:
    """Create the "SNL Retrieval" task by wrapping the MTEB-style ``SNLRetrieval`` task."""
    from seb.mteb_tasks import SNLRetrieval

    snl_task = MTEBTask(SNLRetrieval())
    # Override the wrapped task's metadata with the registered name, version and domains.
    snl_task.name = "SNL Retrieval"
    snl_task.version = "0.0.1"
    snl_task.domain = ["non-fiction", "wiki"]
    return snl_task


@tasks.register("NorQuad")
def create_norquad() -> Task:
    """Create the "NorQuad" task by wrapping the MTEB-style ``NorQuadRetrieval`` task."""
    from seb.mteb_tasks import NorQuadRetrieval

    norquad_task = MTEBTask(NorQuadRetrieval())
    # Override the wrapped task's metadata with the registered name, version and domains.
    norquad_task.name = "NorQuad"
    norquad_task.version = "0.0.1"
    norquad_task.domain = ["non-fiction", "wiki"]
    return norquad_task

0 comments on commit 9412d71

Please sign in to comment.