Merge pull request #110 from KennethEnevoldsen/sts-retrieval
Add SNL retrieval
Showing 13 changed files with 283 additions and 3 deletions.
1 change: 1 addition & 0 deletions
1
src/seb/cache/sentence-transformers__all-MiniLM-L6-v2/NorQuad.json
@@ -0,0 +1 @@
{"task_name":"NorQuad","task_description":"Human-created question for Norwegian wikipedia passages.","task_version":"0.0.1","time_of_run":"2024-01-28T16:44:43.096933","scores":{"nb":{"ndcg_at_1":0.16895,"ndcg_at_3":0.13891,"ndcg_at_5":0.15254,"ndcg_at_10":0.16457,"ndcg_at_100":0.20823,"ndcg_at_1000":0.26722,"map_at_1":0.08447,"map_at_3":0.1071,"map_at_5":0.11418,"map_at_10":0.11869,"map_at_100":0.1259,"map_at_1000":0.12785,"recall_at_1":0.08447,"recall_at_3":0.13428,"recall_at_5":0.16113,"recall_at_10":0.19141,"recall_at_100":0.37354,"recall_at_1000":0.78027,"precision_at_1":0.16895,"precision_at_3":0.08952,"precision_at_5":0.06445,"precision_at_10":0.03828,"precision_at_100":0.00747,"precision_at_1000":0.00156,"mrr_at_1":0.16895,"mrr_at_3":0.20996,"mrr_at_5":0.22061,"mrr_at_10":0.22783,"mrr_at_100":0.2372,"mrr_at_1000":0.23868}},"main_score":"ndcg_at_10"}
1 change: 1 addition & 0 deletions
1
src/seb/cache/sentence-transformers__all-MiniLM-L6-v2/SNL_Retrieval.json
@@ -0,0 +1 @@
{"task_name":"SNL Retrieval","task_description":"Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.","task_version":"0.0.1","time_of_run":"2024-01-28T14:02:53.208017","scores":{"sv":{"ndcg_at_1":0.41211,"ndcg_at_3":0.47736,"ndcg_at_5":0.49615,"ndcg_at_10":0.51614,"ndcg_at_100":0.55546,"ndcg_at_1000":0.57716,"map_at_1":0.41211,"map_at_3":0.46143,"map_at_5":0.47178,"map_at_10":0.48009,"map_at_100":0.48722,"map_at_1000":0.48795,"recall_at_1":0.41211,"recall_at_3":0.52344,"recall_at_5":0.56934,"recall_at_10":0.63086,"recall_at_100":0.82422,"recall_at_1000":0.99902,"precision_at_1":0.41211,"precision_at_3":0.17448,"precision_at_5":0.11387,"precision_at_10":0.06309,"precision_at_100":0.00824,"precision_at_1000":0.001,"mrr_at_1":0.41211,"mrr_at_3":0.46143,"mrr_at_5":0.47178,"mrr_at_10":0.48009,"mrr_at_100":0.48722,"mrr_at_1000":0.48795}},"main_score":"ndcg_at_10"}
1 change: 1 addition & 0 deletions
1
src/seb/cache/sentence-transformers__all-MiniLM-L6-v2/TV2Nord_Retrieval.json
@@ -0,0 +1 @@
{"task_name":"TV2Nord Retrieval","task_description":"News Article and corresponding summaries extracted from the Danish newspaper TV2 Nord.","task_version":"1.1.1","time_of_run":"2024-01-28T16:14:28.36046","scores":{"sv":{"ndcg_at_1":0.2373,"ndcg_at_3":0.29658,"ndcg_at_5":0.31986,"ndcg_at_10":0.3411,"ndcg_at_100":0.39784,"ndcg_at_1000":0.42564,"map_at_1":0.2373,"map_at_3":0.28182,"map_at_5":0.29483,"map_at_10":0.30346,"map_at_100":0.31382,"map_at_1000":0.31477,"recall_at_1":0.2373,"recall_at_3":0.33936,"recall_at_5":0.39551,"recall_at_10":0.46191,"recall_at_100":0.74219,"recall_at_1000":0.9668,"precision_at_1":0.2373,"precision_at_3":0.11312,"precision_at_5":0.0791,"precision_at_10":0.04619,"precision_at_100":0.00742,"precision_at_1000":0.00097,"mrr_at_1":0.2373,"mrr_at_3":0.28182,"mrr_at_5":0.29483,"mrr_at_10":0.30346,"mrr_at_100":0.31382,"mrr_at_1000":0.31477}},"main_score":"ndcg_at_10"}
@@ -0,0 +1 @@
{"task_name":"NorQuad","task_description":"Human-created question for Norwegian wikipedia passages.","task_version":"0.0.1","time_of_run":"2024-01-28T16:44:25.563028","scores":{"nb":{"ndcg_at_1":0.70312,"ndcg_at_3":0.5303,"ndcg_at_5":0.55545,"ndcg_at_10":0.57574,"ndcg_at_100":0.62014,"ndcg_at_1000":0.64568,"map_at_1":0.35156,"map_at_3":0.44523,"map_at_5":0.46318,"map_at_10":0.47436,"map_at_100":0.48588,"map_at_1000":0.48715,"recall_at_1":0.35156,"recall_at_3":0.48877,"recall_at_5":0.53857,"recall_at_10":0.58984,"recall_at_100":0.76904,"recall_at_1000":0.94043,"precision_at_1":0.70312,"precision_at_3":0.32585,"precision_at_5":0.21543,"precision_at_10":0.11797,"precision_at_100":0.01538,"precision_at_1000":0.00188,"mrr_at_1":0.70312,"mrr_at_3":0.76123,"mrr_at_5":0.77085,"mrr_at_10":0.77574,"mrr_at_100":0.7791,"mrr_at_1000":0.77919}},"main_score":"ndcg_at_10"}
@@ -0,0 +1 @@
{"task_name":"SNL Retrieval","task_description":"Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.","task_version":"0.0.1","time_of_run":"2024-01-28T14:02:29.093561","scores":{"sv":{"ndcg_at_1":0.88672,"ndcg_at_3":0.91563,"ndcg_at_5":0.92131,"ndcg_at_10":0.9271,"ndcg_at_100":0.93219,"ndcg_at_1000":0.93347,"map_at_1":0.88672,"map_at_3":0.90902,"map_at_5":0.91219,"map_at_10":0.91465,"map_at_100":0.91562,"map_at_1000":0.91567,"recall_at_1":0.88672,"recall_at_3":0.93457,"recall_at_5":0.94824,"recall_at_10":0.96582,"recall_at_100":0.99023,"recall_at_1000":1.0,"precision_at_1":0.88672,"precision_at_3":0.31152,"precision_at_5":0.18965,"precision_at_10":0.09658,"precision_at_100":0.0099,"precision_at_1000":0.001,"mrr_at_1":0.88672,"mrr_at_3":0.90902,"mrr_at_5":0.91219,"mrr_at_10":0.91465,"mrr_at_100":0.91562,"mrr_at_1000":0.91567}},"main_score":"ndcg_at_10"}
@@ -0,0 +1 @@
{"task_name":"TV2Nord Retrieval","task_description":"News Article and corresponding summaries extracted from the Danish newspaper TV2 Nord.","task_version":"1.1.1","time_of_run":"2024-01-28T16:16:36.156505","scores":{"sv":{"ndcg_at_1":0.86475,"ndcg_at_3":0.90507,"ndcg_at_5":0.91428,"ndcg_at_10":0.92011,"ndcg_at_100":0.9249,"ndcg_at_1000":0.9256,"map_at_1":0.86475,"map_at_3":0.89551,"map_at_5":0.90059,"map_at_10":0.90299,"map_at_100":0.90405,"map_at_1000":0.90408,"recall_at_1":0.86475,"recall_at_3":0.93262,"recall_at_5":0.95508,"recall_at_10":0.97314,"recall_at_100":0.99463,"recall_at_1000":1.0,"precision_at_1":0.86475,"precision_at_3":0.31087,"precision_at_5":0.19102,"precision_at_10":0.09731,"precision_at_100":0.00995,"precision_at_1000":0.001,"mrr_at_1":0.86475,"mrr_at_3":0.89551,"mrr_at_5":0.90059,"mrr_at_10":0.90299,"mrr_at_100":0.90405,"mrr_at_1000":0.90408}},"main_score":"ndcg_at_10"}
@@ -0,0 +1,83 @@
""" | ||
The dataset is quite similar to the SNL dataset (both are wikistyle), however NorQuad actually uses questions, while the other is just headline, | ||
article pairs. | ||
""" | ||
|
||
from typing import Any | ||
|
||
import datasets | ||
from mteb.abstasks import AbsTaskRetrieval | ||
|
||
|
||
class NorQuadRetrieval(AbsTaskRetrieval): | ||
@property | ||
def description(self) -> dict[str, Any]: | ||
return { | ||
"name": "NorQuadRetrieval", | ||
"hf_hub_name": "ScandEval/norquad-mini", | ||
"description": "Human-created question for Norwegian wikipedia passages.", | ||
"reference": "https://aclanthology.org/2023.nodalida-1.17/", | ||
"type": "Retrieval", | ||
"category": "p2p", | ||
"eval_splits": ["test"], | ||
"eval_langs": ["nb"], | ||
"main_score": "ndcg_at_10", | ||
"revision": "a47881440ce4b18ef61a99be66dc4badbf5aac6e", | ||
} | ||
|
||
def load_data(self, **kwargs: dict): # noqa: ARG002 | ||
""" | ||
Load dataset from HuggingFace hub | ||
""" | ||
if self.data_loaded: | ||
return | ||
|
||
self.dataset: datasets.DatasetDict = datasets.load_dataset( | ||
self.description["hf_hub_name"], | ||
revision=self.description.get("revision"), | ||
) # type: ignore | ||
|
||
self.dataset_transform() | ||
self.data_loaded = True | ||
|
||
def dataset_transform(self) -> None: | ||
""" | ||
and transform to a retrieval datset, which have the following attributes | ||
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text | ||
self.queries = Dict[query_id, str] #id => query | ||
self.relevant_docs = Dict[query_id, Dict[[doc_id, score]] | ||
""" | ||
self.corpus = {} | ||
self.relevant_docs = {} | ||
self.queries = {} | ||
text2id = {} | ||
|
||
for split in self.dataset: | ||
ds: datasets.Dataset = self.dataset[split] # type: ignore | ||
ds = ds.shuffle(seed=42) | ||
max_samples = min(1024, len(ds)) | ||
ds = ds.select(range(max_samples)) # limit the dataset size to make sure the task does not take too long to run | ||
self.queries[split] = {} | ||
self.relevant_docs[split] = {} | ||
self.corpus[split] = {} | ||
|
||
question = ds["question"] | ||
context = ds["context"] | ||
answer = [a["text"][0] for a in ds["answers"]] | ||
|
||
n = 0 | ||
for q, cont, ans in zip(question, context, answer): | ||
self.queries[split][str(n)] = q | ||
q_n = n | ||
n += 1 | ||
if cont not in text2id: | ||
text2id[cont] = n | ||
self.corpus[split][str(n)] = {"title": "", "text": cont} | ||
n += 1 | ||
if ans not in text2id: | ||
text2id[ans] = n | ||
self.corpus[split][str(n)] = {"title": "", "text": ans} | ||
n += 1 | ||
|
||
self.relevant_docs[split][str(q_n)] = {str(text2id[ans]): 1, str(text2id[cont]): 1} # only two correct matches |
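
To make the target structure concrete, the following is a purely illustrative example (the ids and Norwegian text are invented for this sketch) of what dataset_transform produces for a single NorQuad row:

# Illustrative only: one question whose context and answer both land in the corpus.
queries = {"test": {"0": "Hvem skrev romanen Sult?"}}
corpus = {
    "test": {
        "1": {"title": "", "text": "Sult er en roman av Knut Hamsun fra 1890 ..."},  # context
        "2": {"title": "", "text": "Knut Hamsun"},  # answer
    }
}
# Each query maps to exactly two relevant documents: its answer and its context.
relevant_docs = {"test": {"0": {"2": 1, "1": 1}}}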
@@ -0,0 +1,85 @@
""" | ||
Experiments: | ||
Using two sample models (text-embedding-3-small, all-MiniLM-L6-v2) we get the following results: | ||
96.07, 64.67 | ||
if we then remove the ingress from the corpus we get: | ||
92.71, 51.61 | ||
The reason why we might want to remove the ingress is that it almost always start with headline. | ||
As the scores are indeed slightly lower we will ignore the ingress as the task becomes too easy. | ||
""" | ||
|
||
from typing import Any | ||
|
||
import datasets | ||
from mteb.abstasks import AbsTaskRetrieval | ||
|
||
|
||
class SNLRetrieval(AbsTaskRetrieval): | ||
@property | ||
def description(self) -> dict[str, Any]: | ||
return { | ||
"name": "SNLClustering", | ||
"hf_hub_name": "navjordj/SNL_summarization", | ||
"description": "Webscrabed articles and ingresses from the Norwegian lexicon 'Det Store Norske Leksikon'.", | ||
"reference": "https://huggingface.co/datasets/navjordj/SNL_summarization", | ||
"type": "Retrieval", | ||
"category": "p2p", | ||
"eval_splits": ["test"], | ||
"eval_langs": ["nb"], | ||
"main_score": "ndcg_at_10", | ||
"revision": "3d3d27aa7af8941408cefc3991ada5d12a4273d1", | ||
} | ||
|
||
def load_data(self, **kwargs: dict): # noqa: ARG002 | ||
""" | ||
Load dataset from HuggingFace hub | ||
""" | ||
if self.data_loaded: | ||
return | ||
|
||
self.dataset: datasets.DatasetDict = datasets.load_dataset( | ||
self.description["hf_hub_name"], | ||
revision=self.description.get("revision"), | ||
) # type: ignore | ||
|
||
self.dataset_transform() | ||
self.data_loaded = True | ||
|
||
def dataset_transform(self) -> None: | ||
""" | ||
and transform to a retrieval datset, which have the following attributes | ||
self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text | ||
self.queries = Dict[query_id, str] #id => query | ||
self.relevant_docs = Dict[query_id, Dict[[doc_id, score]] | ||
""" | ||
self.corpus = {} | ||
self.relevant_docs = {} | ||
self.queries = {} | ||
text2id = {} | ||
|
||
for split in self.dataset: | ||
ds: datasets.Dataset = self.dataset[split] # type: ignore | ||
ds = ds.shuffle(seed=42) | ||
|
||
self.queries[split] = {} | ||
self.relevant_docs[split] = {} | ||
self.corpus[split] = {} | ||
|
||
headline = ds["headline"] | ||
article = ds["article"] | ||
|
||
n = 0 | ||
for headl, art in zip(headline, article): | ||
self.queries[split][str(n)] = headl | ||
q_n = n | ||
n += 1 | ||
if art not in text2id: | ||
text2id[art] = n | ||
self.corpus[split][str(n)] = {"title": "", "text": art} | ||
n += 1 | ||
self.relevant_docs[split][str(q_n)] = {str(text2id[art]): 1} # only one correct matches |
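
All three tasks report ndcg_at_10 as the main score. As a rough sketch of what that metric computes over the qrels format built above (mteb's actual evaluation goes through BEIR-style tooling, not this function), nDCG@k for a single query looks like:

import math

def ndcg_at_k(ranked_doc_ids: list[str], qrels: dict[str, int], k: int = 10) -> float:
    """nDCG@k for one query, given a ranked doc-id list and a {doc_id: gain} qrels dict."""
    # Discounted cumulative gain of the model's ranking, truncated at k.
    dcg = sum(
        qrels.get(doc_id, 0) / math.log2(rank + 2)
        for rank, doc_id in enumerate(ranked_doc_ids[:k])
    )
    # Ideal DCG: the same gains, sorted into the best possible order.
    ideal_gains = sorted(qrels.values(), reverse=True)[:k]
    idcg = sum(gain / math.log2(rank + 2) for rank, gain in enumerate(ideal_gains))
    return dcg / idcg if idcg > 0 else 0.0

For SNL, each query has a single relevant article, so nDCG@10 reduces to 1/log2(rank+1) when the article lands at that rank within the top 10, and 0 otherwise.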
@@ -0,0 +1,73 @@
from typing import Any

import datasets
from mteb.abstasks import AbsTaskRetrieval


class TV2Nordretrieval(AbsTaskRetrieval):
    @property
    def description(self) -> dict[str, Any]:
        return {
            "name": "TV2Nordretrieval",
            "hf_hub_name": "alexandrainst/nordjylland-news-summarization",
            "description": "News articles and corresponding summaries extracted from the Danish newspaper TV2 Nord.",
            "reference": "https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization",
            "type": "Retrieval",
            "category": "p2p",
            "eval_splits": ["test"],
            "eval_langs": ["da"],
            "main_score": "ndcg_at_10",
            "revision": "80cdb115ec2ef46d4e926b252f2b59af62d6c070",
        }

    def load_data(self, **kwargs: dict):  # noqa: ARG002
        """Load the dataset from the Hugging Face hub."""
        if self.data_loaded:
            return

        self.dataset: datasets.DatasetDict = datasets.load_dataset(
            self.description["hf_hub_name"],
            revision=self.description.get("revision"),
        )  # type: ignore

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """
        Transform the loaded dataset into a retrieval dataset with the following attributes:

        self.corpus = Dict[split, Dict[doc_id, Dict[str, str]]]  # doc_id => dict with document data, e.g. title and text
        self.queries = Dict[split, Dict[query_id, str]]  # query_id => query
        self.relevant_docs = Dict[split, Dict[query_id, Dict[doc_id, score]]]
        """
        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}
        text2id = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            ds = ds.shuffle(seed=42)
            ds = ds.select(range(min(2048, len(ds))))  # limit the dataset size to make sure the task does not take too long to run
            self.queries[split] = {}
            self.relevant_docs[split] = {}
            self.corpus[split] = {}

            summary = ds["summary"]
            article = ds["text"]

            n = 0
            for summ, art in zip(summary, article):
                self.queries[split][str(n)] = summ
                q_n = n
                n += 1
                if art not in text2id:
                    text2id[art] = n
                    self.corpus[split][str(n)] = {"title": "", "text": art}
                    n += 1

                self.relevant_docs[split][str(q_n)] = {str(text2id[art]): 1}  # only one relevant document: the article
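
Assuming the standard MTEB runner API from mteb 1.x (which these AbsTaskRetrieval subclasses plug into), a quick end-to-end run of one of the new tasks might look like the sketch below; the output_folder name is arbitrary:

from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Sketch: evaluate the model used for the cache files above on the new task.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
evaluation = MTEB(tasks=[TV2Nordretrieval()])
results = evaluation.run(model, output_folder="results")  # writes per-task JSON results
print(results)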