Merge branch 'embeddings-benchmark:main' into main

Lyon-NLP · May 15, 2024 · fbf2fb7 · fbf2fb7
2 parents 3c9748d + 2bc404e
commit fbf2fb7
Show file tree

Hide file tree

Showing 59 changed files with 3,181 additions and 79 deletions.
diff --git a/docs/create_tasks_table.py b/docs/create_tasks_table.py
@@ -2,8 +2,12 @@
 
 import re
 from pathlib import Path
+from typing import get_args
+
+import polars as pl
 
 import mteb
+from mteb.abstasks.TaskMetadata import PROGRAMMING_LANGS, TASK_TYPE
 
 
 def author_from_bibtex(bibtex: str | None) -> str:
@@ -56,29 +60,76 @@ def create_tasks_table(tasks: list[mteb.AbsTask]) -> str:
     return table
 
 
-def insert_table(file_path, table):
-    """Insert table in the in <!-- TABLE START --> and <!-- TABLE END -->"""
-    with open(file_path, "r") as file:
-        md = file.read()
+def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str:
+    table_dict = {}
+    ## Group by language. If it is a multilingual dataset, 1 is added to all languages present.
+    for task in tasks:
+        for lang in task.metadata.languages:
+            if lang in PROGRAMMING_LANGS:
+                lang = "code"
+            if table_dict.get(lang) is None:
+                table_dict[lang] = {k: 0 for k in sorted(get_args(TASK_TYPE))}
+            table_dict[lang][task.metadata.type] += 1
+
+    ## Wrangle for polars
+    pl_table_dict = []
+    for lang, d in table_dict.items():
+        d.update({"lang": lang})
+        pl_table_dict.append(d)
+
+    df = pl.DataFrame(pl_table_dict).sort(by="lang")
+    total = df.sum(axis=0)
+
+    task_names_md = " | ".join(sorted(get_args(TASK_TYPE)))
+    horizontal_line_md = "---|---" * len(sorted(get_args(TASK_TYPE)))
+    table = """
+| Language | {} |
+|{}|
+""".format(task_names_md, horizontal_line_md)
+
+    for row in df.iter_rows():
+        table += f"| {row[-1]} "
+        for num in row[:-1]:
+            table += f"| {num} "
+        table += "|\n"
+
+    for row in total.iter_rows():
+        table += "| Total "
+        for num in row[:-1]:
+            table += f"| {num} "
+        table += "|\n"
 
-    start = "<!-- TABLE START -->"
-    end = "<!-- TABLE END -->"
+    return table
 
-    md = md.replace(md[md.index(start) + len(start) : md.index(end)], table)
 
-    with open(file_path, "w") as file:
-        file.write(md)
+def insert_tables(
+    file_path: str, tables: list[str], tags: list[str] = ["TASKS TABLE"]
+) -> None:
+    """Insert tables within <!-- TABLE START --> and <!-- TABLE END --> or similar tags."""
+    md = Path(file_path).read_text()
+
+    for table, tag in zip(tables, tags):
+        start = f"<!-- {tag} START -->"
+        end = f"<!-- {tag} END -->"
+        md = md.replace(md[md.index(start) + len(start) : md.index(end)], table)
+
+    Path(file_path).write_text(md)
 
 
 def main():
     tasks = mteb.get_tasks()
     tasks = sorted(tasks, key=lambda x: x.metadata.name)
 
-    table = create_tasks_table(tasks)
+    tasks_table = create_tasks_table(tasks)
+    task_lang_table = create_task_lang_table(tasks)
 
     file_path = Path(__file__).parent / "tasks.md"
 
-    insert_table(file_path, table)
+    insert_tables(
+        file_path,
+        tables=[tasks_table, task_lang_table],
+        tags=["TASKS TABLE", "TASK LANG TABLE"],
+    )
 
 
 if __name__ == "__main__":

diff --git a/docs/mmteb/points.md b/docs/mmteb/points.md
@@ -80,3 +80,4 @@ Please also add your first name and last name are as you want them to appear in
 | jupyterjazz       | Saba         | Sturua     | saba.sturua@jina.ai              |     ~Saba_Sturua1      | Jina AI   
 | shreeya-dhakal            | Shreeya     | Dhakal     | ssdhakal57@gmail.com      |                      | Individual Contributor                                                   |
 | dipam7 | Dipam | Vasani | dipam44@gmail.com | ~Dipam_Vasani1 | Individual Contributor                                                  |
+| jankounchained    | Jan        | Kostkan    | jan.kostkan@cas.au.dk | ~Jan_Kostkan1        | Aarhus University, Denmark                            |
diff --git a/docs/mmteb/points/650.jsonl b/docs/mmteb/points/650.jsonl
@@ -0,0 +1,2 @@
+{"GitHub": "jankounchained", "New dataset": 10}
+{"GitHub": "x-tabdeveloping", "Review PR": 2}
diff --git a/docs/mmteb/points/676.jsonl b/docs/mmteb/points/676.jsonl
@@ -0,0 +1,3 @@
+{"GitHub": "wissam-sib", "New dataset": 6}
+{"GitHub": "kranthigv", "Review PR": 2}
+{"GitHub": "KennethEnevoldsen", "Review PR": 2}
diff --git a/docs/mmteb/points/680.jsonl b/docs/mmteb/points/680.jsonl
@@ -0,0 +1,2 @@
+{"GitHub": "awinml", "New dataset": 20}
+{"GitHub": "KennethEnevoldsen", "Review PR": 2}
diff --git a/docs/mmteb/points/701.jsonl b/docs/mmteb/points/701.jsonl
@@ -0,0 +1,3 @@
+{"GitHub": "isaac-chung", "Paper writing": 4}
+{"GitHub": "KennethEnevoldsen", "Review PR": 2}
+{"GitHub": "mrshu", "Review PR": 2}
diff --git a/docs/mmteb/points/721.jsonl b/docs/mmteb/points/721.jsonl
@@ -0,0 +1,2 @@
+{"GitHub": "wissam-sib", "New dataset": 8}
+{"GitHub": "imenelydiaker", "Review PR": 2}
diff --git a/docs/mmteb/points/723.jsonl b/docs/mmteb/points/723.jsonl
@@ -0,0 +1,2 @@
+{"GitHub": "wissam-sib", "New dataset": 6}
+{"GitHub": "imenelydiaker", "Review PR": 2}
diff --git a/docs/mmteb/points/725.jsonl b/docs/mmteb/points/725.jsonl
@@ -0,0 +1,2 @@
+{"GitHub": "wissam-sib", "New dataset": 6}
+{"GitHub": "imenelydiaker", "Review PR": 2}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"GitHub": "jankounchained", "New dataset": 10}
		{"GitHub": "x-tabdeveloping", "Review PR": 2}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"GitHub": "awinml", "New dataset": 20}
		{"GitHub": "KennethEnevoldsen", "Review PR": 2}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"GitHub": "wissam-sib", "New dataset": 8}
		{"GitHub": "imenelydiaker", "Review PR": 2}