Skip to content

Commit

Permalink
Merge branch 'embeddings-benchmark:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
imenelydiaker authored May 15, 2024
2 parents 3c9748d + 2bc404e commit fbf2fb7
Show file tree
Hide file tree
Showing 59 changed files with 3,181 additions and 79 deletions.
73 changes: 62 additions & 11 deletions docs/create_tasks_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@

import re
from pathlib import Path
from typing import get_args

import polars as pl

import mteb
from mteb.abstasks.TaskMetadata import PROGRAMMING_LANGS, TASK_TYPE


def author_from_bibtex(bibtex: str | None) -> str:
Expand Down Expand Up @@ -56,29 +60,76 @@ def create_tasks_table(tasks: list[mteb.AbsTask]) -> str:
return table


def insert_table(file_path, table):
"""Insert table in the in <!-- TABLE START --> and <!-- TABLE END -->"""
with open(file_path, "r") as file:
md = file.read()
def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str:
    """Build a markdown table counting tasks per language and task type.

    Each row is one language and each column one task type taken from
    ``TASK_TYPE`` (sorted alphabetically). A task that lists several
    languages adds 1 to every one of them, and every language found in
    ``PROGRAMMING_LANGS`` is reported under the single label ``code``.
    A final ``Total`` row sums each task-type column.

    Args:
        tasks: Tasks whose ``metadata.languages`` and ``metadata.type`` are
            tallied.

    Returns:
        The markdown table as a string starting with a newline. When no
        language is found in ``tasks`` only the header rows are returned.
    """
    # Computed once: used for the zero-initialised counters and the header.
    task_types = sorted(get_args(TASK_TYPE))

    ## Group by language. If it is a multilingual dataset, 1 is added to all languages present.
    # NOTE(review): a task listing several programming languages is counted
    # once per language under "code" - confirm this is intended.
    counts: dict[str, dict[str, int]] = {}
    for task in tasks:
        for lang in task.metadata.languages:
            if lang in PROGRAMMING_LANGS:
                lang = "code"
            if lang not in counts:
                counts[lang] = dict.fromkeys(task_types, 0)
            counts[lang][task.metadata.type] += 1

    # "---|---" repeated n times yields n + 1 cells, matching the
    # "Language" column plus one column per task type.
    header = "\n| Language | {} |\n|{}|\n".format(
        " | ".join(task_types), "---|---" * len(task_types)
    )
    if not counts:
        # Nothing to tabulate; sorting an empty frame by "lang" would fail.
        return header

    ## Wrangle for polars ("lang" is appended last, so it is the final column)
    records = [{**per_type, "lang": lang} for lang, per_type in counts.items()]
    df = pl.DataFrame(records).sort(by="lang")
    # Sum only the numeric columns, without the `axis` argument, which newer
    # polars releases no longer accept on DataFrame.sum.
    totals = df.drop("lang").sum()

    lines = [header]
    for *nums, lang in df.iter_rows():
        lines.append(f"| {lang} " + "".join(f"| {num} " for num in nums) + "|\n")
    for nums in totals.iter_rows():
        lines.append("| Total " + "".join(f"| {num} " for num in nums) + "|\n")

    return "".join(lines)

md = md.replace(md[md.index(start) + len(start) : md.index(end)], table)

with open(file_path, "w") as file:
file.write(md)
def insert_tables(
file_path: str, tables: list[str], tags: list[str] = ["TASKS TABLE"]
) -> None:
"""Insert tables within <!-- TABLE START --> and <!-- TABLE END --> or similar tags."""
md = Path(file_path).read_text()

for table, tag in zip(tables, tags):
start = f"<!-- {tag} START -->"
end = f"<!-- {tag} END -->"
md = md.replace(md[md.index(start) + len(start) : md.index(end)], table)

Path(file_path).write_text(md)


def main() -> None:
    """Regenerate the task tables in the ``tasks.md`` file next to this script.

    Fetches all tasks from ``mteb``, sorts them by name, renders the task
    overview table and the per-language table, and writes both between their
    respective markers in ``tasks.md``.
    """
    tasks = sorted(mteb.get_tasks(), key=lambda x: x.metadata.name)

    tasks_table = create_tasks_table(tasks)
    task_lang_table = create_task_lang_table(tasks)

    file_path = Path(__file__).parent / "tasks.md"

    # One call places both tables; each table is paired with the tag at the
    # same position.
    insert_tables(
        file_path,
        tables=[tasks_table, task_lang_table],
        tags=["TASKS TABLE", "TASK LANG TABLE"],
    )


if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions docs/mmteb/points.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,4 @@ Please also add your first name and last name as you want them to appear in
| jupyterjazz | Saba | Sturua | saba.sturua@jina.ai | ~Saba_Sturua1 | Jina AI |
| shreeya-dhakal | Shreeya | Dhakal | ssdhakal57@gmail.com | | Individual Contributor |
| dipam7 | Dipam | Vasani | dipam44@gmail.com | ~Dipam_Vasani1 | Individual Contributor |
| jankounchained | Jan | Kostkan | jan.kostkan@cas.au.dk | ~Jan_Kostkan1 | Aarhus University, Denmark |
2 changes: 2 additions & 0 deletions docs/mmteb/points/650.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "jankounchained", "New dataset": 10}
{"GitHub": "x-tabdeveloping", "Review PR": 2}
3 changes: 3 additions & 0 deletions docs/mmteb/points/676.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"GitHub": "wissam-sib", "New dataset": 6}
{"GitHub": "kranthigv", "Review PR": 2}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/680.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "awinml", "New dataset": 20}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
3 changes: 3 additions & 0 deletions docs/mmteb/points/701.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"GitHub": "isaac-chung", "Paper writing": 4}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
{"GitHub": "mrshu", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/721.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "wissam-sib", "New dataset": 8}
{"GitHub": "imenelydiaker", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/723.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "wissam-sib", "New dataset": 6}
{"GitHub": "imenelydiaker", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/725.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "wissam-sib", "New dataset": 6}
{"GitHub": "imenelydiaker", "Review PR": 2}
Loading

0 comments on commit fbf2fb7

Please sign in to comment.