Skip to content

Commit

Permalink
Merge pull request #50 from KennethEnevoldsen/run-using-cache
Browse files Browse the repository at this point in the history
Add public cache to benchmark
  • Loading branch information
KennethEnevoldsen authored Jan 14, 2024
2 parents 6d9d7c1 + b7c3012 commit 67e571c
Show file tree
Hide file tree
Showing 415 changed files with 908 additions and 29 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/check_benchmark_is_up_to_date.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# CI workflow: on every push to main and every pull request against main,
# install the project via `make install` and then run
# `make check-benchmark-is-up-to-date`.
name: Benchmark is up to date
on:
push:
branches: [main]
pull_request:
branches: [main]

# A single job running on the GitHub-hosted Ubuntu runner.
jobs:
bench-is-up-to-date:
runs-on: ubuntu-latest

# Steps: check out the repository, set up Python 3.10 with the setup-python
# "pip" cache option enabled, install dependencies, then run the check.
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
cache: "pip"

- name: Install dependencies
shell: bash
run: |
make install
- name: Check benchmark is up to date
shell: bash
run: |
make check-benchmark-is-up-to-date
18 changes: 12 additions & 6 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,14 @@ pr:
make test
@echo "Ready to make a PR"

# Runs src/scripts/create_desc_stats.py; per the echo text this refreshes the
# table shown in the docs (presumably the descriptive-statistics table --
# confirm against the script).
update-table-in-docs:
@echo "--- 🔄 Updating table in docs ---"
python src/scripts/create_desc_stats.py

# Builds the documentation with `mkdocs build`. The echo lines only print
# guidance: the output location ('site' folder) and a reminder that the
# desc. stats table is refreshed separately via 'make update-table-in-docs'.
build-docs:
@echo "--- 📚 Building docs ---"
@echo "Builds the docs and puts them in the 'site' folder"
@echo "You might need to also update the table with the desc. stats you can do this by running 'make update-table-in-docs'"
mkdocs build

view-docs:
Expand All @@ -37,15 +42,16 @@ update-from-template:
cruft update --skip-apply-ask

# Run docs/run_benchmark.py with the Datawrapper API token stored in
# datawrapper_api_key.txt.
#
# The token is read inline on the same recipe line for two reasons:
#   * make runs every recipe line in its own shell, so a shell variable
#     assigned on one line no longer exists on the next line;
#   * a single '$' is expanded by make itself ('$(cat ...)' would be looked up
#     as a make variable and '$datawrapper_api_key' would become
#     'atawrapper_api_key'), so '$$' is needed to hand a literal '$' to the
#     shell for command substitution.
update-benchmark:
	python docs/run_benchmark.py --data-wrapper-api-token "$$(cat datawrapper_api_key.txt)"

# Run docs/run_benchmark.py with HF_TOKEN (read from hf_api_key.txt) and
# SEB_CACHE_DIR (./seb_cache) exported, passing the Datawrapper API token
# stored in datawrapper_api_key.txt.
#
# Everything is chained into ONE shell invocation with '&& \' because make
# runs each recipe line in a separate shell; exports made on their own line
# would never reach the python process. '$$' passes a literal '$' to the shell
# (a single '$' is expanded by make), and HF_TOKEN now receives the contents
# of the key file rather than the literal text 'hf_api_key'.
update-benchmark-on-ucloud:
	export HF_TOKEN="$$(cat hf_api_key.txt)" && \
	export SEB_CACHE_DIR=./seb_cache && \
	python docs/run_benchmark.py --data-wrapper-api-token "$$(cat datawrapper_api_key.txt)"


# Runs src/scripts/check_benchmark_is_up_to_date.py after printing a banner.
check-benchmark-is-up-to-date:
@echo "--- 🔄 Checking benchmark is up to date ---"

python src/scripts/check_benchmark_is_up_to_date.py
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ requires-python = ">=3.9"
dependencies = [
"tabulate>=0.9.0",
"mteb[beir]==1.1.1",
"typer>=0.7.0",
"pydantic>=2.1.0",
"catalogue>=2.0.8",

]

[project.license]
Expand Down
9 changes: 9 additions & 0 deletions src/scripts/check_benchmark_is_up_to_date.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from seb import run_benchmark


def main() -> None:
    """Run the benchmark from cached results only.

    Calls ``run_benchmark`` with caching enabled, model execution disabled
    and error raising enabled. Presumably this makes the script fail when a
    cached result is missing -- confirm against ``run_benchmark``.
    """
    run_benchmark(
        use_cache=True,
        run_models=False,
        raise_errors=True,
    )


if __name__ == "__main__":
    main()
48 changes: 43 additions & 5 deletions src/seb/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
logger = logging.getLogger(__name__)


def get_cache_path(task: Task, model: SebModel) -> Path:
def get_cache_path(
task: Task, model: SebModel, cache_dir: Optional[Path] = None
) -> Path:
"""
Get the cache path for a task and model.
"""
cache_path = get_cache_dir()
cache_path = cache_dir if cache_dir is not None else get_cache_dir()
mdl_path_name = model.meta.get_path_name()
task_path_name = name_to_path(task.name) + ".json"
task_cache_path = cache_path / mdl_path_name / task_path_name
Expand All @@ -29,15 +31,28 @@ def run_task(
task: Task,
model: SebModel,
use_cache: bool,
run_model: bool,
raise_errors: bool,
cache_dir: Optional[Path] = None,
) -> Union[TaskResult, TaskError]:
"""
Tests a model on a task
"""
if run_model is False and use_cache is False:
raise ValueError("run_model and use_cache cannot both be False")
if not raise_errors and run_model is False:
raise ValueError("raise_errors cannot be False when run_model is False")

if not raise_errors:
try:
return run_task(task, model, use_cache, raise_errors=True)
return run_task(
task=task,
model=model,
use_cache=use_cache,
run_model=run_model,
raise_errors=True,
cache_dir=cache_dir,
)
except Exception as e:
logger.error(f"Error when running {task.name} on {model.meta.name}: {e}")
return TaskError(
Expand All @@ -46,13 +61,19 @@ def run_task(
time_of_run=datetime.now(),
)

cache_path = get_cache_path(task, model)
cache_path = get_cache_path(task, model, cache_dir)
if cache_path.exists() and use_cache:
logger.info(f"Loading cached result for {model.meta.name} on {task.name}")
task_result = TaskResult.from_disk(cache_path)
return task_result

cache_path.parent.mkdir(parents=True, exist_ok=True)

if not run_model:
raise ValueError(
f"Cache for {model.meta.name} on {task.name} does not exist. "
"Set run_model=True to run the model.",
)
with WarningIgnoreContextManager():
task_result = task.evaluate(model)
task_result.to_disk(cache_path)
Expand Down Expand Up @@ -104,15 +125,19 @@ def evaluate_model(
self,
model: SebModel,
use_cache: bool = True,
run_model: bool = True,
raise_errors: bool = True,
cache_dir: Optional[Path] = None,
) -> BenchmarkResults:
"""
Evaluate a model on the benchmark.
Args:
model: The model to evaluate.
use_cache: Whether to use the cache.
run_model: Whether to run the model if the cache is not present.
raise_errors: Whether to raise errors.
cache_dir: The cache directory to use. If None, the default cache directory is used.
Returns:
The results of the benchmark.
Expand All @@ -121,7 +146,14 @@ def evaluate_model(
task_results = []
pbar = tqdm(tasks, position=1, desc=f"Running {model.meta.name}", leave=False)
for task in pbar:
task_result = run_task(task, model, use_cache, raise_errors)
task_result = run_task(
task,
model,
use_cache=use_cache,
run_model=run_model,
raise_errors=raise_errors,
cache_dir=cache_dir,
)
task_results.append(task_result)

return BenchmarkResults(meta=model.meta, task_results=task_results)
Expand All @@ -130,15 +162,19 @@ def evaluate_models(
self,
models: list[SebModel],
use_cache: bool = True,
run_model: bool = True,
raise_errors: bool = True,
cache_dir: Optional[Path] = None,
) -> list[BenchmarkResults]:
"""
Evaluate a list of models on the benchmark.
Args:
models: The models to evaluate.
use_cache: Whether to use the cache.
run_model: Whether to run the model if the cache is not present.
raise_errors: Whether to raise errors.
cache_dir: The cache directory to use. If None, the default cache directory is used.
Returns:
The results of the benchmark, once for each model.
Expand All @@ -151,7 +187,9 @@ def evaluate_models(
self.evaluate_model(
model,
use_cache=use_cache,
run_model=run_model,
raise_errors=raise_errors,
cache_dir=cache_dir,
),
)
return results
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Angry Tweets","task_description":"A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:15:07.906466","scores":{"da":{"accuracy":0.44460362941738296,"f1":0.4380942035064149,"accuracy_stderr":0.02809792891547516,"f1_stderr":0.02869393997039908,"main_score":0.44460362941738296}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Bornholm Parallel","task_description":"Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:15:39.189398","scores":{"da":{"precision":0.12769630591630593,"recall":0.188,"f1":0.1408286249981902,"accuracy":0.188,"main_score":0.1408286249981902},"da-bornholm":{"precision":0.12769630591630593,"recall":0.188,"f1":0.1408286249981902,"accuracy":0.188,"main_score":0.1408286249981902}},"main_score":"f1"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"DKHate","task_description":"Danish Tweets annotated for Hate Speech either being Offensive or not","task_version":"1.0.3.dev0","time_of_run":"2023-07-30T13:55:26.414673","scores":{"da":{"accuracy":0.5936170212765958,"f1":0.48062030395159827,"ap":0.8897617189814045,"accuracy_stderr":0.09420580255043687,"f1_stderr":0.05292797998084632,"ap_stderr":0.006783234279065605,"main_score":0.5936170212765958}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"DaLAJ","task_description":"A Swedish dataset for linguistic accebtablity. Available as a part of Superlim.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:32:34.602207","scores":{"sv":{"accuracy":0.5011261261261262,"f1":0.4981211967409228,"ap":0.5005828726112415,"accuracy_stderr":0.004419238033862288,"f1_stderr":0.0035063858678943075,"ap_stderr":0.0022274185349420317,"main_score":0.5011261261261262}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Da Political Comments","task_description":"A dataset of Danish political comments rated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:17:00.221073","scores":{"da":{"accuracy":0.28546059933407325,"f1":0.2577317269849485,"accuracy_stderr":0.023480148626138817,"f1_stderr":0.01681654012226201,"main_score":0.28546059933407325}},"main_score":"accuracy"}
1 change: 1 addition & 0 deletions src/seb/cache/KBLab__sentence-bert-swedish-cased/LCC.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"LCC","task_description":"The leipzig corpora collection, annotated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:15:26.284602","scores":{"da":{"accuracy":0.4720000000000001,"f1":0.4564433994886203,"accuracy_stderr":0.03512517299285198,"f1_stderr":0.029213561687871915,"main_score":0.4720000000000001}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Language Identification","task_description":"A dataset for Nordic language identification.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:26:29.341469","scores":{"da":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"sv":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"nb":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"nn":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"is":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"fo":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Massive Intent","task_description":"MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:20:45.850492","scores":{"da":{"accuracy":0.42844653665097515,"f1":0.42183077398989566,"accuracy_stderr":0.009531652284439484,"f1_stderr":0.011720697125388892,"main_score":0.42844653665097515},"nb":{"accuracy":0.42737054472091457,"f1":0.4065771240847707,"accuracy_stderr":0.012555140439491758,"f1_stderr":0.011758439799234426,"main_score":0.42737054472091457},"sv":{"accuracy":0.6910894418291863,"f1":0.6651160698998817,"accuracy_stderr":0.013941949333232515,"f1_stderr":0.012610378122239658,"main_score":0.6910894418291863}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Massive Scenario","task_description":"MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:22:36.232225","scores":{"da":{"accuracy":0.4964021519838601,"f1":0.48223918173555036,"accuracy_stderr":0.013060130811491842,"f1_stderr":0.010102691330202264,"main_score":0.4964021519838601},"nb":{"accuracy":0.4948890383322125,"f1":0.4762622480394999,"accuracy_stderr":0.012910151752994623,"f1_stderr":0.013032592130554148,"main_score":0.4948890383322125},"sv":{"accuracy":0.7595830531271015,"f1":0.7530102836662811,"accuracy_stderr":0.01613987895090787,"f1_stderr":0.013497075103297649,"main_score":0.7595830531271015}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"NoReC","task_description":"A Norwegian dataset for sentiment classification on review","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:27:09.39258","scores":{"nb":{"accuracy":0.43525390625,"f1":0.4148123251467906,"accuracy_stderr":0.01785160523791383,"f1_stderr":0.016923236820285816,"main_score":0.43525390625}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Norwegian parliament","task_description":"Norwegian parliament speeches annotated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:30:21.169416","scores":{"nb":{"accuracy":0.5574166666666668,"f1":0.5530107857827613,"ap":0.5325918715216138,"accuracy_stderr":0.020243140127515356,"f1_stderr":0.02220607579055956,"ap_stderr":0.012785093237452189,"main_score":0.5574166666666668}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"ScaLA","task_description":"A linguistic acceptability task for Danish, Norwegian Bokmål Norwegian Nynorsk and Swedish.","task_version":"1.1.1.dev0","time_of_run":"2023-09-13T15:34:22.754407","scores":{"da":{"accuracy":0.501220703125,"f1":0.4994319866681282,"ap":0.5006382834998069,"accuracy_stderr":0.005056717572560542,"f1_stderr":0.004478704690331606,"ap_stderr":0.0025455551645878903,"main_score":0.501220703125},"nb":{"accuracy":0.50341796875,"f1":0.49599382125168273,"ap":0.5017502977252766,"accuracy_stderr":0.004784159653873394,"f1_stderr":0.008067901398966753,"ap_stderr":0.002431992168215134,"main_score":0.50341796875},"sv":{"accuracy":0.4984375,"f1":0.4943650186171043,"ap":0.4992456314269318,"accuracy_stderr":0.005148068497303595,"f1_stderr":0.006846332722567433,"ap_stderr":0.002557397210459417,"main_score":0.4984375},"nn":{"accuracy":0.50009765625,"f1":0.4977961797199092,"ap":0.5000778352453731,"accuracy_stderr":0.005493109808759856,"f1_stderr":0.00640345588489854,"ap_stderr":0.0027285475508133294,"main_score":0.50009765625}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"SweFAQ","task_description":"A Swedish QA dataset derived from FAQ","task_version":"0.0.1","time_of_run":"2023-09-04T19:24:19.566584","scores":{"sv":{"ndcg_at_1":0.56725,"ndcg_at_3":0.68335,"ndcg_at_5":0.71238,"ndcg_at_10":0.73295,"ndcg_at_100":0.75403,"ndcg_at_1000":0.75533,"map_at_1":0.56725,"map_at_3":0.65595,"map_at_5":0.67212,"map_at_10":0.68084,"map_at_100":0.68553,"map_at_1000":0.68559,"recall_at_1":0.56725,"recall_at_3":0.76218,"recall_at_5":0.83236,"recall_at_10":0.89474,"recall_at_100":0.99025,"recall_at_1000":1.0,"precision_at_1":0.56725,"precision_at_3":0.25406,"precision_at_5":0.16647,"precision_at_10":0.08947,"precision_at_100":0.0099,"precision_at_1000":0.001,"mrr_at_1":0.56725,"mrr_at_3":0.65595,"mrr_at_5":0.67212,"mrr_at_10":0.68084,"mrr_at_100":0.68553,"mrr_at_1000":0.68559}},"main_score":"ndcg_at_10"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"SweReC","task_description":"A Swedish dataset for sentiment classification on review","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:32:09.364853","scores":{"sv":{"accuracy":0.7140625,"f1":0.6458174750991554,"accuracy_stderr":0.024781608357670588,"f1_stderr":0.0193427061836479,"main_score":0.7140625}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Angry Tweets","task_description":"A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T23:30:08.745371","scores":{"da":{"accuracy":0.4458452722063037,"f1":0.4353629739721945,"accuracy_stderr":0.028880980134025704,"f1_stderr":0.029111483185951324,"main_score":0.4458452722063037}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Bornholm Parallel","task_description":"Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T23:30:37.205614","scores":{"da":{"precision":0.0603645764405845,"recall":0.092,"f1":0.06602699130934425,"accuracy":0.092,"main_score":0.06602699130934425},"da-bornholm":{"precision":0.0603645764405845,"recall":0.092,"f1":0.06602699130934425,"accuracy":0.092,"main_score":0.06602699130934425}},"main_score":"f1"}
1 change: 1 addition & 0 deletions src/seb/cache/KB__bert-base-swedish-cased/DKHate.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"DKHate","task_description":"Danish Tweets annotated for Hate Speech either being Offensive or not","task_version":"1.0.3.dev0","time_of_run":"2023-07-30T14:01:46.078734","scores":{"da":{"accuracy":0.5553191489361702,"f1":0.45947587346589847,"ap":0.8870741597469347,"accuracy_stderr":0.08659200521947144,"f1_stderr":0.058762937812424136,"ap_stderr":0.011945599407142385,"main_score":0.5553191489361702}},"main_score":"accuracy"}
1 change: 1 addition & 0 deletions src/seb/cache/KB__bert-base-swedish-cased/DaLAJ.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"DaLAJ","task_description":"A Swedish dataset for linguistic accebtablity. Available as a part of Superlim.","task_version":"1.0.3.dev0","time_of_run":"2023-07-28T11:03:11.954025","scores":{"sv":{"accuracy":0.5176801801801801,"f1":0.5152889708732695,"ap":0.5101123206773084,"accuracy_stderr":0.028431283803400806,"f1_stderr":0.029514988893198382,"ap_stderr":0.015255890577832387,"main_score":0.5176801801801801}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Da Political Comments","task_description":"A dataset of Danish political comments rated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T23:31:58.639984","scores":{"da":{"accuracy":0.28546059933407325,"f1":0.2514372540209549,"accuracy_stderr":0.025401660170899606,"f1_stderr":0.01728671199739282,"main_score":0.28546059933407325}},"main_score":"accuracy"}
1 change: 1 addition & 0 deletions src/seb/cache/KB__bert-base-swedish-cased/LCC.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"LCC","task_description":"The leipzig corpora collection, annotated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T23:30:25.154348","scores":{"da":{"accuracy":0.41200000000000003,"f1":0.3942888799816736,"accuracy_stderr":0.06469071717711042,"f1_stderr":0.05673115864109014,"main_score":0.41200000000000003}},"main_score":"accuracy"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"task_name":"Language Identification","task_description":"A dataset for Nordic language identification.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T23:41:03.343376","scores":{"da":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245},"sv":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245},"nb":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245},"nn":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245},"is":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245},"fo":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245}},"main_score":"accuracy"}
Loading

0 comments on commit 67e571c

Please sign in to comment.