Merge pull request #50 from KennethEnevoldsen/run-using-cache

Add public cache to benchmark
KennethEnevoldsen · Jan 14, 2024 · 67e571c · 67e571c
2 parents 6d9d7c1 + b7c3012
commit 67e571c
Show file tree

Hide file tree

Showing 415 changed files with 908 additions and 29 deletions.
diff --git a/.github/workflows/check_benchmark_is_up_to_date.yml b/.github/workflows/check_benchmark_is_up_to_date.yml
@@ -0,0 +1,28 @@
+name: Benchmark is up to date  # CI gate: runs `make check-benchmark-is-up-to-date` (see the final step)
+on:  # triggers: pushes to main and pull requests targeting main
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  bench-is-up-to-date:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"  # quoted so YAML keeps it a string rather than the float 3.1
+          cache: "pip"  # setup-python's built-in pip cache
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          make install
+
+      - name: Check benchmark is up to date  # makefile target of the same name runs src/scripts/check_benchmark_is_up_to_date.py
+        shell: bash
+        run: |
+          make check-benchmark-is-up-to-date
diff --git a/makefile b/makefile
@@ -22,9 +22,14 @@ pr:
 	make test
 	@echo "Ready to make a PR"
 
+update-table-in-docs:
+	@echo "--- 🔄 Updating table in docs ---"
+	python src/scripts/create_desc_stats.py
+
 build-docs:
 	@echo "--- 📚 Building docs ---"
 	@echo "Builds the docs and puts them in the 'site' folder"
+	@echo "You might need to also update the table with the desc. stats you can do this by running 'make update-table-in-docs'"
 	mkdocs build
 
 view-docs:
@@ -37,15 +42,16 @@ update-from-template:
 	cruft update --skip-apply-ask
 
 update-benchmark:
-	datawrapper_api_key=$(cat datawrapper_api_key.txt)
-	python docs/run_benchmark.py --data-wrapper-api-token $datawrapper_api_key
-
-update-benchmark-on-ucloud:
 	# set environment variables
 	hf_api_key=$(cat hf_api_key.txt)
 	export HF_TOKEN=hf_api_key
-	export SEB_CACHE_DIR=./seb_cache
 
 	# run benchmark
 	datawrapper_api_key=$(cat datawrapper_api_key.txt)
-	python docs/run_benchmark.py --data-wrapper-api-token $datawrapper_api_key
+	python docs/run_benchmark.py --data-wrapper-api-token $datawrapper_api_key
+
+
+check-benchmark-is-up-to-date:
+	@echo "--- 🔄 Checking benchmark is up to date ---"
+
+	python src/scripts/check_benchmark_is_up_to_date.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,9 +22,9 @@ requires-python = ">=3.9"
 dependencies = [
   "tabulate>=0.9.0",
   "mteb[beir]==1.1.1",
-  "typer>=0.7.0",
   "pydantic>=2.1.0",
   "catalogue>=2.0.8",
+
 ]
 
 [project.license]

diff --git a/src/scripts/check_benchmark_is_up_to_date.py b/src/scripts/check_benchmark_is_up_to_date.py
@@ -0,0 +1,9 @@
+from seb import run_benchmark
+
+
+def main():  # Script entry point; invoked by the `check-benchmark-is-up-to-date` makefile target.
+    run_benchmark(use_cache=True, run_models=False, raise_errors=True)  # cache-only run; presumably raises when a cached result is missing — TODO confirm against seb.run_benchmark
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/seb/benchmark.py b/src/seb/benchmark.py
@@ -14,11 +14,13 @@
 logger = logging.getLogger(__name__)
 
 
-def get_cache_path(task: Task, model: SebModel) -> Path:
+def get_cache_path(
+    task: Task, model: SebModel, cache_dir: Optional[Path] = None
+) -> Path:
     """
     Get the cache path for a task and model.
     """
-    cache_path = get_cache_dir()
+    cache_path = cache_dir if cache_dir is not None else get_cache_dir()
     mdl_path_name = model.meta.get_path_name()
     task_path_name = name_to_path(task.name) + ".json"
     task_cache_path = cache_path / mdl_path_name / task_path_name
@@ -29,15 +31,28 @@ def run_task(
     task: Task,
     model: SebModel,
     use_cache: bool,
+    run_model: bool,
     raise_errors: bool,
+    cache_dir: Optional[Path] = None,
 ) -> Union[TaskResult, TaskError]:
     """
     Tests a model on a task
     """
+    if run_model is False and use_cache is False:
+        raise ValueError("run_model and use_cache cannot both be False")
+    if not raise_errors and run_model is False:
+        raise ValueError("raise_errors cannot be False when run_model is False")
 
     if not raise_errors:
         try:
-            return run_task(task, model, use_cache, raise_errors=True)
+            return run_task(
+                task=task,
+                model=model,
+                use_cache=use_cache,
+                run_model=run_model,
+                raise_errors=True,
+                cache_dir=cache_dir,
+            )
         except Exception as e:
             logger.error(f"Error when running {task.name} on {model.meta.name}: {e}")
             return TaskError(
@@ -46,13 +61,19 @@ def run_task(
                 time_of_run=datetime.now(),
             )
 
-    cache_path = get_cache_path(task, model)
+    cache_path = get_cache_path(task, model, cache_dir)
     if cache_path.exists() and use_cache:
         logger.info(f"Loading cached result for {model.meta.name} on {task.name}")
         task_result = TaskResult.from_disk(cache_path)
         return task_result
 
     cache_path.parent.mkdir(parents=True, exist_ok=True)
+
+    if not run_model:
+        raise ValueError(
+            f"Cache for {model.meta.name} on {task.name} does not exist. "
+            "Set run_model=True to run the model.",
+        )
     with WarningIgnoreContextManager():
         task_result = task.evaluate(model)
     task_result.to_disk(cache_path)
@@ -104,15 +125,19 @@ def evaluate_model(
         self,
         model: SebModel,
         use_cache: bool = True,
+        run_model: bool = True,
         raise_errors: bool = True,
+        cache_dir: Optional[Path] = None,
     ) -> BenchmarkResults:
         """
         Evaluate a model on the benchmark.
 
         Args:
             model: The model to evaluate.
             use_cache: Whether to use the cache.
+            run_model: Whether to run the model if the cache is not present.
             raise_errors: Whether to raise errors.
+            cache_dir: The cache directory to use. If None, the default cache directory is used.
 
         Returns:
             The results of the benchmark.
@@ -121,7 +146,14 @@ def evaluate_model(
         task_results = []
         pbar = tqdm(tasks, position=1, desc=f"Running {model.meta.name}", leave=False)
         for task in pbar:
-            task_result = run_task(task, model, use_cache, raise_errors)
+            task_result = run_task(
+                task,
+                model,
+                use_cache=use_cache,
+                run_model=run_model,
+                raise_errors=raise_errors,
+                cache_dir=cache_dir,
+            )
             task_results.append(task_result)
 
         return BenchmarkResults(meta=model.meta, task_results=task_results)
@@ -130,15 +162,19 @@ def evaluate_models(
         self,
         models: list[SebModel],
         use_cache: bool = True,
+        run_model: bool = True,
         raise_errors: bool = True,
+        cache_dir: Optional[Path] = None,
     ) -> list[BenchmarkResults]:
         """
         Evaluate a list of models on the benchmark.
 
         Args:
             models: The models to evaluate.
             use_cache: Whether to use the cache.
+            run_model: Whether to run the model if the cache is not present.
             raise_errors: Whether to raise errors.
+            cache_dir: The cache directory to use. If None, the default cache directory is used.
 
         Returns:
             The results of the benchmark, once for each model.
@@ -151,7 +187,9 @@ def evaluate_models(
                 self.evaluate_model(
                     model,
                     use_cache=use_cache,
+                    run_model=run_model,
                     raise_errors=raise_errors,
+                    cache_dir=cache_dir,
                 ),
             )
         return results
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/Angry_Tweets.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/Angry_Tweets.json
@@ -0,0 +1 @@
+{"task_name":"Angry Tweets","task_description":"A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:15:07.906466","scores":{"da":{"accuracy":0.44460362941738296,"f1":0.4380942035064149,"accuracy_stderr":0.02809792891547516,"f1_stderr":0.02869393997039908,"main_score":0.44460362941738296}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/Bornholm_Parallel.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/Bornholm_Parallel.json
@@ -0,0 +1 @@
+{"task_name":"Bornholm Parallel","task_description":"Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:15:39.189398","scores":{"da":{"precision":0.12769630591630593,"recall":0.188,"f1":0.1408286249981902,"accuracy":0.188,"main_score":0.1408286249981902},"da-bornholm":{"precision":0.12769630591630593,"recall":0.188,"f1":0.1408286249981902,"accuracy":0.188,"main_score":0.1408286249981902}},"main_score":"f1"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/DKHate.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/DKHate.json
@@ -0,0 +1 @@
+{"task_name":"DKHate","task_description":"Danish Tweets annotated for Hate Speech either being Offensive or not","task_version":"1.0.3.dev0","time_of_run":"2023-07-30T13:55:26.414673","scores":{"da":{"accuracy":0.5936170212765958,"f1":0.48062030395159827,"ap":0.8897617189814045,"accuracy_stderr":0.09420580255043687,"f1_stderr":0.05292797998084632,"ap_stderr":0.006783234279065605,"main_score":0.5936170212765958}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/DaLAJ.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/DaLAJ.json
@@ -0,0 +1 @@
+{"task_name":"DaLAJ","task_description":"A Swedish dataset for linguistic accebtablity. Available as a part of Superlim.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:32:34.602207","scores":{"sv":{"accuracy":0.5011261261261262,"f1":0.4981211967409228,"ap":0.5005828726112415,"accuracy_stderr":0.004419238033862288,"f1_stderr":0.0035063858678943075,"ap_stderr":0.0022274185349420317,"main_score":0.5011261261261262}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/Da_Political_Comments.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/Da_Political_Comments.json
@@ -0,0 +1 @@
+{"task_name":"Da Political Comments","task_description":"A dataset of Danish political comments rated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:17:00.221073","scores":{"da":{"accuracy":0.28546059933407325,"f1":0.2577317269849485,"accuracy_stderr":0.023480148626138817,"f1_stderr":0.01681654012226201,"main_score":0.28546059933407325}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/LCC.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/LCC.json
@@ -0,0 +1 @@
+{"task_name":"LCC","task_description":"The leipzig corpora collection, annotated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:15:26.284602","scores":{"da":{"accuracy":0.4720000000000001,"f1":0.4564433994886203,"accuracy_stderr":0.03512517299285198,"f1_stderr":0.029213561687871915,"main_score":0.4720000000000001}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/Language_Identification.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/Language_Identification.json
@@ -0,0 +1 @@
+{"task_name":"Language Identification","task_description":"A dataset for Nordic language identification.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:26:29.341469","scores":{"da":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"sv":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"nb":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"nn":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"is":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"fo":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/Massive_Intent.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/Massive_Intent.json
@@ -0,0 +1 @@
+{"task_name":"Massive Intent","task_description":"MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:20:45.850492","scores":{"da":{"accuracy":0.42844653665097515,"f1":0.42183077398989566,"accuracy_stderr":0.009531652284439484,"f1_stderr":0.011720697125388892,"main_score":0.42844653665097515},"nb":{"accuracy":0.42737054472091457,"f1":0.4065771240847707,"accuracy_stderr":0.012555140439491758,"f1_stderr":0.011758439799234426,"main_score":0.42737054472091457},"sv":{"accuracy":0.6910894418291863,"f1":0.6651160698998817,"accuracy_stderr":0.013941949333232515,"f1_stderr":0.012610378122239658,"main_score":0.6910894418291863}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/Massive_Scenario.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/Massive_Scenario.json
@@ -0,0 +1 @@
+{"task_name":"Massive Scenario","task_description":"MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:22:36.232225","scores":{"da":{"accuracy":0.4964021519838601,"f1":0.48223918173555036,"accuracy_stderr":0.013060130811491842,"f1_stderr":0.010102691330202264,"main_score":0.4964021519838601},"nb":{"accuracy":0.4948890383322125,"f1":0.4762622480394999,"accuracy_stderr":0.012910151752994623,"f1_stderr":0.013032592130554148,"main_score":0.4948890383322125},"sv":{"accuracy":0.7595830531271015,"f1":0.7530102836662811,"accuracy_stderr":0.01613987895090787,"f1_stderr":0.013497075103297649,"main_score":0.7595830531271015}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/NoReC.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/NoReC.json
@@ -0,0 +1 @@
+{"task_name":"NoReC","task_description":"A Norwegian dataset for sentiment classification on review","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:27:09.39258","scores":{"nb":{"accuracy":0.43525390625,"f1":0.4148123251467906,"accuracy_stderr":0.01785160523791383,"f1_stderr":0.016923236820285816,"main_score":0.43525390625}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/Norwegian_parliament.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/Norwegian_parliament.json
@@ -0,0 +1 @@
+{"task_name":"Norwegian parliament","task_description":"Norwegian parliament speeches annotated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:30:21.169416","scores":{"nb":{"accuracy":0.5574166666666668,"f1":0.5530107857827613,"ap":0.5325918715216138,"accuracy_stderr":0.020243140127515356,"f1_stderr":0.02220607579055956,"ap_stderr":0.012785093237452189,"main_score":0.5574166666666668}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/ScaLA.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/ScaLA.json
@@ -0,0 +1 @@
+{"task_name":"ScaLA","task_description":"A linguistic acceptability task for Danish, Norwegian Bokmål Norwegian Nynorsk and Swedish.","task_version":"1.1.1.dev0","time_of_run":"2023-09-13T15:34:22.754407","scores":{"da":{"accuracy":0.501220703125,"f1":0.4994319866681282,"ap":0.5006382834998069,"accuracy_stderr":0.005056717572560542,"f1_stderr":0.004478704690331606,"ap_stderr":0.0025455551645878903,"main_score":0.501220703125},"nb":{"accuracy":0.50341796875,"f1":0.49599382125168273,"ap":0.5017502977252766,"accuracy_stderr":0.004784159653873394,"f1_stderr":0.008067901398966753,"ap_stderr":0.002431992168215134,"main_score":0.50341796875},"sv":{"accuracy":0.4984375,"f1":0.4943650186171043,"ap":0.4992456314269318,"accuracy_stderr":0.005148068497303595,"f1_stderr":0.006846332722567433,"ap_stderr":0.002557397210459417,"main_score":0.4984375},"nn":{"accuracy":0.50009765625,"f1":0.4977961797199092,"ap":0.5000778352453731,"accuracy_stderr":0.005493109808759856,"f1_stderr":0.00640345588489854,"ap_stderr":0.0027285475508133294,"main_score":0.50009765625}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/SweFAQ.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/SweFAQ.json
@@ -0,0 +1 @@
+{"task_name":"SweFAQ","task_description":"A Swedish QA dataset derived from FAQ","task_version":"0.0.1","time_of_run":"2023-09-04T19:24:19.566584","scores":{"sv":{"ndcg_at_1":0.56725,"ndcg_at_3":0.68335,"ndcg_at_5":0.71238,"ndcg_at_10":0.73295,"ndcg_at_100":0.75403,"ndcg_at_1000":0.75533,"map_at_1":0.56725,"map_at_3":0.65595,"map_at_5":0.67212,"map_at_10":0.68084,"map_at_100":0.68553,"map_at_1000":0.68559,"recall_at_1":0.56725,"recall_at_3":0.76218,"recall_at_5":0.83236,"recall_at_10":0.89474,"recall_at_100":0.99025,"recall_at_1000":1.0,"precision_at_1":0.56725,"precision_at_3":0.25406,"precision_at_5":0.16647,"precision_at_10":0.08947,"precision_at_100":0.0099,"precision_at_1000":0.001,"mrr_at_1":0.56725,"mrr_at_3":0.65595,"mrr_at_5":0.67212,"mrr_at_10":0.68084,"mrr_at_100":0.68553,"mrr_at_1000":0.68559}},"main_score":"ndcg_at_10"}
diff --git a/src/seb/cache/KBLab__sentence-bert-swedish-cased/SweReC.json b/src/seb/cache/KBLab__sentence-bert-swedish-cased/SweReC.json
@@ -0,0 +1 @@
+{"task_name":"SweReC","task_description":"A Swedish dataset for sentiment classification on review","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:32:09.364853","scores":{"sv":{"accuracy":0.7140625,"f1":0.6458174750991554,"accuracy_stderr":0.024781608357670588,"f1_stderr":0.0193427061836479,"main_score":0.7140625}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KB__bert-base-swedish-cased/Angry_Tweets.json b/src/seb/cache/KB__bert-base-swedish-cased/Angry_Tweets.json
@@ -0,0 +1 @@
+{"task_name":"Angry Tweets","task_description":"A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T23:30:08.745371","scores":{"da":{"accuracy":0.4458452722063037,"f1":0.4353629739721945,"accuracy_stderr":0.028880980134025704,"f1_stderr":0.029111483185951324,"main_score":0.4458452722063037}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KB__bert-base-swedish-cased/Bornholm_Parallel.json b/src/seb/cache/KB__bert-base-swedish-cased/Bornholm_Parallel.json
@@ -0,0 +1 @@
+{"task_name":"Bornholm Parallel","task_description":"Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T23:30:37.205614","scores":{"da":{"precision":0.0603645764405845,"recall":0.092,"f1":0.06602699130934425,"accuracy":0.092,"main_score":0.06602699130934425},"da-bornholm":{"precision":0.0603645764405845,"recall":0.092,"f1":0.06602699130934425,"accuracy":0.092,"main_score":0.06602699130934425}},"main_score":"f1"}
diff --git a/src/seb/cache/KB__bert-base-swedish-cased/DKHate.json b/src/seb/cache/KB__bert-base-swedish-cased/DKHate.json
@@ -0,0 +1 @@
+{"task_name":"DKHate","task_description":"Danish Tweets annotated for Hate Speech either being Offensive or not","task_version":"1.0.3.dev0","time_of_run":"2023-07-30T14:01:46.078734","scores":{"da":{"accuracy":0.5553191489361702,"f1":0.45947587346589847,"ap":0.8870741597469347,"accuracy_stderr":0.08659200521947144,"f1_stderr":0.058762937812424136,"ap_stderr":0.011945599407142385,"main_score":0.5553191489361702}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KB__bert-base-swedish-cased/DaLAJ.json b/src/seb/cache/KB__bert-base-swedish-cased/DaLAJ.json
@@ -0,0 +1 @@
+{"task_name":"DaLAJ","task_description":"A Swedish dataset for linguistic accebtablity. Available as a part of Superlim.","task_version":"1.0.3.dev0","time_of_run":"2023-07-28T11:03:11.954025","scores":{"sv":{"accuracy":0.5176801801801801,"f1":0.5152889708732695,"ap":0.5101123206773084,"accuracy_stderr":0.028431283803400806,"f1_stderr":0.029514988893198382,"ap_stderr":0.015255890577832387,"main_score":0.5176801801801801}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KB__bert-base-swedish-cased/Da_Political_Comments.json b/src/seb/cache/KB__bert-base-swedish-cased/Da_Political_Comments.json
@@ -0,0 +1 @@
+{"task_name":"Da Political Comments","task_description":"A dataset of Danish political comments rated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T23:31:58.639984","scores":{"da":{"accuracy":0.28546059933407325,"f1":0.2514372540209549,"accuracy_stderr":0.025401660170899606,"f1_stderr":0.01728671199739282,"main_score":0.28546059933407325}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KB__bert-base-swedish-cased/LCC.json b/src/seb/cache/KB__bert-base-swedish-cased/LCC.json
@@ -0,0 +1 @@
+{"task_name":"LCC","task_description":"The leipzig corpora collection, annotated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T23:30:25.154348","scores":{"da":{"accuracy":0.41200000000000003,"f1":0.3942888799816736,"accuracy_stderr":0.06469071717711042,"f1_stderr":0.05673115864109014,"main_score":0.41200000000000003}},"main_score":"accuracy"}
diff --git a/src/seb/cache/KB__bert-base-swedish-cased/Language_Identification.json b/src/seb/cache/KB__bert-base-swedish-cased/Language_Identification.json
@@ -0,0 +1 @@
+{"task_name":"Language Identification","task_description":"A dataset for Nordic language identification.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T23:41:03.343376","scores":{"da":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245},"sv":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245},"nb":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245},"nn":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245},"is":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245},"fo":{"accuracy":0.6245,"f1":0.621903954791346,"accuracy_stderr":0.01292306293243035,"f1_stderr":0.012823268129236837,"main_score":0.6245}},"main_score":"accuracy"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"task_name":"Angry Tweets","task_description":"A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:15:07.906466","scores":{"da":{"accuracy":0.44460362941738296,"f1":0.4380942035064149,"accuracy_stderr":0.02809792891547516,"f1_stderr":0.02869393997039908,"main_score":0.44460362941738296}},"main_score":"accuracy"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"task_name":"Bornholm Parallel","task_description":"Danish Bornholmsk Parallel Corpus. Bornholmsk is a Danish dialect spoken on the island of Bornholm, Denmark. Historically it is a part of east Danish which was also spoken in Scania and Halland, Sweden.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:15:39.189398","scores":{"da":{"precision":0.12769630591630593,"recall":0.188,"f1":0.1408286249981902,"accuracy":0.188,"main_score":0.1408286249981902},"da-bornholm":{"precision":0.12769630591630593,"recall":0.188,"f1":0.1408286249981902,"accuracy":0.188,"main_score":0.1408286249981902}},"main_score":"f1"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"task_name":"DKHate","task_description":"Danish Tweets annotated for Hate Speech either being Offensive or not","task_version":"1.0.3.dev0","time_of_run":"2023-07-30T13:55:26.414673","scores":{"da":{"accuracy":0.5936170212765958,"f1":0.48062030395159827,"ap":0.8897617189814045,"accuracy_stderr":0.09420580255043687,"f1_stderr":0.05292797998084632,"ap_stderr":0.006783234279065605,"main_score":0.5936170212765958}},"main_score":"accuracy"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"task_name":"DaLAJ","task_description":"A Swedish dataset for linguistic accebtablity. Available as a part of Superlim.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:32:34.602207","scores":{"sv":{"accuracy":0.5011261261261262,"f1":0.4981211967409228,"ap":0.5005828726112415,"accuracy_stderr":0.004419238033862288,"f1_stderr":0.0035063858678943075,"ap_stderr":0.0022274185349420317,"main_score":0.5011261261261262}},"main_score":"accuracy"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"task_name":"Da Political Comments","task_description":"A dataset of Danish political comments rated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:17:00.221073","scores":{"da":{"accuracy":0.28546059933407325,"f1":0.2577317269849485,"accuracy_stderr":0.023480148626138817,"f1_stderr":0.01681654012226201,"main_score":0.28546059933407325}},"main_score":"accuracy"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"task_name":"LCC","task_description":"The leipzig corpora collection, annotated for sentiment","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:15:26.284602","scores":{"da":{"accuracy":0.4720000000000001,"f1":0.4564433994886203,"accuracy_stderr":0.03512517299285198,"f1_stderr":0.029213561687871915,"main_score":0.4720000000000001}},"main_score":"accuracy"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"task_name":"Language Identification","task_description":"A dataset for Nordic language identification.","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:26:29.341469","scores":{"da":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"sv":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"nb":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"nn":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"is":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666},"fo":{"accuracy":0.5144666666666666,"f1":0.5082242214165055,"accuracy_stderr":0.010790118936632086,"f1_stderr":0.013579053769348828,"main_score":0.5144666666666666}},"main_score":"accuracy"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"task_name":"Massive Intent","task_description":"MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:20:45.850492","scores":{"da":{"accuracy":0.42844653665097515,"f1":0.42183077398989566,"accuracy_stderr":0.009531652284439484,"f1_stderr":0.011720697125388892,"main_score":0.42844653665097515},"nb":{"accuracy":0.42737054472091457,"f1":0.4065771240847707,"accuracy_stderr":0.012555140439491758,"f1_stderr":0.011758439799234426,"main_score":0.42737054472091457},"sv":{"accuracy":0.6910894418291863,"f1":0.6651160698998817,"accuracy_stderr":0.013941949333232515,"f1_stderr":0.012610378122239658,"main_score":0.6910894418291863}},"main_score":"accuracy"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"task_name":"Massive Scenario","task_description":"MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:22:36.232225","scores":{"da":{"accuracy":0.4964021519838601,"f1":0.48223918173555036,"accuracy_stderr":0.013060130811491842,"f1_stderr":0.010102691330202264,"main_score":0.4964021519838601},"nb":{"accuracy":0.4948890383322125,"f1":0.4762622480394999,"accuracy_stderr":0.012910151752994623,"f1_stderr":0.013032592130554148,"main_score":0.4948890383322125},"sv":{"accuracy":0.7595830531271015,"f1":0.7530102836662811,"accuracy_stderr":0.01613987895090787,"f1_stderr":0.013497075103297649,"main_score":0.7595830531271015}},"main_score":"accuracy"}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"task_name":"NoReC","task_description":"A Norwegian dataset for sentiment classification on review","task_version":"1.0.3.dev0","time_of_run":"2023-07-27T20:27:09.39258","scores":{"nb":{"accuracy":0.43525390625,"f1":0.4148123251467906,"accuracy_stderr":0.01785160523791383,"f1_stderr":0.016923236820285816,"main_score":0.43525390625}},"main_score":"accuracy"}