Merge pull request #111 from biocypher/benchmark-yaml-hash
Benchmark-yaml-hash
slobentanzer authored Feb 6, 2024
2 parents 480a285 + 0b9a2a8 commit 48432dc
Showing 32 changed files with 2,174 additions and 1,199 deletions.
67 changes: 46 additions & 21 deletions benchmark/benchmark_utils.py
@@ -6,45 +6,62 @@
 def benchmark_already_executed(
     model_name: str,
     task: str,
-    subtask: str,
+    md5_hash: str,
 ) -> bool:
     """
-    Checks if the benchmark task and subtask test case for the model_name have already
-    been executed.
+    Checks if the benchmark task and subtask test case for the model_name have
+    already been executed.
     Args:
-        task (str): The benchmark task, e.g. "biocypher_query_generation"
-        subtask (str): The benchmark subtask test case, e.g. "0_entities"
         model_name (str): The model name, e.g. "gpt-3.5-turbo"
+        task (str): The benchmark task, e.g. "biocypher_query_generation"
+        md5_hash (str): The md5 hash of the test case, e.g.,
+            "72434e7a340a3f6dd047b944988491b7". It is created from the
+            dictionary representation of the test case.
     Returns:
-        bool: True if the benchmark task and subtask for the model_name has
-            already been run, False otherwise
+        bool: True if the benchmark case for the model_name has already been
+            run, False otherwise
     """
     task_results = return_or_create_result_file(task)
-    task_results_subset = (task_results["model_name"] == model_name) & (
-        task_results["subtask"] == subtask
+
+    if task_results.empty:
+        return False
+
+    run = (
+        task_results[
+            (task_results["model_name"] == model_name)
+            & (task_results["md5_hash"] == md5_hash)
+        ].shape[0]
+        > 0
     )
-    return task_results_subset.any()
+
+    return run


 def skip_if_already_run(
     model_name: str,
     task: str,
-    subtask: str,
+    md5_hash: str,
 ) -> None:
     """Helper function to check if the test case is already executed.
     Args:
         model_name (str): The model name, e.g. "gpt-3.5-turbo"
         result_files (dict[str, pd.DataFrame]): The result files
         task (str): The benchmark task, e.g. "biocypher_query_generation"
-        subtask (str): The benchmark subtask test case, e.g. "0_single_word"
+        md5_hash (str): The md5 hash of the test case, e.g.,
+            "72434e7a340a3f6dd047b944988491b7". It is created from the
+            dictionary representation of the test case.
     """
-    if benchmark_already_executed(model_name, task, subtask):
+    if benchmark_already_executed(model_name, task, md5_hash):
         pytest.skip(
-            f"benchmark {task}: {subtask} with {model_name} already executed"
+            f"Benchmark for {task} with hash {md5_hash} with {model_name} already executed"
         )
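
The docstrings above state that the md5 hash is created from the dictionary representation of the test case; the hashing helper itself is not shown in this diff. A minimal sketch of how such a hash could be computed, assuming json.dumps with sorted keys for a deterministic serialisation (the function name is invented, not code from this commit):

import hashlib
import json


def hash_test_case(test_case: dict) -> str:
    # Hypothetical helper: serialise the test-case dictionary deterministically
    # (sorted keys), then take the md5 digest of the resulting string.
    serialised = json.dumps(test_case, sort_keys=True, default=str)
    return hashlib.md5(serialised.encode("utf-8")).hexdigest()


# The resulting hex digest would be passed as `md5_hash` to
# benchmark_already_executed / skip_if_already_run.
print(hash_test_case({"case": "entities", "input": "..."}))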


@@ -65,26 +82,34 @@ def return_or_create_result_file(
         results = pd.read_csv(file_path, header=0)
     except (pd.errors.EmptyDataError, FileNotFoundError):
         results = pd.DataFrame(
-            columns=["model_name", "subtask", "score", "iterations"]
+            columns=["model_name", "subtask", "score", "iterations", "md5_hash"]
         )
         results.to_csv(file_path, index=False)
     return results


 def write_results_to_file(
-    model_name: str, subtask: str, score: str, iterations: str, file_path: str
+    model_name: str,
+    subtask: str,
+    score: str,
+    iterations: str,
+    md5_hash: str,
+    file_path: str,
 ):
     """Writes the benchmark results for the subtask to the result file.
     Args:
         model_name (str): The model name, e.g. "gpt-3.5-turbo"
-        subtask (str): The benchmark subtask test case, e.g. "entities_0"
-        score (str): The benchmark score, e.g. "1/1"
-        iterations (str): The number of iterations, e.g. "1"
+        subtask (str): The benchmark subtask test case, e.g. "entities"
+        score (str): The benchmark score, e.g. "5"
+        iterations (str): The number of iterations, e.g. "7"
+        md5_hash (str): The md5 hash of the test case
         file_path (str): The path to the result file
     """
     results = pd.read_csv(file_path, header=0)
     new_row = pd.DataFrame(
-        [[model_name, subtask, score, iterations]], columns=results.columns
+        [[model_name, subtask, score, iterations, md5_hash]],
+        columns=results.columns,
     )
     results = pd.concat([results, new_row], ignore_index=True).sort_values(
         by=["model_name", "subtask"]
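
A short usage sketch of the updated write_results_to_file signature; the argument values reuse the examples from the docstring above, while the file path is invented for illustration:

write_results_to_file(
    model_name="gpt-3.5-turbo",
    subtask="entities",
    score="5",
    iterations="7",
    md5_hash="72434e7a340a3f6dd047b944988491b7",
    file_path="benchmark/results/biocypher_query_generation.csv",  # invented path
)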
90 changes: 45 additions & 45 deletions benchmark/conftest.py
@@ -30,22 +30,20 @@
             13,
             70,
         ],
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "quantization": [
-            "q2_K",
-            # "q3_K_L",
-            "q3_K_M",
-            # "q3_K_S",
-            "q4_0",
-            "q4_1",
-            "q4_K_M",
-            "q4_K_S",
-            "q5_0",
-            # "q5_1",
-            "q5_K_M",
-            # "q5_K_S",
-            "q6_K",
-            "q8_0",
+            "Q2_K",
+            # "Q3_K_S",
+            "Q3_K_M",
+            # "Q3_K_L",
+            # "Q4_0",
+            # "Q4_K_S",
+            "Q4_K_M",
+            # "Q5_0",
+            # "Q5_K_S",
+            "Q5_K_M",
+            "Q6_K",
+            "Q8_0",
         ],
     },
     "mixtral-instruct-v0.1": {
@@ -55,12 +53,12 @@
"model_format": "ggufv2",
"quantization": [
"Q2_K",
# "Q3_K_M",
"Q4_0",
"Q3_K_M",
# "Q4_0",
"Q4_K_M",
"Q5_0",
# "Q5_K_M",
# "Q6_K",
"Q6_K",
"Q8_0",
],
},
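
For orientation, the entries above parametrise which quantised builds of each open-source model are benchmarked. A small sketch of how such entries could be expanded into model/size/quantization combinations; the dictionary name, the "llama-2-chat" key, and the "model_size_in_billions" key are assumptions inferred from the visible fragment, not code from this commit:

from itertools import product

# Assumed layout, reconstructed from the keys visible in this diff.
MODELS = {
    "llama-2-chat": {
        "model_size_in_billions": [7, 13, 70],
        "model_format": "ggufv2",
        "quantization": ["Q2_K", "Q3_K_M", "Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0"],
    },
}

combinations = [
    (name, size, quant)
    for name, config in MODELS.items()
    for size, quant in product(
        config["model_size_in_billions"], config["quantization"]
    )
]
print(len(combinations))  # 3 sizes x 6 quantizations = 18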
@@ -83,6 +81,21 @@
 BENCHMARK_URL = "http://localhost:9997"


+def pytest_collection_modifyitems(items):
+    """
+    Pytest hook function to modify the collected test items.
+    Called once after collection has been performed.
+    Used here to order items by their `callspec.id` (which starts with the
+    model name and configuration) to ensure running all tests for one model
+    before moving to the next model.
+    """
+
+    items.sort(
+        key=lambda item: (item.callspec.id if hasattr(item, "callspec") else "")
+    )
+
+
 # parameterise tests to run for each model
 @pytest.fixture(params=BENCHMARKED_MODELS)
 def model_name(request):
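
As a rough illustration of the ordering the hook above produces, sorting parametrised test items by their id groups all cases of one model together. The ids below are invented; real ids are generated by pytest from the fixture parameters and start with the model name:

ids = [
    "gpt-4-0_entities",
    "gpt-3.5-turbo-1_relationships",
    "gpt-3.5-turbo-0_entities",
]
print(sorted(ids))
# ['gpt-3.5-turbo-0_entities', 'gpt-3.5-turbo-1_relationships', 'gpt-4-0_entities']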
@@ -249,25 +262,27 @@ def result_files():
         if file.endswith(".csv")
     ]
     result_files = {}
+    result_columns = [
+        "model_name",
+        "subtask",
+        "score",
+        "iterations",
+        "md5_hash",
+    ]
     for file in RESULT_FILES:
         try:
             result_file = pd.read_csv(file, header=0)
         except (pd.errors.EmptyDataError, FileNotFoundError):
             result_file = pd.DataFrame(
-                columns=["model_name", "subtask", "score", "iterations"]
+                columns=result_columns,
             )
             result_file.to_csv(file, index=False)

         if not np.array_equal(
             result_file.columns,
-            ["model_name", "subtask", "score", "iterations"],
+            result_columns,
         ):
-            result_file.columns = [
-                "model_name",
-                "subtask",
-                "score",
-                "iterations",
-            ]
+            result_file.columns = result_columns

         result_files[file] = result_file

@@ -281,36 +296,21 @@ def pytest_generate_tests(metafunc):
     If fixture is part of test declaration, the test is parametrized.
     """
     # Load the data file
-    data_file = BENCHMARK_DATASET["./data/benchmark_data.csv"]
-    data_file["index"] = data_file.index
-
-    # Initialize a dictionary to collect rows for each test type
-    test_rows = {
-        "biocypher_query_generation": [],
-        "rag_interpretation": [],
-        "text_extraction": [],
-    }
-
-    # Iterate over each row in the DataFrame
-    for index, row in data_file.iterrows():
-        test_type = row["test_type"]
-        if test_type in test_rows:
-            # Add the row to the list for this test type
-            test_rows[test_type].append(row)
+    data_file = BENCHMARK_DATASET["./data/benchmark_data.yaml"]

     # Parametrize the fixtures with the collected rows
     if "test_data_biocypher_query_generation" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_biocypher_query_generation",
-            test_rows["biocypher_query_generation"],
+            data_file["biocypher_query_generation"],
         )
     if "test_data_rag_interpretation" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_rag_interpretation",
-            test_rows["rag_interpretation"],
+            data_file["rag_interpretation"],
         )
     if "test_data_text_extraction" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_text_extraction",
-            test_rows["text_extraction"],
+            data_file["text_extraction"],
         )
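
The layout of the new benchmark_data.yaml is not included in this excerpt. A minimal sketch of a structure that would satisfy the lookups in pytest_generate_tests above, with one top-level key per test type mapping to a list of test cases, loaded with PyYAML; the per-case field names and contents are invented:

import yaml  # PyYAML

example_yaml = """
biocypher_query_generation:
  - case: entities
    input: "Which genes are associated with cystic fibrosis?"
rag_interpretation:
  - case: explicit_relevance_single_fragment
    input: "..."
text_extraction:
  - case: source_data
    input: "..."
"""

data_file = yaml.safe_load(example_yaml)
# Mirrors the indexing used above, e.g. data_file["biocypher_query_generation"].
print(len(data_file["biocypher_query_generation"]))  # -> 1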
8 changes: 0 additions & 8 deletions benchmark/data/benchmark_data.csv

This file was deleted.
