feat: add Benchmark-VHP (#207)
This PR introduces a new benchmark context, as well as some behavioural changes to the prompt generator module. As such, the patch version will be increased to reflect the change.

* add safety KG and two questions

* convert node names to PascalCase internally (see the sketch after this list)

* change n

* add complex case

* make sure that selected entities are PascalCase

* create plot only for the safety questions

* run a limited set of tests for the hackathon

* refactor: split data file into multiple files
any benchmark file should end in _data.yaml

* comment

* run iteration of benchmark, including new questions and new OpenAI model

* add gpt-4o-2024-08-06 to docs hook

* benchmark results
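
For illustration, a minimal sketch of the PascalCase conversion mentioned in the node-name commit above; the helper name and the regex-based word splitting are assumptions, not necessarily the prompt generator's actual implementation:

```python
import re

def to_pascal_case(name: str) -> str:
    # Split on whitespace, underscores, hyphens, and dots, then capitalize
    # each word; "chemical entity" and "chemical_entity" both become
    # "ChemicalEntity".
    words = re.split(r"[\s_\-.]+", name.strip())
    return "".join(word.capitalize() for word in words if word)

assert to_pascal_case("chemical entity") == "ChemicalEntity"
assert to_pascal_case("side_effect") == "SideEffect"
```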
slobentanzer authored Oct 25, 2024
1 parent aee6a75 commit ef6a4bd
Showing 77 changed files with 2,537 additions and 1,103 deletions.
169 changes: 85 additions & 84 deletions benchmark/conftest.py
@@ -24,12 +24,13 @@
 
 # which models should be benchmarked?
 OPENAI_MODEL_NAMES = [
-    # "gpt-3.5-turbo-0125",
-    # "gpt-4-0613",
-    # "gpt-4-0125-preview",
-    # "gpt-4-turbo-2024-04-09",
-    # "gpt-4o-2024-05-13",
-    # "gpt-4o-mini-2024-07-18",
+    "gpt-3.5-turbo-0125",
+    "gpt-4-0613",
+    "gpt-4-0125-preview",
+    "gpt-4-turbo-2024-04-09",
+    "gpt-4o-2024-05-13",
+    "gpt-4o-2024-08-06",
+    "gpt-4o-mini-2024-07-18",
 ]
 
 ANTHROPIC_MODEL_NAMES = [
Expand All @@ -40,8 +41,8 @@
XINFERENCE_MODELS = {
# "code-llama-instruct": {
# "model_size_in_billions": [
# 7,
# 13,
# # 7,
# # 13,
# 34,
# ],
# "model_format": "ggufv2",
@@ -127,28 +128,28 @@
     #         # "FP16",
     #     ],
     # },
-    # "llama-2-chat": {
-    #     "model_size_in_billions": [
-    #         7,
-    #         13,
-    #         # 70,
-    #     ],
-    #     "model_format": "ggufv2",
-    #     "quantization": [
-    #         "Q2_K",
-    #         # "Q3_K_S",
-    #         "Q3_K_M",
-    #         # "Q3_K_L",
-    #         # "Q4_0",
-    #         # "Q4_K_S",
-    #         "Q4_K_M",
-    #         # "Q5_0",
-    #         # "Q5_K_S",
-    #         "Q5_K_M",
-    #         "Q6_K",
-    #         "Q8_0",
-    #     ],
-    # },
+    "llama-2-chat": {
+        "model_size_in_billions": [
+            7,
+            # 13,
+            # 70,
+        ],
+        "model_format": "ggufv2",
+        "quantization": [
+            "Q2_K",
+            # "Q3_K_S",
+            "Q3_K_M",
+            # "Q3_K_L",
+            # "Q4_0",
+            # "Q4_K_S",
+            "Q4_K_M",
+            # "Q5_0",
+            # "Q5_K_S",
+            "Q5_K_M",
+            "Q6_K",
+            "Q8_0",
+        ],
+    },
     # "llama-3-instruct": {
     #     "model_size_in_billions": [
     #         8,
@@ -168,31 +169,31 @@
     #         # "Q4_K_M",
     #     ],
     # },
-    # "llama-3.1-instruct": {
-    #     "model_size_in_billions": [
-    #         8,
-    #         # 70,
-    #     ],
-    #     "model_format": "ggufv2",
-    #     "quantization": [
-    #         # 8B model quantisations
-    #         # "Q3_K_L",
-    #         "IQ4_XS",
-    #         # "Q4_K_M",
-    #         # "Q5_K_M",
-    #         # "Q6_K",
-    #         # "Q8_0",
-    #         # 70B model quantisations
-    #         # "IQ2_M",
-    #         # "Q2_K",
-    #         # "Q3_K_S",
-    #         # "IQ4_XS",
-    #         # "Q4_K_M", # crazy slow on mbp m3 max
-    #         # "Q5_K_M",
-    #         # "Q6_K",
-    #         # "Q8_0",
-    #     ],
-    # },
+    "llama-3.1-instruct": {
+        "model_size_in_billions": [
+            8,
+            # 70,
+        ],
+        "model_format": "ggufv2",
+        "quantization": [
+            # 8B model quantisations
+            "Q3_K_L",
+            "IQ4_XS",
+            "Q4_K_M",
+            # "Q5_K_M",
+            # "Q6_K",
+            "Q8_0",
+            # 70B model quantisations
+            # "IQ2_M",
+            # "Q2_K",
+            # "Q3_K_S",
+            # "IQ4_XS",
+            # "Q4_K_M", # crazy slow on mbp m3 max
+            # "Q5_K_M",
+            # "Q6_K",
+            # "Q8_0",
+        ],
+    },
     # "mistral-instruct-v0.2": {
     #     "model_size_in_billions": [
     #         7,
@@ -238,26 +239,26 @@
     #         "none",
     #     ],
     # },
-    # "openhermes-2.5": {
-    #     "model_size_in_billions": [
-    #         7,
-    #     ],
-    #     "model_format": "ggufv2",
-    #     "quantization": [
-    #         "Q2_K",
-    #         # "Q3_K_S",
-    #         "Q3_K_M",
-    #         # "Q3_K_L",
-    #         # "Q4_0",
-    #         # "Q4_K_S",
-    #         "Q4_K_M",
-    #         # "Q5_0",
-    #         # "Q5_K_S",
-    #         "Q5_K_M",
-    #         "Q6_K",
-    #         "Q8_0",
-    #     ],
-    # },
+    "openhermes-2.5": {
+        "model_size_in_billions": [
+            7,
+        ],
+        "model_format": "ggufv2",
+        "quantization": [
+            "Q2_K",
+            # "Q3_K_S",
+            "Q3_K_M",
+            # "Q3_K_L",
+            # "Q4_0",
+            # "Q4_K_S",
+            "Q4_K_M",
+            # "Q5_0",
+            # "Q5_K_S",
+            "Q5_K_M",
+            "Q6_K",
+            "Q8_0",
+        ],
+    },
 }
 
 # create concrete benchmark list by concatenating all combinations of model
@@ -546,38 +547,38 @@ def pytest_generate_tests(metafunc):
     Called once for each test case in the benchmark test collection.
     If fixture is part of test declaration, the test is parametrized.
     """
-    # Load the data file
-    data_file = BENCHMARK_DATASET["benchmark_data.yaml"]
+    # Load the data
+    data = BENCHMARK_DATASET
 
     # Parametrize the fixtures with the collected rows
     if "test_data_biocypher_query_generation" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_biocypher_query_generation",
-            data_file["biocypher_query_generation"],
+            data["biocypher_query_generation"],
         )
     if "test_data_rag_interpretation" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_rag_interpretation",
-            data_file["rag_interpretation"],
+            data["rag_interpretation"],
         )
    if "test_data_text_extraction" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_text_extraction",
-            data_file["text_extraction"],
+            data["text_extraction"],
         )
     if "test_data_api_calling" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_api_calling",
-            data_file["api_calling"],
+            data["api_calling"],
         )
     if "test_data_medical_exam" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_medical_exam",
-            data_file["medical_exam"],
+            data["medical_exam"],
         )
 
 
 @pytest.fixture
 def kg_schemas():
-    data_file = BENCHMARK_DATASET["benchmark_data.yaml"]
-    return data_file["kg_schemas"]
+    data = BENCHMARK_DATASET
+    return data["kg_schemas"]
62 changes: 62 additions & 0 deletions benchmark/data/benchmark_api_calling_data.yaml
@@ -0,0 +1,62 @@
+# Top-level keys: benchmark modules
+# Values: list of dictionaries, each containing a test case
+#
+# Test case keys:
+# - input (for creating the test)
+# - expected (for asserting outcomes and generating a score)
+# - case (for categorizing the test case)
+#
+# If any input is a dictionary itself, it will be expanded into separate test
+# cases, using the top-level key to create a concatenated test case purpose.
+
+api_calling:
+  - case: oncokb:braf:melanoma
+    input:
+      prompt:
+        exact_spelling: "What is the consequence of the V600E BRAF variant in Melanoma?"
+    expected:
+      parts_of_query:
+        [
+          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?",
+          "hugoSymbol=BRAF",
+          "alteration=V600E",
+          "tumorType=Melanoma",
+        ]
+  - case: oncokb:tp53:colon_adenocarcinoma
+    input:
+      prompt:
+        exact_spelling: "What is the consequence of the R273C TP53 variant in Colon Adenocarcinoma?"
+    expected:
+      parts_of_query:
+        [
+          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?hugoSymbol=TP53",
+          "alteration=R273C",
+          "tumorType=Colon%20Adenocarcinoma",
+        ]
+  - case: oncokb:braf:histiocytosis
+    input:
+      prompt:
+        exact_spelling: "What is the consequence of the N486_P490del BRAF variant in Histiocytosis?"
+        descriptive_spelling: "What is the consequence of an N486_P490 deletion in BRAF in Histiocytosis?"
+    expected:
+      parts_of_query:
+        [
+          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?",
+          "hugoSymbol=BRAF",
+          "alteration=N486_P490del",
+          "tumorType=Histiocytosis",
+        ]
+  - case: oncokb:ros1:lung_adenocarcinoma
+    input:
+      prompt:
+        exact_spelling: "What is the consequence of the functional fusion of CD74 and ROS1 in Lung Adenocarcinoma?"
+    expected:
+      parts_of_query:
+        [
+          "https://demo.oncokb.org/api/v1/annotate/structuralVariants?",
+          "hugoSymbolA=CD74",
+          "hugoSymbolB=ROS1",
+          "structuralVariantType=FUSION",
+          "isFunctionalFusion=true",
+          "tumorType=Lung%20Adenocarcinoma",
+        ]
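
The header comment of this file describes how dictionary-valued inputs (such as the exact_spelling/descriptive_spelling prompts above) are expanded into separate test cases, and parts_of_query suggests fragment-based scoring of the generated API call. A hedged sketch of both mechanisms; the function names and the score definition are assumptions, not the repository's actual code:

```python
def expand_prompts(case: str, prompts: dict[str, str]) -> list[tuple[str, str]]:
    # One test case per prompt variant; the dictionary key is concatenated
    # onto the case name to form the test purpose, e.g.
    # "oncokb:braf:histiocytosis:descriptive_spelling".
    return [(f"{case}:{key}", prompt) for key, prompt in prompts.items()]

def score_query(built_query: str, parts_of_query: list[str]) -> float:
    # Fraction of expected fragments found in the model-built API query.
    return sum(part in built_query for part in parts_of_query) / len(parts_of_query)

query = (
    "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?"
    "hugoSymbol=BRAF&alteration=V600E&tumorType=Melanoma"
)
print(score_query(query, ["hugoSymbol=BRAF", "alteration=V600E", "tumorType=Melanoma"]))  # 1.0
```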