biocypher · slobentanzer · Dec 13, 2024 · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024
diff --git a/benchmark/conftest.py b/benchmark/conftest.py
@@ -17,20 +17,24 @@
 from .load_dataset import get_benchmark_dataset
 
 # how often should each benchmark be run?
-N_ITERATIONS = 1
+N_ITERATIONS = 3
 
 # which dataset should be used for benchmarking?
 BENCHMARK_DATASET = get_benchmark_dataset()
 
 # which models should be benchmarked?
 OPENAI_MODEL_NAMES = [
-    # "gpt-3.5-turbo-0125",
-    # "gpt-4-0613",
-    # "gpt-4-0125-preview",
-    # "gpt-4-turbo-2024-04-09",
+    "gpt-3.5-turbo-0125",
+    "gpt-4-0613",
+    "gpt-4-1106-preview",
+    "gpt-4-0125-preview",
+    "gpt-4-turbo-2024-04-09",
     # "gpt-4o-2024-05-13",
     "gpt-4o-2024-08-06",
-    # "gpt-4o-mini-2024-07-18",
+    "gpt-4o-2024-11-20",
+    "gpt-4o-mini-2024-07-18",
+    # "o1-preview-2024-09-12",
+    # "o1-mini-2024-09-12",
 ]
 
 ANTHROPIC_MODEL_NAMES = [

diff --git a/benchmark/data/benchmark_api_calling_data.yaml b/benchmark/data/benchmark_api_calling_data.yaml
@@ -15,64 +15,76 @@
 # '.', etc., by adding two backslashes before them.
 
 api_calling:
-  - case: oncokb:braf:melanoma
-    input:
-      prompt:
-        exact_spelling: "What is the consequence of the V600E BRAF variant in Melanoma?"
-    expected:
-      parts_of_query:
-        [
-          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?",
-          "hugoSymbol=BRAF",
-          "alteration=V600E",
-          "tumorType=Melanoma",
-        ]
-  - case: oncokb:tp53:colon_adenocarcinoma
-    input:
-      prompt:
-        exact_spelling: "What is the consequence of the R273C TP53 variant in Colon Adenocarcinoma?"
-    expected:
-      parts_of_query:
-        [
-          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?",
-          "hugoSymbol=TP53",
-          "alteration=R273C",
-          "tumorType=Colon%20Adenocarcinoma",
-        ]
-  - case: oncokb:braf:histiocytosis
-    input:
-      prompt:
-        exact_spelling: "What is the consequence of the N486_P490del BRAF variant in Histiocytosis?"
-        descriptive_spelling: "What is the consequence of an N486_P490 deletion in BRAF in Histiocytosis?"
-    expected:
-      parts_of_query:
-        [
-          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?",
-          "hugoSymbol=BRAF",
-          "alteration=N486_P490del",
-          "tumorType=Histiocytosis",
-        ]
-  - case: oncokb:ros1:lung_adenocarcinoma
-    input:
-      prompt:
-        exact_spelling: "What is the consequence of the functional fusion of CD47 and ROS1 in Lung Adenocarcinoma?"
-    expected:
-      parts_of_query:
-        [
-          "https://demo.oncokb.org/api/v1/annotate/structuralVariants\\?",
-          "hugoSymbolA=CD74",
-          "hugoSymbolB=ROS1",
-          "structuralVariantType=FUSION",
-          "isFunctionalFusion=true",
-          "tumorType=Lung%20Adenocarcinoma",
-        ]
-  - case: biotools:topic:metabolomics
-    input:
-      prompt:
-        fuzzy_search: "Which tools can I use for metabolomics?"
-    expected:
-      parts_of_query:
-        ["https://bio.tools/api/t/", "\\?topic=", "[mM]etabolomics"]
+  # - case: oncokb:braf:melanoma
+  #   input:
+  #     prompt:
+  #       exact_spelling: "What is the consequence of the V600E BRAF variant in Melanoma?"
+  #   expected:
+  #     parts_of_query:
+  #       [
+  #         "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?",
+  #         "hugoSymbol=BRAF",
+  #         "alteration=V600E",
+  #         "tumorType=Melanoma",
+  #       ]
+  # - case: oncokb:tp53:colon_adenocarcinoma
+  #   input:
+  #     prompt:
+  #       exact_spelling: "What is the consequence of the R273C TP53 variant in Colon Adenocarcinoma?"
+  #   expected:
+  #     parts_of_query:
+  #       [
+  #         "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?",
+  #         "hugoSymbol=TP53",
+  #         "alteration=R273C",
+  #         "tumorType=Colon%20Adenocarcinoma",
+  #       ]
+  # - case: oncokb:braf:histiocytosis
+  #   input:
+  #     prompt:
+  #       exact_spelling: "What is the consequence of the N486_P490del BRAF variant in Histiocytosis?"
+  #       descriptive_spelling: "What is the consequence of an N486_P490 deletion in BRAF in Histiocytosis?"
+  #   expected:
+  #     parts_of_query:
+  #       [
+  #         "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange\\?",
+  #         "hugoSymbol=BRAF",
+  #         "alteration=N486_P490del",
+  #         "tumorType=Histiocytosis",
+  #       ]
+  # - case: oncokb:ros1:lung_adenocarcinoma
+  #   input:
+  #     prompt:
+  #       exact_spelling: "What is the consequence of the functional fusion of CD74 and ROS1 in Lung Adenocarcinoma?"
+  #   expected:
+  #     parts_of_query:
+  #       [
+  #         "https://demo.oncokb.org/api/v1/annotate/structuralVariants\\?",
+  #         "hugoSymbolA=CD74",
+  #         "hugoSymbolB=ROS1",
+  #         "structuralVariantType=FUSION",
+  #         "isFunctionalFusion=true",
+  #         "tumorType=Lung%20Adenocarcinoma",
+  #       ]
+  # - case: biotools:topic:metabolomics
+  #   input:
+  #     prompt:
+  #       fuzzy_search: "Which tools can I use for metabolomics?"
+  #   expected:
+  #     parts_of_query:
+  #       ["https://bio.tools/api/t/", "\\?topic=", "[mM]etabolomics"]
+  # - case: scanpy:tl:leiden
+  #   input:
+  #     prompt:
+  #       explicit_variable_names: "Perform Leiden clustering on the data with resolution 0.5."
+  #   expected:
+  #     parts_of_query: ["sc.tl.leiden\\(", "resolution=0.5", "\\)"]
+  # - case: scanpy:tl:umap
+  #   input:
+  #     prompt:
+  #       explicit_variable_names: "Calculate UMAP embedding with minimum distance 0.3 and spread 1.0."
+  #   expected:
+  #     parts_of_query: ["sc.tl.umap\\(", "min_dist=0.3", "spread=1.0", "\\)"]
   - case: scanpy:tl:leiden
     input:
       prompt:
@@ -160,6 +172,69 @@ api_calling:
   - case: anndata:read:h5ad
     input:
       prompt:
-        explicit_variable_names: "read test.h5ad into an anndata object."
+        explicit_variable_names: "Use AnnData to load the file test.h5ad into an AnnData object."
+        specific: "Load test.h5ad using AnnData."
+        abbreviation: "Read test.h5ad with AnnData."
+        general: "Open an H5AD file and load it as an AnnData object."
+        help_request: "How do I read test.h5ad into an AnnData object?"
     expected:
-      parts_of_query: ["anndata.read_h5ad\\(", "filename=test.h5ad", "\\)"]
+      parts_of_query: ["anndata.io.read_h5ad\\(", "filename=test.h5ad", "\\)"]
+  - case: anndata:read:csv
+    input:
+      prompt:
+        explicit_variable_names: "Use AnnData to load the file `test.csv` into an AnnData object."
+        specific: "Load test.csv using AnnData."
+        abbreviation: "Read test.csv with AnnData."
+        general: "Open a CSV file and load it as an AnnData object."
+        help_request: "How do I read test.csv into an AnnData object?"
+    expected:
+      parts_of_query: ["anndata.io.read_csv\\(", "filename=test.csv", "\\)"]
+  - case: anndata:concat:var
+    input:
+      prompt:
+        explicit_variable_names: "Concatenate adata1 and adata2 into a single AnnData object along the column axis with an inner join."
+        specific: "Join adata1 and adata2 by columns using AnnData with an inner join."
+        abbreviation: "Merge columns of adata1 and adata2 with AnnData."
+        general: "Combine two AnnData objects along the variable axis with an inner join."
+        help_request: "How do I concatenate adata1 and adata2 along columns?"
+    expected:
+      parts_of_query:
+        [
+          "anndata.concat\\(",
+          "\\[adata1, adata2\\]",
+          ", axis='var', join='inner'",
+          "\\)",
+        ]
+  - case: anndata:concat:obs
+    input:
+      prompt:
+        explicit_variable_names: "Concatenate adata1 and adata2 into a single AnnData object along the row axis with an outer join."
+        specific: "Join adata1 and adata2 by rows using AnnData with an outer join."
+        abbreviation: "Merge rows of adata1 and adata2 with AnnData."
+        general: "Combine two AnnData objects along the observation axis with an outer join."
+        help_request: "How do I concatenate adata1 and adata2 along rows?"
+    expected:
+      parts_of_query:
+        [
+          "anndata.concat\\(",
+          "\\[adata1, adata2\\]",
+          ", axis='obs', join='outer'",
+          "\\)",
+        ]
+  - case: anndata:map
+    input:
+      prompt:
+        explicit_variable_names: "Replace the values in the cell_type column of the obs attribute in adata. Replace type1 with new_type1, type2 with new_type2, and type3 with new_type3."
+        help_request: "How do I remap cell_type values to replace type1 with new_type1, type2 with new_type2, and type3 with new_type3?"
+    expected:
+      parts_of_query:
+        [
+          "adata.obs",
+          "\\[\"cell_type\"\\]",
+          "\\.map\\(",
+          "\\{\\s*\"type1\": \"new_type1\"",
+          "\\s*\"type2\": \"new_type2\"",
+          "\\s*\"type3\": \"new_type3\"",
+          "\\s*\\}\\)",
+          "\\.copy\\(\\)",
+        ]
diff --git a/benchmark/results/processed/api_calling_benchmark_summary.csv b/benchmark/results/processed/api_calling_benchmark_summary.csv
@@ -0,0 +1,3 @@
+benchmark_type,mean,std,min,max,count
+Python API,0.474,0.293,0.0,0.8,367
+Python API (Reduced),0.753,0.046,0.6,0.8,191
diff --git a/benchmark/results/processed/api_calling_summary.csv b/benchmark/results/processed/api_calling_summary.csv
@@ -0,0 +1,17 @@
+benchmark_type,model_name,mean,std,min,max,count
+Python API,gpt-3.5-turbo-0125,0.609,0.197,0.25,0.8,46
+Python API,gpt-4-0125-preview,0.508,0.282,0.0,0.8,46
+Python API,gpt-4-0613,0.487,0.282,0.0,0.8,46
+Python API,gpt-4-1106-preview,0.61,0.219,0.0,0.8,45
+Python API,gpt-4-turbo-2024-04-09,0.27,0.293,0.0,0.8,46
+Python API,gpt-4o-2024-08-06,0.363,0.308,0.0,0.8,46
+Python API,gpt-4o-2024-11-20,0.381,0.318,0.0,0.8,46
+Python API,gpt-4o-mini-2024-07-18,0.572,0.239,0.0,0.8,46
+Python API (Reduced),gpt-3.5-turbo-0125,0.758,0.041,0.6,0.8,24
+Python API (Reduced),gpt-4-0125-preview,0.767,0.024,0.75,0.8,24
+Python API (Reduced),gpt-4-0613,0.767,0.024,0.75,0.8,24
+Python API (Reduced),gpt-4-1106-preview,0.765,0.024,0.75,0.8,23
+Python API (Reduced),gpt-4-turbo-2024-04-09,0.761,0.031,0.667,0.8,24
+Python API (Reduced),gpt-4o-2024-08-06,0.742,0.058,0.6,0.8,24
+Python API (Reduced),gpt-4o-2024-11-20,0.733,0.064,0.6,0.8,24
+Python API (Reduced),gpt-4o-mini-2024-07-18,0.733,0.064,0.6,0.8,24
diff --git a/benchmark/results/processed/api_calling_task_summary.csv b/benchmark/results/processed/api_calling_task_summary.csv
@@ -0,0 +1,9 @@
+benchmark_type,task_type,mean,std,min,max,count
+Python API,abbreviation,0.5,0.295,0.0,0.8,80
+Python API,general,0.476,0.316,0.0,0.8,79
+Python API,help_request,0.388,0.296,0.0,0.8,88
+Python API,specific,0.52,0.261,0.0,0.8,120
+Python API (Reduced),abbreviation,0.753,0.046,0.6,0.8,47
+Python API (Reduced),general,0.758,0.04,0.6,0.8,48
+Python API (Reduced),help_request,0.754,0.046,0.6,0.8,48
+Python API (Reduced),specific,0.747,0.051,0.6,0.8,48
diff --git a/benchmark/results/processed/overview-model.csv b/benchmark/results/processed/overview-model.csv
@@ -1,18 +1,20 @@
 Model name,Size,Median Accuracy,SD
-gpt-3.5-turbo-0125,175,0.79,0.2
-gpt-4o-2024-08-06,Unknown,0.78,0.24
 claude-3-opus-20240229,Unknown,0.77,0.28
+gpt-3.5-turbo-0125,175,0.77,0.19
 gpt-3.5-turbo-0613,175,0.76,0.21
 claude-3-5-sonnet-20240620,Unknown,0.76,0.28
 llama-3.1-instruct,70,0.73,0.29
 gpt-4-0613,Unknown,0.73,0.17
+gpt-4o-2024-08-06,Unknown,0.73,0.24
 llama-3.1-instruct,8,0.72,0.28
-gpt-4-turbo-2024-04-09,Unknown,0.71,0.26
-gpt-4o-mini-2024-07-18,Unknown,0.69,0.23
-gpt-4-0125-preview,Unknown,0.69,0.27
+gpt-4o-mini-2024-07-18,Unknown,0.69,0.22
+gpt-4-0125-preview,Unknown,0.69,0.26
+gpt-4-1106-preview,Unknown,0.69,0.06
 gpt-4o-2024-05-13,Unknown,0.68,0.31
+gpt-4-turbo-2024-04-09,Unknown,0.68,0.27
 llama-3-instruct,8,0.65,0.36
 openhermes-2.5,7,0.6,0.3
+gpt-4o-2024-11-20,Unknown,0.55,0.14
 chatglm3,6,0.44,0.26
 llama-2-chat,70,0.42,0.34
 code-llama-instruct,7,0.4,0.35

diff --git a/benchmark/results/processed/overview-quantisation.csv b/benchmark/results/processed/overview-quantisation.csv
@@ -1,21 +1,22 @@
 Model name,Size,Version,Quantisation,Median Accuracy,SD
-gpt-3.5-turbo-0125,175,,,0.79,0.2
-gpt-4o-2024-08-06,Unknown,,,0.78,0.24
 claude-3-opus-20240229,Unknown,,,0.77,0.28
+gpt-3.5-turbo-0125,175,,,0.77,0.19
 claude-3-5-sonnet-20240620,Unknown,,,0.76,0.28
 gpt-3.5-turbo-0613,175,,,0.76,0.21
 llama-3.1-instruct,8,ggufv2,Q6_K,0.74,0.28
 llama-3.1-instruct,70,ggufv2,IQ2_M,0.74,0.29
 llama-3.1-instruct,8,ggufv2,Q5_K_M,0.74,0.28
 gpt-4-0613,Unknown,,,0.73,0.17
 llama-3.1-instruct,70,ggufv2,IQ4_XS,0.73,0.29
+gpt-4o-2024-08-06,Unknown,,,0.73,0.24
 llama-3.1-instruct,8,ggufv2,Q8_0,0.72,0.3
-gpt-4-turbo-2024-04-09,Unknown,,,0.71,0.26
 llama-3.1-instruct,8,ggufv2,Q3_K_L,0.71,0.28
 llama-3.1-instruct,8,ggufv2,Q4_K_M,0.7,0.26
 llama-3.1-instruct,8,ggufv2,IQ4_XS,0.69,0.28
-gpt-4-0125-preview,Unknown,,,0.69,0.27
-gpt-4o-mini-2024-07-18,Unknown,,,0.69,0.23
+gpt-4-1106-preview,Unknown,,,0.69,0.06
+gpt-4-0125-preview,Unknown,,,0.69,0.26
+gpt-4o-mini-2024-07-18,Unknown,,,0.69,0.22
+gpt-4-turbo-2024-04-09,Unknown,,,0.68,0.27
 gpt-4o-2024-05-13,Unknown,,,0.68,0.31
 llama-3.1-instruct,70,ggufv2,Q3_K_S,0.67,0.28
 llama-3-instruct,8,ggufv2,Q8_0,0.65,0.35
@@ -27,6 +28,7 @@ openhermes-2.5,7,ggufv2,Q5_K_M,0.6,0.29
 openhermes-2.5,7,ggufv2,Q8_0,0.6,0.3
 openhermes-2.5,7,ggufv2,Q4_K_M,0.6,0.3
 openhermes-2.5,7,ggufv2,Q3_K_M,0.56,0.3
+gpt-4o-2024-11-20,Unknown,,,0.55,0.14
 code-llama-instruct,34,ggufv2,Q2_K,0.5,0.33
 openhermes-2.5,7,ggufv2,Q2_K,0.5,0.28
 code-llama-instruct,7,ggufv2,Q3_K_M,0.49,0.31