feat: add Benchmark-VHP (#207)
This PR introduces a new benchmark context, as well as some behavioural changes to the prompt generator module. As such, the patch version will be increased to reflect the change.

* add safety KG and two questions

* convert node names to PascalCase internally (see the sketch after this list)

* change n

* add complex case

* make sure that selected entities are PascalCase

* create plot only for the safety questions

* run a limited set of tests for the hackathon

* refactor: split data file into multiple files
any benchmark file should end in _data.yaml

* comment

* run iteration of benchmark, including new questions and new OpenAI model

* add gpt-4o-2024-08-06 to docs hook

* benchmark results
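
For illustration, a minimal sketch of the PascalCase conversion mentioned in the node-name commit above; the helper name and the regex-based word splitting are assumptions, not necessarily the prompt generator's actual implementation:

```python
import re

def to_pascal_case(name: str) -> str:
    # Split on whitespace, underscores, hyphens, and dots, then capitalize
    # each word; "chemical entity" and "chemical_entity" both become
    # "ChemicalEntity".
    words = re.split(r"[\s_\-.]+", name.strip())
    return "".join(word.capitalize() for word in words if word)

assert to_pascal_case("chemical entity") == "ChemicalEntity"
assert to_pascal_case("side_effect") == "SideEffect"
```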
slobentanzer authored Oct 25, 2024
1 parent aee6a75 commit ef6a4bd
Showing 77 changed files with 2,537 additions and 1,103 deletions.
169 changes: 85 additions & 84 deletions benchmark/conftest.py
@@ -24,12 +24,13 @@
 
 # which models should be benchmarked?
 OPENAI_MODEL_NAMES = [
-    # "gpt-3.5-turbo-0125",
-    # "gpt-4-0613",
-    # "gpt-4-0125-preview",
-    # "gpt-4-turbo-2024-04-09",
-    # "gpt-4o-2024-05-13",
-    # "gpt-4o-mini-2024-07-18",
+    "gpt-3.5-turbo-0125",
+    "gpt-4-0613",
+    "gpt-4-0125-preview",
+    "gpt-4-turbo-2024-04-09",
+    "gpt-4o-2024-05-13",
+    "gpt-4o-2024-08-06",
+    "gpt-4o-mini-2024-07-18",
 ]
 
 ANTHROPIC_MODEL_NAMES = [
Expand All @@ -40,8 +41,8 @@
XINFERENCE_MODELS = {
# "code-llama-instruct": {
# "model_size_in_billions": [
# 7,
# 13,
# # 7,
# # 13,
# 34,
# ],
# "model_format": "ggufv2",
@@ -127,28 +128,28 @@
     #         # "FP16",
     #     ],
     # },
-    # "llama-2-chat": {
-    #     "model_size_in_billions": [
-    #         7,
-    #         13,
-    #         # 70,
-    #     ],
-    #     "model_format": "ggufv2",
-    #     "quantization": [
-    #         "Q2_K",
-    #         # "Q3_K_S",
-    #         "Q3_K_M",
-    #         # "Q3_K_L",
-    #         # "Q4_0",
-    #         # "Q4_K_S",
-    #         "Q4_K_M",
-    #         # "Q5_0",
-    #         # "Q5_K_S",
-    #         "Q5_K_M",
-    #         "Q6_K",
-    #         "Q8_0",
-    #     ],
-    # },
+    "llama-2-chat": {
+        "model_size_in_billions": [
+            7,
+            # 13,
+            # 70,
+        ],
+        "model_format": "ggufv2",
+        "quantization": [
+            "Q2_K",
+            # "Q3_K_S",
+            "Q3_K_M",
+            # "Q3_K_L",
+            # "Q4_0",
+            # "Q4_K_S",
+            "Q4_K_M",
+            # "Q5_0",
+            # "Q5_K_S",
+            "Q5_K_M",
+            "Q6_K",
+            "Q8_0",
+        ],
+    },
     # "llama-3-instruct": {
     #     "model_size_in_billions": [
     #         8,
@@ -168,31 +169,31 @@
     #         # "Q4_K_M",
     #     ],
     # },
-    # "llama-3.1-instruct": {
-    #     "model_size_in_billions": [
-    #         8,
-    #         # 70,
-    #     ],
-    #     "model_format": "ggufv2",
-    #     "quantization": [
-    #         # 8B model quantisations
-    #         # "Q3_K_L",
-    #         "IQ4_XS",
-    #         # "Q4_K_M",
-    #         # "Q5_K_M",
-    #         # "Q6_K",
-    #         # "Q8_0",
-    #         # 70B model quantisations
-    #         # "IQ2_M",
-    #         # "Q2_K",
-    #         # "Q3_K_S",
-    #         # "IQ4_XS",
-    #         # "Q4_K_M", # crazy slow on mbp m3 max
-    #         # "Q5_K_M",
-    #         # "Q6_K",
-    #         # "Q8_0",
-    #     ],
-    # },
+    "llama-3.1-instruct": {
+        "model_size_in_billions": [
+            8,
+            # 70,
+        ],
+        "model_format": "ggufv2",
+        "quantization": [
+            # 8B model quantisations
+            "Q3_K_L",
+            "IQ4_XS",
+            "Q4_K_M",
+            # "Q5_K_M",
+            # "Q6_K",
+            "Q8_0",
+            # 70B model quantisations
+            # "IQ2_M",
+            # "Q2_K",
+            # "Q3_K_S",
+            # "IQ4_XS",
+            # "Q4_K_M", # crazy slow on mbp m3 max
+            # "Q5_K_M",
+            # "Q6_K",
+            # "Q8_0",
+        ],
+    },
     # "mistral-instruct-v0.2": {
     #     "model_size_in_billions": [
     #         7,
@@ -238,26 +239,26 @@
     #         "none",
     #     ],
     # },
-    # "openhermes-2.5": {
-    #     "model_size_in_billions": [
-    #         7,
-    #     ],
-    #     "model_format": "ggufv2",
-    #     "quantization": [
-    #         "Q2_K",
-    #         # "Q3_K_S",
-    #         "Q3_K_M",
-    #         # "Q3_K_L",
-    #         # "Q4_0",
-    #         # "Q4_K_S",
-    #         "Q4_K_M",
-    #         # "Q5_0",
-    #         # "Q5_K_S",
-    #         "Q5_K_M",
-    #         "Q6_K",
-    #         "Q8_0",
-    #     ],
-    # },
+    "openhermes-2.5": {
+        "model_size_in_billions": [
+            7,
+        ],
+        "model_format": "ggufv2",
+        "quantization": [
+            "Q2_K",
+            # "Q3_K_S",
+            "Q3_K_M",
+            # "Q3_K_L",
+            # "Q4_0",
+            # "Q4_K_S",
+            "Q4_K_M",
+            # "Q5_0",
+            # "Q5_K_S",
+            "Q5_K_M",
+            "Q6_K",
+            "Q8_0",
+        ],
+    },
 }
 
 # create concrete benchmark list by concatenating all combinations of model
@@ -546,38 +547,38 @@ def pytest_generate_tests(metafunc):
     Called once for each test case in the benchmark test collection.
     If fixture is part of test declaration, the test is parametrized.
     """
-    # Load the data file
-    data_file = BENCHMARK_DATASET["benchmark_data.yaml"]
+    # Load the data
+    data = BENCHMARK_DATASET
 
     # Parametrize the fixtures with the collected rows
     if "test_data_biocypher_query_generation" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_biocypher_query_generation",
-            data_file["biocypher_query_generation"],
+            data["biocypher_query_generation"],
         )
     if "test_data_rag_interpretation" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_rag_interpretation",
-            data_file["rag_interpretation"],
+            data["rag_interpretation"],
         )
    if "test_data_text_extraction" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_text_extraction",
-            data_file["text_extraction"],
+            data["text_extraction"],
         )
     if "test_data_api_calling" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_api_calling",
-            data_file["api_calling"],
+            data["api_calling"],
         )
     if "test_data_medical_exam" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_medical_exam",
-            data_file["medical_exam"],
+            data["medical_exam"],
         )
 
 
 @pytest.fixture
 def kg_schemas():
-    data_file = BENCHMARK_DATASET["benchmark_data.yaml"]
-    return data_file["kg_schemas"]
+    data = BENCHMARK_DATASET
+    return data["kg_schemas"]
62 changes: 62 additions & 0 deletions benchmark/data/benchmark_api_calling_data.yaml
@@ -0,0 +1,62 @@
+# Top-level keys: benchmark modules
+# Values: list of dictionaries, each containing a test case
+#
+# Test case keys:
+# - input (for creating the test)
+# - expected (for asserting outcomes and generating a score)
+# - case (for categorizing the test case)
+#
+# If any input is a dictionary itself, it will be expanded into separate test
+# cases, using the top-level key to create a concatenated test case purpose.
+
+api_calling:
+  - case: oncokb:braf:melanoma
+    input:
+      prompt:
+        exact_spelling: "What is the consequence of the V600E BRAF variant in Melanoma?"
+    expected:
+      parts_of_query:
+        [
+          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?",
+          "hugoSymbol=BRAF",
+          "alteration=V600E",
+          "tumorType=Melanoma",
+        ]
+  - case: oncokb:tp53:colon_adenocarcinoma
+    input:
+      prompt:
+        exact_spelling: "What is the consequence of the R273C TP53 variant in Colon Adenocarcinoma?"
+    expected:
+      parts_of_query:
+        [
+          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?hugoSymbol=TP53",
+          "alteration=R273C",
+          "tumorType=Colon%20Adenocarcinoma",
+        ]
+  - case: oncokb:braf:histiocytosis
+    input:
+      prompt:
+        exact_spelling: "What is the consequence of the N486_P490del BRAF variant in Histiocytosis?"
+        descriptive_spelling: "What is the consequence of an N486_P490 deletion in BRAF in Histiocytosis?"
+    expected:
+      parts_of_query:
+        [
+          "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?",
+          "hugoSymbol=BRAF",
+          "alteration=N486_P490del",
+          "tumorType=Histiocytosis",
+        ]
+  - case: oncokb:ros1:lung_adenocarcinoma
+    input:
+      prompt:
+        exact_spelling: "What is the consequence of the functional fusion of CD74 and ROS1 in Lung Adenocarcinoma?"
+    expected:
+      parts_of_query:
+        [
+          "https://demo.oncokb.org/api/v1/annotate/structuralVariants?",
+          "hugoSymbolA=CD74",
+          "hugoSymbolB=ROS1",
+          "structuralVariantType=FUSION",
+          "isFunctionalFusion=true",
+          "tumorType=Lung%20Adenocarcinoma",
+        ]
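
The header comment of this file describes how dictionary-valued inputs (such as the exact_spelling/descriptive_spelling prompts above) are expanded into separate test cases, and parts_of_query suggests fragment-based scoring of the generated API call. A hedged sketch of both mechanisms; the function names and the score definition are assumptions, not the repository's actual code:

```python
def expand_prompts(case: str, prompts: dict[str, str]) -> list[tuple[str, str]]:
    # One test case per prompt variant; the dictionary key is concatenated
    # onto the case name to form the test purpose, e.g.
    # "oncokb:braf:histiocytosis:descriptive_spelling".
    return [(f"{case}:{key}", prompt) for key, prompt in prompts.items()]

def score_query(built_query: str, parts_of_query: list[str]) -> float:
    # Fraction of expected fragments found in the model-built API query.
    return sum(part in built_query for part in parts_of_query) / len(parts_of_query)

query = (
    "https://demo.oncokb.org/api/v1/annotate/mutations/byProteinChange?"
    "hugoSymbol=BRAF&alteration=V600E&tumorType=Melanoma"
)
print(score_query(query, ["hugoSymbol=BRAF", "alteration=V600E", "tumorType=Melanoma"]))  # 1.0
```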