Merge pull request #111 from biocypher/benchmark-yaml-hash
Benchmark-yaml-hash
slobentanzer authored Feb 6, 2024
2 parents 480a285 + 0b9a2a8 commit 48432dc
Showing 32 changed files with 2,174 additions and 1,199 deletions.
67 changes: 46 additions & 21 deletions benchmark/benchmark_utils.py
@@ -6,45 +6,62 @@
 def benchmark_already_executed(
     model_name: str,
     task: str,
-    subtask: str,
+    md5_hash: str,
 ) -> bool:
     """
-    Checks if the benchmark task and subtask test case for the model_name have already
-    been executed.
+    Checks if the benchmark task and subtask test case for the model_name have
+    already been executed.
     Args:
-        task (str): The benchmark task, e.g. "biocypher_query_generation"
-        subtask (str): The benchmark subtask test case, e.g. "0_entities"
         model_name (str): The model name, e.g. "gpt-3.5-turbo"
+        task (str): The benchmark task, e.g. "biocypher_query_generation"
+        md5_hash (str): The md5 hash of the test case, e.g.,
+            "72434e7a340a3f6dd047b944988491b7". It is created from the
+            dictionary representation of the test case.
     Returns:
-        bool: True if the benchmark task and subtask for the model_name has
-            already been run, False otherwise
+        bool: True if the benchmark case for the model_name has already been
+            run, False otherwise
     """
     task_results = return_or_create_result_file(task)
-    task_results_subset = (task_results["model_name"] == model_name) & (
-        task_results["subtask"] == subtask
+
+    if task_results.empty:
+        return False
+
+    run = (
+        task_results[
+            (task_results["model_name"] == model_name)
+            & (task_results["md5_hash"] == md5_hash)
+        ].shape[0]
+        > 0
     )
-    return task_results_subset.any()
+
+    return run


 def skip_if_already_run(
     model_name: str,
     task: str,
-    subtask: str,
+    md5_hash: str,
 ) -> None:
     """Helper function to check if the test case is already executed.
     Args:
         model_name (str): The model name, e.g. "gpt-3.5-turbo"
         result_files (dict[str, pd.DataFrame]): The result files
         task (str): The benchmark task, e.g. "biocypher_query_generation"
-        subtask (str): The benchmark subtask test case, e.g. "0_single_word"
+        md5_hash (str): The md5 hash of the test case, e.g.,
+            "72434e7a340a3f6dd047b944988491b7". It is created from the
+            dictionary representation of the test case.
     """
-    if benchmark_already_executed(model_name, task, subtask):
+    if benchmark_already_executed(model_name, task, md5_hash):
         pytest.skip(
-            f"benchmark {task}: {subtask} with {model_name} already executed"
+            f"Benchmark for {task} with hash {md5_hash} with {model_name} already executed"
         )
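
The docstrings above state that the md5 hash is created from the dictionary representation of the test case; the hashing helper itself is not shown in this diff. A minimal sketch of how such a hash could be computed, assuming json.dumps with sorted keys for a deterministic serialisation (the function name is invented, not code from this commit):

import hashlib
import json


def hash_test_case(test_case: dict) -> str:
    # Hypothetical helper: serialise the test-case dictionary deterministically
    # (sorted keys), then take the md5 digest of the resulting string.
    serialised = json.dumps(test_case, sort_keys=True, default=str)
    return hashlib.md5(serialised.encode("utf-8")).hexdigest()


# The resulting hex digest would be passed as `md5_hash` to
# benchmark_already_executed / skip_if_already_run.
print(hash_test_case({"case": "entities", "input": "..."}))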


@@ -65,26 +82,34 @@ def return_or_create_result_file(
         results = pd.read_csv(file_path, header=0)
     except (pd.errors.EmptyDataError, FileNotFoundError):
         results = pd.DataFrame(
-            columns=["model_name", "subtask", "score", "iterations"]
+            columns=["model_name", "subtask", "score", "iterations", "md5_hash"]
         )
         results.to_csv(file_path, index=False)
     return results


 def write_results_to_file(
-    model_name: str, subtask: str, score: str, iterations: str, file_path: str
+    model_name: str,
+    subtask: str,
+    score: str,
+    iterations: str,
+    md5_hash: str,
+    file_path: str,
 ):
     """Writes the benchmark results for the subtask to the result file.
     Args:
         model_name (str): The model name, e.g. "gpt-3.5-turbo"
-        subtask (str): The benchmark subtask test case, e.g. "entities_0"
-        score (str): The benchmark score, e.g. "1/1"
-        iterations (str): The number of iterations, e.g. "1"
+        subtask (str): The benchmark subtask test case, e.g. "entities"
+        score (str): The benchmark score, e.g. "5"
+        iterations (str): The number of iterations, e.g. "7"
+        md5_hash (str): The md5 hash of the test case
         file_path (str): The path to the result file
     """
     results = pd.read_csv(file_path, header=0)
     new_row = pd.DataFrame(
-        [[model_name, subtask, score, iterations]], columns=results.columns
+        [[model_name, subtask, score, iterations, md5_hash]],
+        columns=results.columns,
     )
     results = pd.concat([results, new_row], ignore_index=True).sort_values(
         by=["model_name", "subtask"]
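
A short usage sketch of the updated write_results_to_file signature; the argument values reuse the examples from the docstring above, while the file path is invented for illustration:

write_results_to_file(
    model_name="gpt-3.5-turbo",
    subtask="entities",
    score="5",
    iterations="7",
    md5_hash="72434e7a340a3f6dd047b944988491b7",
    file_path="benchmark/results/biocypher_query_generation.csv",  # invented path
)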
90 changes: 45 additions & 45 deletions benchmark/conftest.py
@@ -30,22 +30,20 @@
             13,
             70,
         ],
-        "model_format": "ggmlv3",
+        "model_format": "ggufv2",
         "quantization": [
-            "q2_K",
-            # "q3_K_L",
-            "q3_K_M",
-            # "q3_K_S",
-            "q4_0",
-            "q4_1",
-            "q4_K_M",
-            "q4_K_S",
-            "q5_0",
-            # "q5_1",
-            "q5_K_M",
-            # "q5_K_S",
-            "q6_K",
-            "q8_0",
+            "Q2_K",
+            # "Q3_K_S",
+            "Q3_K_M",
+            # "Q3_K_L",
+            # "Q4_0",
+            # "Q4_K_S",
+            "Q4_K_M",
+            # "Q5_0",
+            # "Q5_K_S",
+            "Q5_K_M",
+            "Q6_K",
+            "Q8_0",
         ],
     },
     "mixtral-instruct-v0.1": {
@@ -55,12 +53,12 @@
"model_format": "ggufv2",
"quantization": [
"Q2_K",
# "Q3_K_M",
"Q4_0",
"Q3_K_M",
# "Q4_0",
"Q4_K_M",
"Q5_0",
# "Q5_K_M",
# "Q6_K",
"Q6_K",
"Q8_0",
],
},
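
For orientation, the entries above parametrise which quantised builds of each open-source model are benchmarked. A small sketch of how such entries could be expanded into model/size/quantization combinations; the dictionary name, the "llama-2-chat" key, and the "model_size_in_billions" key are assumptions inferred from the visible fragment, not code from this commit:

from itertools import product

# Assumed layout, reconstructed from the keys visible in this diff.
MODELS = {
    "llama-2-chat": {
        "model_size_in_billions": [7, 13, 70],
        "model_format": "ggufv2",
        "quantization": ["Q2_K", "Q3_K_M", "Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0"],
    },
}

combinations = [
    (name, size, quant)
    for name, config in MODELS.items()
    for size, quant in product(
        config["model_size_in_billions"], config["quantization"]
    )
]
print(len(combinations))  # 3 sizes x 6 quantizations = 18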
@@ -83,6 +81,21 @@
 BENCHMARK_URL = "http://localhost:9997"


+def pytest_collection_modifyitems(items):
+    """
+    Pytest hook function to modify the collected test items.
+    Called once after collection has been performed.
+    Used here to order items by their `callspec.id` (which starts with the
+    model name and configuration) to ensure running all tests for one model
+    before moving to the next model.
+    """
+
+    items.sort(
+        key=lambda item: (item.callspec.id if hasattr(item, "callspec") else "")
+    )
+
+
 # parameterise tests to run for each model
 @pytest.fixture(params=BENCHMARKED_MODELS)
 def model_name(request):
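
As a rough illustration of the ordering the hook above produces, sorting parametrised test items by their id groups all cases of one model together. The ids below are invented; real ids are generated by pytest from the fixture parameters and start with the model name:

ids = [
    "gpt-4-0_entities",
    "gpt-3.5-turbo-1_relationships",
    "gpt-3.5-turbo-0_entities",
]
print(sorted(ids))
# ['gpt-3.5-turbo-0_entities', 'gpt-3.5-turbo-1_relationships', 'gpt-4-0_entities']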
@@ -249,25 +262,27 @@ def result_files():
         if file.endswith(".csv")
     ]
     result_files = {}
+    result_columns = [
+        "model_name",
+        "subtask",
+        "score",
+        "iterations",
+        "md5_hash",
+    ]
     for file in RESULT_FILES:
         try:
             result_file = pd.read_csv(file, header=0)
         except (pd.errors.EmptyDataError, FileNotFoundError):
             result_file = pd.DataFrame(
-                columns=["model_name", "subtask", "score", "iterations"]
+                columns=result_columns,
             )
             result_file.to_csv(file, index=False)

         if not np.array_equal(
             result_file.columns,
-            ["model_name", "subtask", "score", "iterations"],
+            result_columns,
         ):
-            result_file.columns = [
-                "model_name",
-                "subtask",
-                "score",
-                "iterations",
-            ]
+            result_file.columns = result_columns

         result_files[file] = result_file

@@ -281,36 +296,21 @@ def pytest_generate_tests(metafunc):
     If fixture is part of test declaration, the test is parametrized.
     """
     # Load the data file
-    data_file = BENCHMARK_DATASET["./data/benchmark_data.csv"]
-    data_file["index"] = data_file.index
-
-    # Initialize a dictionary to collect rows for each test type
-    test_rows = {
-        "biocypher_query_generation": [],
-        "rag_interpretation": [],
-        "text_extraction": [],
-    }
-
-    # Iterate over each row in the DataFrame
-    for index, row in data_file.iterrows():
-        test_type = row["test_type"]
-        if test_type in test_rows:
-            # Add the row to the list for this test type
-            test_rows[test_type].append(row)
+    data_file = BENCHMARK_DATASET["./data/benchmark_data.yaml"]

     # Parametrize the fixtures with the collected rows
     if "test_data_biocypher_query_generation" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_biocypher_query_generation",
-            test_rows["biocypher_query_generation"],
+            data_file["biocypher_query_generation"],
         )
     if "test_data_rag_interpretation" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_rag_interpretation",
-            test_rows["rag_interpretation"],
+            data_file["rag_interpretation"],
         )
     if "test_data_text_extraction" in metafunc.fixturenames:
         metafunc.parametrize(
             "test_data_text_extraction",
-            test_rows["text_extraction"],
+            data_file["text_extraction"],
         )
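
The layout of the new benchmark_data.yaml is not included in this excerpt. A minimal sketch of a structure that would satisfy the lookups in pytest_generate_tests above, with one top-level key per test type mapping to a list of test cases, loaded with PyYAML; the per-case field names and contents are invented:

import yaml  # PyYAML

example_yaml = """
biocypher_query_generation:
  - case: entities
    input: "Which genes are associated with cystic fibrosis?"
rag_interpretation:
  - case: explicit_relevance_single_fragment
    input: "..."
text_extraction:
  - case: source_data
    input: "..."
"""

data_file = yaml.safe_load(example_yaml)
# Mirrors the indexing used above, e.g. data_file["biocypher_query_generation"].
print(len(data_file["biocypher_query_generation"]))  # -> 1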
8 changes: 0 additions & 8 deletions benchmark/data/benchmark_data.csv

This file was deleted.
