
Commit

Merge pull request #8 from mehizli/develop
Current state
ytehran authored May 22, 2024
2 parents 1a23b4d + 83b4905 commit ad7a0da
Showing 10 changed files with 1,742 additions and 13 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@ dist/
__pycache__/
.venv
.pytest_cache
venv/.env
.env
*.mp3
.cache
25 changes: 18 additions & 7 deletions benchmark/conftest.py
@@ -1,29 +1,28 @@
import os

import requests
from dotenv import load_dotenv
from xinference.client import Client
import pytest

import numpy as np
import pandas as pd

from biochatter.prompts import BioCypherPromptEngine
-from benchmark.load_dataset import get_benchmark_dataset
+from .load_dataset import get_benchmark_dataset
from biochatter.llm_connect import GptConversation, XinferenceConversation
from .benchmark_utils import benchmark_already_executed

# how often should each benchmark be run?
-N_ITERATIONS = 5
+N_ITERATIONS = 1

# which dataset should be used for benchmarking?
BENCHMARK_DATASET = get_benchmark_dataset()

# which models should be benchmarked?
OPENAI_MODEL_NAMES = [
-"gpt-3.5-turbo-0613",
-"gpt-3.5-turbo-0125",
-"gpt-4-0613",
-"gpt-4-0125-preview",
+"gpt-3.5-turbo-0125"
+#"gpt-4-0613"
]

XINFERENCE_MODELS = {
@@ -148,7 +147,7 @@
for quantization in XINFERENCE_MODELS[model_name]["quantization"]
]

-BENCHMARKED_MODELS = OPENAI_MODEL_NAMES + XINFERENCE_MODEL_NAMES
+BENCHMARKED_MODELS = OPENAI_MODEL_NAMES #+ XINFERENCE_MODEL_NAMES
BENCHMARKED_MODELS.sort()

# Xinference IP and port
@@ -233,6 +232,9 @@ def conversation(request, model_name):
prompts={},
correct=False,
)
# delete first dots if venv is in project env
cus_path = os.getcwd() + "../../venv/bin/.env"
load_dotenv(cus_path)
conversation.set_api_key(
os.getenv("OPENAI_API_KEY"), user="benchmark_user"
)
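This hunk, and the evaluation_conversation hunk below, build the path to the .env file by plain string concatenation (os.getcwd() + "../../venv/bin/.env"), which leaves no separator between the working directory and the first "..". A minimal, separator-safe sketch of the same lookup with pathlib; the candidate locations are assumptions about the local venv layout, not part of the commit:

```python
# Minimal sketch, not the committed code: locate the .env file with pathlib
# so no path separator can be lost, then read the key from the environment.
import os
from pathlib import Path

from dotenv import load_dotenv


def load_openai_key() -> str | None:
    # candidate .env locations; adjust to your own venv layout (assumed)
    candidates = [
        Path.cwd() / ".env",
        Path.cwd().parent / "venv" / "bin" / ".env",
    ]
    for env_file in candidates:
        if env_file.is_file():
            load_dotenv(env_file)  # populates os.environ from the first match
            break
    return os.getenv("OPENAI_API_KEY")
```

The fixtures could then call conversation.set_api_key(load_openai_key(), user="benchmark_user") regardless of where the virtual environment lives.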
@@ -304,6 +306,9 @@ def evaluation_conversation():
prompts={},
correct=False,
)
# delete first dots if venv is in project env
cus_path = os.getcwd() + "../../venv/bin/.env"
load_dotenv(cus_path)
conversation.set_api_key(os.getenv("OPENAI_API_KEY"), user="benchmark_user")
return conversation

@@ -396,6 +401,12 @@ def pytest_generate_tests(metafunc):
"test_data_text_extraction",
data_file["text_extraction"],
)
if "test_data_correctness" in metafunc.fixturenames:
metafunc.parametrize(
"test_data_correctness",
data_file["correctness"],
)



@pytest.fixture
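For context, the new test_data_correctness branch relies on pytest's pytest_generate_tests hook: any argument name found in metafunc.fixturenames can be parametrised directly from data, so every correctness entry in the dataset becomes its own test invocation. A toy, self-contained illustration of the mechanism (the data here is invented):

```python
# Toy illustration of the pytest_generate_tests mechanism used above
# (invented data; the real cases live in benchmark_data.yaml).
CASES = [
    {"case": "simple_fact", "expected": {"answer": "yes"}},
    {"case": "regex_fact", "expected": {"words_in_response": [["braf"]]}},
]


def pytest_generate_tests(metafunc):
    if "test_data_correctness" in metafunc.fixturenames:
        # one test invocation per case dictionary
        metafunc.parametrize("test_data_correctness", CASES)


def test_correctness_of_answers(test_data_correctness):
    assert "case" in test_data_correctness
```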
1,625 changes: 1,625 additions & 0 deletions benchmark/data/benchmark_data.yaml

Large diffs are not rendered by default.
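The 1,625 added lines of benchmark/data/benchmark_data.yaml are not shown. Judging from how the new test_correctness_medical_usecase.py further down reads the data, each correctness case carries a name, an md5 hash, system messages, a prompt, and either an exact expected answer or lists of acceptable words. A hypothetical entry with that shape, as it would appear after yaml.safe_load (every value invented):

```python
# Hypothetical correctness entry, structure inferred from the test below.
example_case = {
    "case": "medical_usecase_regex_example",
    "hash": "0123456789abcdef0123456789abcdef",  # placeholder md5 of the case
    "input": {
        "system_messages": ["You are a careful medical assistant."],
        "prompt": "Which gene is most frequently mutated in melanoma?",
    },
    "expected": {
        # for regex cases: each inner list is joined with '|' and matched
        # case-insensitively against the model response
        "words_in_response": [["braf"], ["melanoma", "skin cancer"]],
    },
}
```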

2 changes: 1 addition & 1 deletion benchmark/load_dataset.py
@@ -58,7 +58,7 @@ def _load_test_data_from_this_repository():
test_data = {}
for file_path in files_in_directory:
if file_path.endswith(".yaml"):
-with open(file_path, "r") as stream:
+with open(file_path, "r", encoding='utf-8') as stream:
try:
yaml_data = yaml.safe_load(stream)

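The one-line change above pins the file encoding, so the YAML dataset is decoded identically on every platform instead of falling back to the OS default (for example cp1252 on Windows), which can fail on non-ASCII characters. A minimal sketch of the resulting pattern:

```python
import yaml


def load_yaml_file(path: str) -> dict:
    # explicit encoding: identical behaviour on Linux, macOS and Windows
    with open(path, "r", encoding="utf-8") as stream:
        return yaml.safe_load(stream)
```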
6 changes: 6 additions & 0 deletions benchmark/results/explicit_relevance_of_single_fragments.csv
@@ -114,11 +114,17 @@ code-llama-instruct:7:ggufv2:Q8_0,explicit_relevance_no_repeat_instruction,1.0/1
code-llama-instruct:7:ggufv2:Q8_0,explicit_relevance_no_simple,1.0/1,5,bf26b8241de3470cd9a406aea0992fb2,2024-02-10 11:50:25
code-llama-instruct:7:ggufv2:Q8_0,explicit_relevance_yes,1.0/1,5,1f7a20371c9b65790b9b8e8be116b246,2024-02-10 11:48:39
gpt-3.5-turbo-0125,explicit_evaluation_no,1.0/1,5,d15e0094569f8df146459b50a781fc55,2024-02-12 08:31:21
gpt-3.5-turbo-0125,explicit_evaluation_no,1.0/1,1,d15e0094569f8df146459b50a781fc55,2024-04-29 14:13:47
gpt-3.5-turbo-0125,explicit_evaluation_yes,1.0/1,5,1773602eac8037fbea015069d5f15cd2,2024-02-12 08:31:24
gpt-3.5-turbo-0125,explicit_evaluation_yes,1.0/1,1,1773602eac8037fbea015069d5f15cd2,2024-04-29 14:13:49
gpt-3.5-turbo-0125,explicit_relevance_no_more_explicit,1.0/1,5,8dddcfc1314f6f671d59a3a90c95e3c0,2024-02-12 08:31:50
gpt-3.5-turbo-0125,explicit_relevance_no_more_explicit,1.0/1,1,8dddcfc1314f6f671d59a3a90c95e3c0,2024-04-29 14:13:58
gpt-3.5-turbo-0125,explicit_relevance_no_repeat_instruction,1.0/1,5,1ca6c04890597e4ece0eb8ad632f3f75,2024-02-12 08:31:53
gpt-3.5-turbo-0125,explicit_relevance_no_repeat_instruction,1.0/1,1,1ca6c04890597e4ece0eb8ad632f3f75,2024-04-29 14:14:01
gpt-3.5-turbo-0125,explicit_relevance_no_simple,1.0/1,5,bf26b8241de3470cd9a406aea0992fb2,2024-02-12 08:31:46
gpt-3.5-turbo-0125,explicit_relevance_no_simple,1.0/1,1,bf26b8241de3470cd9a406aea0992fb2,2024-04-29 14:13:57
gpt-3.5-turbo-0125,explicit_relevance_yes,1.0/1,5,1f7a20371c9b65790b9b8e8be116b246,2024-02-12 08:31:18
gpt-3.5-turbo-0125,explicit_relevance_yes,1.0/1,1,1f7a20371c9b65790b9b8e8be116b246,2024-04-29 14:13:45
gpt-3.5-turbo-0613,explicit_evaluation_no,1.0/1,5,d15e0094569f8df146459b50a781fc55,2024-02-10 11:56:05
gpt-3.5-turbo-0613,explicit_evaluation_yes,1.0/1,5,1773602eac8037fbea015069d5f15cd2,2024-02-10 11:56:08
gpt-3.5-turbo-0613,explicit_relevance_no_more_explicit,1.0/1,5,8dddcfc1314f6f671d59a3a90c95e3c0,2024-02-10 11:56:42
benchmark/results/implicit_relevance_of_multiple_fragments.csv
@@ -38,7 +38,9 @@ code-llama-instruct:7:ggufv2:Q6_K,implicit_relevance_yes,1.0/1,5,f9d749647929fcb
code-llama-instruct:7:ggufv2:Q8_0,implicit_relevance_no,0.0/1,5,b24fb31fd761b0f3e308bebd70ce4277,2024-02-10 11:49:53
code-llama-instruct:7:ggufv2:Q8_0,implicit_relevance_yes,1.0/1,5,f9d749647929fcb55321c614a3bf8d20,2024-02-10 11:50:22
gpt-3.5-turbo-0125,implicit_relevance_no,0.8/1,5,b24fb31fd761b0f3e308bebd70ce4277,2024-02-12 08:31:33
gpt-3.5-turbo-0125,implicit_relevance_no,1.0/1,1,b24fb31fd761b0f3e308bebd70ce4277,2024-04-29 14:13:53
gpt-3.5-turbo-0125,implicit_relevance_yes,1.0/1,5,f9d749647929fcb55321c614a3bf8d20,2024-02-12 08:31:42
gpt-3.5-turbo-0125,implicit_relevance_yes,1.0/1,1,f9d749647929fcb55321c614a3bf8d20,2024-04-29 14:13:56
gpt-3.5-turbo-0613,implicit_relevance_no,1.0/1,5,b24fb31fd761b0f3e308bebd70ce4277,2024-02-10 11:56:18
gpt-3.5-turbo-0613,implicit_relevance_yes,1.0/1,5,f9d749647929fcb55321c614a3bf8d20,2024-02-10 11:56:37
gpt-4-0125-preview,implicit_relevance_no,0.0/1,5,b24fb31fd761b0f3e308bebd70ce4277,2024-02-12 08:38:56
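Each row in these result files records the model, the test case, the score as mean/max, the number of iterations, the md5 hash of the case definition, and a timestamp. A sketch of aggregating them with pandas; the column names are assumptions for illustration, since the header row is not visible in this diff:

```python
import pandas as pd

# column names assumed for illustration; the files' real header is not shown here
columns = ["model", "case", "score", "iterations", "md5_hash", "timestamp"]
df = pd.read_csv(
    "benchmark/results/explicit_relevance_of_single_fragments.csv",
    names=columns,
    header=0,  # assumes the file carries its own header row, replaced here
)

# split "1.0/1" into numeric parts and average per model
df[["mean_score", "max_score"]] = df["score"].str.split("/", expand=True).astype(float)
print(df.groupby("model")["mean_score"].mean())
```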
75 changes: 75 additions & 0 deletions benchmark/test_correctness_medical_usecase.py
@@ -0,0 +1,75 @@
import inspect
import re

import pytest

from biochatter._misc import ensure_iterable
from .conftest import calculate_test_score
from .benchmark_utils import (
skip_if_already_run,
get_result_file_path,
write_results_to_file,
)


def test_correctness_of_answers(
model_name,
test_data_correctness,
conversation,
multiple_testing,
):
yaml_data = test_data_correctness
task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}"
# comment back in when needed
skip_if_already_run(
model_name=model_name, task=task, md5_hash=yaml_data["hash"]
)


def run_test():
conversation.reset() # needs to be reset for each test
[
conversation.append_system_message(m)
for m in yaml_data["input"]["system_messages"]
]
response, _, _ = conversation.query(yaml_data["input"]["prompt"])

# lower case, remove punctuation
response = (
response.lower().replace(".", "").replace("?", "").replace("!", "")
).strip()

print(yaml_data["case"])
print(response)
# print(get_result_file_path(task))

# calculate score of correct answers
score = []

# calculate for answers without regex
if "regex" not in yaml_data["case"]:
score.append(response == yaml_data["expected"]["answer"])

# calculate for answers with regex
else:
expected_word_pairs = yaml_data["expected"]["words_in_response"]
for pair in expected_word_pairs:
regex = "|".join(pair)
if re.search(regex, response, re.IGNORECASE):
# print(f"Expected words '{pair}' found in response: {response}")
score.append(True)
else:
score.append(False)

return calculate_test_score(score)

mean_score, max, n_iterations = multiple_testing(run_test)

write_results_to_file(
model_name,
yaml_data["case"],
f"{mean_score}/{max}",
f"{n_iterations}",
yaml_data["hash"],
get_result_file_path(task),
)
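The scoring logic above either compares the normalised response to a single expected answer or, for cases whose name contains "regex", checks that at least one word from each expected pair occurs in the response. A self-contained illustration of the regex branch with invented data:

```python
import re

# invented example; real pairs come from benchmark_data.yaml
response = "the braf v600e mutation is frequently found in melanoma"
expected_word_pairs = [["braf"], ["melanoma", "skin cancer"], ["vemurafenib"]]

score = []
for pair in expected_word_pairs:
    regex = "|".join(pair)  # e.g. "melanoma|skin cancer"
    score.append(bool(re.search(regex, response, re.IGNORECASE)))

print(score)                    # [True, True, False]
print(sum(score) / len(score))  # 0.666..., the fraction of satisfied pairs
```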
10 changes: 5 additions & 5 deletions benchmark/test_rag_interpretation.py
@@ -5,7 +5,7 @@
from biochatter._misc import ensure_iterable
from .conftest import calculate_test_score
from .benchmark_utils import (
-skip_if_already_run,
+#skip_if_already_run,
get_result_file_path,
write_results_to_file,
)
@@ -19,9 +19,9 @@ def test_explicit_relevance_of_single_fragments(
):
yaml_data = test_data_rag_interpretation
task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}"
-skip_if_already_run(
+'''skip_if_already_run(
model_name=model_name, task=task, md5_hash=yaml_data["hash"]
-)
+)'''
if "explicit" not in yaml_data["case"]:
pytest.skip(
f"test case {yaml_data['case']} not supported for {task} benchmark"
@@ -67,9 +67,9 @@ def test_implicit_relevance_of_multiple_fragments(
):
yaml_data = test_data_rag_interpretation
task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}"
-skip_if_already_run(
+'''skip_if_already_run(
model_name=model_name, task=task, md5_hash=yaml_data["hash"]
-)
+)'''
if "implicit" not in yaml_data["case"]:
pytest.skip(
f"test case {yaml_data['case']} not supported for {task} benchmark"
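Wrapping the skip_if_already_run(...) calls in triple-quoted strings turns each call into a bare string expression, so the function is never executed and previously recorded results no longer cause these benchmarks to be skipped. A sketch of making that opt-out explicit instead; the BENCHMARK_RERUN_ALL flag is hypothetical and not part of the commit:

```python
import os

from .benchmark_utils import skip_if_already_run

# hypothetical switch: set BENCHMARK_RERUN_ALL=1 to rerun finished benchmarks
RERUN_ALL = os.getenv("BENCHMARK_RERUN_ALL", "0") == "1"


def maybe_skip(model_name, task, md5_hash):
    if not RERUN_ALL:
        skip_if_already_run(model_name=model_name, task=task, md5_hash=md5_hash)
```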
9 changes: 9 additions & 0 deletions openAIKey.py
@@ -0,0 +1,9 @@
import os
from dotenv import load_dotenv


from openai import OpenAI
# delete dots if venv is in project env
cus_path = os.getcwd() + "../venv/bin/.env"
load_dotenv(cus_path)
print(os.getenv("OPENAI_API_KEY"))
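openAIKey.py prints the raw key to stdout, which is convenient for a quick check but leaks the secret into terminals and logs. A sketch that only reports whether the key was found, under the same assumed .env location as the script above:

```python
import os
from pathlib import Path

from dotenv import load_dotenv

# same assumed layout as the script above: .env inside the venv's bin directory
env_file = Path.cwd().parent / "venv" / "bin" / ".env"
load_dotenv(env_file)

api_key = os.getenv("OPENAI_API_KEY")
print("OPENAI_API_KEY found:", bool(api_key))  # report presence, not the value
```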
File renamed without changes.
