
Commit

Merge pull request #8 from mehizli/develop
Current state
ytehran authored May 22, 2024
2 parents 1a23b4d + 83b4905 commit ad7a0da
Showing 10 changed files with 1,742 additions and 13 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@ dist/
__pycache__/
.venv
.pytest_cache
venv/.env
.env
*.mp3
.cache
25 changes: 18 additions & 7 deletions benchmark/conftest.py
@@ -1,29 +1,28 @@
import os

import requests
from dotenv import load_dotenv
from xinference.client import Client
import pytest

import numpy as np
import pandas as pd

from biochatter.prompts import BioCypherPromptEngine
-from benchmark.load_dataset import get_benchmark_dataset
+from .load_dataset import get_benchmark_dataset
from biochatter.llm_connect import GptConversation, XinferenceConversation
from .benchmark_utils import benchmark_already_executed

# how often should each benchmark be run?
-N_ITERATIONS = 5
+N_ITERATIONS = 1

# which dataset should be used for benchmarking?
BENCHMARK_DATASET = get_benchmark_dataset()

# which models should be benchmarked?
OPENAI_MODEL_NAMES = [
-"gpt-3.5-turbo-0613",
-"gpt-3.5-turbo-0125",
-"gpt-4-0613",
-"gpt-4-0125-preview",
+"gpt-3.5-turbo-0125"
+#"gpt-4-0613"
]

XINFERENCE_MODELS = {
@@ -148,7 +147,7 @@
for quantization in XINFERENCE_MODELS[model_name]["quantization"]
]

-BENCHMARKED_MODELS = OPENAI_MODEL_NAMES + XINFERENCE_MODEL_NAMES
+BENCHMARKED_MODELS = OPENAI_MODEL_NAMES #+ XINFERENCE_MODEL_NAMES
BENCHMARKED_MODELS.sort()

# Xinference IP and port
@@ -233,6 +232,9 @@ def conversation(request, model_name):
prompts={},
correct=False,
)
# delete first dots if venv is in project env
cus_path = os.getcwd() + "../../venv/bin/.env"
load_dotenv(cus_path)
conversation.set_api_key(
os.getenv("OPENAI_API_KEY"), user="benchmark_user"
)
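This hunk, and the evaluation_conversation hunk below, build the path to the .env file by plain string concatenation (os.getcwd() + "../../venv/bin/.env"), which leaves no separator between the working directory and the first "..". A minimal, separator-safe sketch of the same lookup with pathlib; the candidate locations are assumptions about the local venv layout, not part of the commit:

```python
# Minimal sketch, not the committed code: locate the .env file with pathlib
# so no path separator can be lost, then read the key from the environment.
import os
from pathlib import Path

from dotenv import load_dotenv


def load_openai_key() -> str | None:
    # candidate .env locations; adjust to your own venv layout (assumed)
    candidates = [
        Path.cwd() / ".env",
        Path.cwd().parent / "venv" / "bin" / ".env",
    ]
    for env_file in candidates:
        if env_file.is_file():
            load_dotenv(env_file)  # populates os.environ from the first match
            break
    return os.getenv("OPENAI_API_KEY")
```

The fixtures could then call conversation.set_api_key(load_openai_key(), user="benchmark_user") regardless of where the virtual environment lives.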
@@ -304,6 +306,9 @@ def evaluation_conversation():
prompts={},
correct=False,
)
# delete first dots if venv is in project env
cus_path = os.getcwd() + "../../venv/bin/.env"
load_dotenv(cus_path)
conversation.set_api_key(os.getenv("OPENAI_API_KEY"), user="benchmark_user")
return conversation

@@ -396,6 +401,12 @@ def pytest_generate_tests(metafunc):
"test_data_text_extraction",
data_file["text_extraction"],
)
if "test_data_correctness" in metafunc.fixturenames:
metafunc.parametrize(
"test_data_correctness",
data_file["correctness"],
)



@pytest.fixture
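For context, the new test_data_correctness branch relies on pytest's pytest_generate_tests hook: any argument name found in metafunc.fixturenames can be parametrised directly from data, so every correctness entry in the dataset becomes its own test invocation. A toy, self-contained illustration of the mechanism (the data here is invented):

```python
# Toy illustration of the pytest_generate_tests mechanism used above
# (invented data; the real cases live in benchmark_data.yaml).
CASES = [
    {"case": "simple_fact", "expected": {"answer": "yes"}},
    {"case": "regex_fact", "expected": {"words_in_response": [["braf"]]}},
]


def pytest_generate_tests(metafunc):
    if "test_data_correctness" in metafunc.fixturenames:
        # one test invocation per case dictionary
        metafunc.parametrize("test_data_correctness", CASES)


def test_correctness_of_answers(test_data_correctness):
    assert "case" in test_data_correctness
```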
1,625 changes: 1,625 additions & 0 deletions benchmark/data/benchmark_data.yaml

Large diffs are not rendered by default.
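The 1,625 added lines of benchmark/data/benchmark_data.yaml are not shown. Judging from how the new test_correctness_medical_usecase.py further down reads the data, each correctness case carries a name, an md5 hash, system messages, a prompt, and either an exact expected answer or lists of acceptable words. A hypothetical entry with that shape, as it would appear after yaml.safe_load (every value invented):

```python
# Hypothetical correctness entry, structure inferred from the test below.
example_case = {
    "case": "medical_usecase_regex_example",
    "hash": "0123456789abcdef0123456789abcdef",  # placeholder md5 of the case
    "input": {
        "system_messages": ["You are a careful medical assistant."],
        "prompt": "Which gene is most frequently mutated in melanoma?",
    },
    "expected": {
        # for regex cases: each inner list is joined with '|' and matched
        # case-insensitively against the model response
        "words_in_response": [["braf"], ["melanoma", "skin cancer"]],
    },
}
```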

2 changes: 1 addition & 1 deletion benchmark/load_dataset.py
@@ -58,7 +58,7 @@ def _load_test_data_from_this_repository():
test_data = {}
for file_path in files_in_directory:
if file_path.endswith(".yaml"):
-with open(file_path, "r") as stream:
+with open(file_path, "r", encoding='utf-8') as stream:
try:
yaml_data = yaml.safe_load(stream)

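The one-line change above pins the file encoding, so the YAML dataset is decoded identically on every platform instead of falling back to the OS default (for example cp1252 on Windows), which can fail on non-ASCII characters. A minimal sketch of the resulting pattern:

```python
import yaml


def load_yaml_file(path: str) -> dict:
    # explicit encoding: identical behaviour on Linux, macOS and Windows
    with open(path, "r", encoding="utf-8") as stream:
        return yaml.safe_load(stream)
```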
6 changes: 6 additions & 0 deletions benchmark/results/explicit_relevance_of_single_fragments.csv
@@ -114,11 +114,17 @@ code-llama-instruct:7:ggufv2:Q8_0,explicit_relevance_no_repeat_instruction,1.0/1
code-llama-instruct:7:ggufv2:Q8_0,explicit_relevance_no_simple,1.0/1,5,bf26b8241de3470cd9a406aea0992fb2,2024-02-10 11:50:25
code-llama-instruct:7:ggufv2:Q8_0,explicit_relevance_yes,1.0/1,5,1f7a20371c9b65790b9b8e8be116b246,2024-02-10 11:48:39
gpt-3.5-turbo-0125,explicit_evaluation_no,1.0/1,5,d15e0094569f8df146459b50a781fc55,2024-02-12 08:31:21
gpt-3.5-turbo-0125,explicit_evaluation_no,1.0/1,1,d15e0094569f8df146459b50a781fc55,2024-04-29 14:13:47
gpt-3.5-turbo-0125,explicit_evaluation_yes,1.0/1,5,1773602eac8037fbea015069d5f15cd2,2024-02-12 08:31:24
gpt-3.5-turbo-0125,explicit_evaluation_yes,1.0/1,1,1773602eac8037fbea015069d5f15cd2,2024-04-29 14:13:49
gpt-3.5-turbo-0125,explicit_relevance_no_more_explicit,1.0/1,5,8dddcfc1314f6f671d59a3a90c95e3c0,2024-02-12 08:31:50
gpt-3.5-turbo-0125,explicit_relevance_no_more_explicit,1.0/1,1,8dddcfc1314f6f671d59a3a90c95e3c0,2024-04-29 14:13:58
gpt-3.5-turbo-0125,explicit_relevance_no_repeat_instruction,1.0/1,5,1ca6c04890597e4ece0eb8ad632f3f75,2024-02-12 08:31:53
gpt-3.5-turbo-0125,explicit_relevance_no_repeat_instruction,1.0/1,1,1ca6c04890597e4ece0eb8ad632f3f75,2024-04-29 14:14:01
gpt-3.5-turbo-0125,explicit_relevance_no_simple,1.0/1,5,bf26b8241de3470cd9a406aea0992fb2,2024-02-12 08:31:46
gpt-3.5-turbo-0125,explicit_relevance_no_simple,1.0/1,1,bf26b8241de3470cd9a406aea0992fb2,2024-04-29 14:13:57
gpt-3.5-turbo-0125,explicit_relevance_yes,1.0/1,5,1f7a20371c9b65790b9b8e8be116b246,2024-02-12 08:31:18
gpt-3.5-turbo-0125,explicit_relevance_yes,1.0/1,1,1f7a20371c9b65790b9b8e8be116b246,2024-04-29 14:13:45
gpt-3.5-turbo-0613,explicit_evaluation_no,1.0/1,5,d15e0094569f8df146459b50a781fc55,2024-02-10 11:56:05
gpt-3.5-turbo-0613,explicit_evaluation_yes,1.0/1,5,1773602eac8037fbea015069d5f15cd2,2024-02-10 11:56:08
gpt-3.5-turbo-0613,explicit_relevance_no_more_explicit,1.0/1,5,8dddcfc1314f6f671d59a3a90c95e3c0,2024-02-10 11:56:42
benchmark/results/implicit_relevance_of_multiple_fragments.csv
@@ -38,7 +38,9 @@ code-llama-instruct:7:ggufv2:Q6_K,implicit_relevance_yes,1.0/1,5,f9d749647929fcb
code-llama-instruct:7:ggufv2:Q8_0,implicit_relevance_no,0.0/1,5,b24fb31fd761b0f3e308bebd70ce4277,2024-02-10 11:49:53
code-llama-instruct:7:ggufv2:Q8_0,implicit_relevance_yes,1.0/1,5,f9d749647929fcb55321c614a3bf8d20,2024-02-10 11:50:22
gpt-3.5-turbo-0125,implicit_relevance_no,0.8/1,5,b24fb31fd761b0f3e308bebd70ce4277,2024-02-12 08:31:33
gpt-3.5-turbo-0125,implicit_relevance_no,1.0/1,1,b24fb31fd761b0f3e308bebd70ce4277,2024-04-29 14:13:53
gpt-3.5-turbo-0125,implicit_relevance_yes,1.0/1,5,f9d749647929fcb55321c614a3bf8d20,2024-02-12 08:31:42
gpt-3.5-turbo-0125,implicit_relevance_yes,1.0/1,1,f9d749647929fcb55321c614a3bf8d20,2024-04-29 14:13:56
gpt-3.5-turbo-0613,implicit_relevance_no,1.0/1,5,b24fb31fd761b0f3e308bebd70ce4277,2024-02-10 11:56:18
gpt-3.5-turbo-0613,implicit_relevance_yes,1.0/1,5,f9d749647929fcb55321c614a3bf8d20,2024-02-10 11:56:37
gpt-4-0125-preview,implicit_relevance_no,0.0/1,5,b24fb31fd761b0f3e308bebd70ce4277,2024-02-12 08:38:56
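Each row in these result files records the model, the test case, the score as mean/max, the number of iterations, the md5 hash of the case definition, and a timestamp. A sketch of aggregating them with pandas; the column names are assumptions for illustration, since the header row is not visible in this diff:

```python
import pandas as pd

# column names assumed for illustration; the files' real header is not shown here
columns = ["model", "case", "score", "iterations", "md5_hash", "timestamp"]
df = pd.read_csv(
    "benchmark/results/explicit_relevance_of_single_fragments.csv",
    names=columns,
    header=0,  # assumes the file carries its own header row, replaced here
)

# split "1.0/1" into numeric parts and average per model
df[["mean_score", "max_score"]] = df["score"].str.split("/", expand=True).astype(float)
print(df.groupby("model")["mean_score"].mean())
```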
75 changes: 75 additions & 0 deletions benchmark/test_correctness_medical_usecase.py
@@ -0,0 +1,75 @@
import inspect
import re

import pytest

from biochatter._misc import ensure_iterable
from .conftest import calculate_test_score
from .benchmark_utils import (
skip_if_already_run,
get_result_file_path,
write_results_to_file,
)


def test_correctness_of_answers(
model_name,
test_data_correctness,
conversation,
multiple_testing,
):
yaml_data = test_data_correctness
task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}"
# comment back in when needed
skip_if_already_run(
model_name=model_name, task=task, md5_hash=yaml_data["hash"]
)


def run_test():
conversation.reset() # needs to be reset for each test
[
conversation.append_system_message(m)
for m in yaml_data["input"]["system_messages"]
]
response, _, _ = conversation.query(yaml_data["input"]["prompt"])

# lower case, remove punctuation
response = (
response.lower().replace(".", "").replace("?", "").replace("!", "")
).strip()

print(yaml_data["case"])
print(response)
# print(get_result_file_path(task))

# calculate score of correct answers
score = []

# calculate for answers without regex
if "regex" not in yaml_data["case"]:
score.append(response == yaml_data["expected"]["answer"])

# calculate for answers with regex
else:
expected_word_pairs = yaml_data["expected"]["words_in_response"]
for pair in expected_word_pairs:
regex = "|".join(pair)
if re.search(regex, response, re.IGNORECASE):
# print(f"Expected words '{pair}' found in response: {response}")
score.append(True)
else:
score.append(False)

return calculate_test_score(score)

mean_score, max, n_iterations = multiple_testing(run_test)

write_results_to_file(
model_name,
yaml_data["case"],
f"{mean_score}/{max}",
f"{n_iterations}",
yaml_data["hash"],
get_result_file_path(task),
)
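The scoring logic above either compares the normalised response to a single expected answer or, for cases whose name contains "regex", checks that at least one word from each expected pair occurs in the response. A self-contained illustration of the regex branch with invented data:

```python
import re

# invented example; real pairs come from benchmark_data.yaml
response = "the braf v600e mutation is frequently found in melanoma"
expected_word_pairs = [["braf"], ["melanoma", "skin cancer"], ["vemurafenib"]]

score = []
for pair in expected_word_pairs:
    regex = "|".join(pair)  # e.g. "melanoma|skin cancer"
    score.append(bool(re.search(regex, response, re.IGNORECASE)))

print(score)                    # [True, True, False]
print(sum(score) / len(score))  # 0.666..., the fraction of satisfied pairs
```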
10 changes: 5 additions & 5 deletions benchmark/test_rag_interpretation.py
@@ -5,7 +5,7 @@
from biochatter._misc import ensure_iterable
from .conftest import calculate_test_score
from .benchmark_utils import (
-skip_if_already_run,
+#skip_if_already_run,
get_result_file_path,
write_results_to_file,
)
@@ -19,9 +19,9 @@ def test_explicit_relevance_of_single_fragments(
):
yaml_data = test_data_rag_interpretation
task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}"
-skip_if_already_run(
+'''skip_if_already_run(
model_name=model_name, task=task, md5_hash=yaml_data["hash"]
-)
+)'''
if "explicit" not in yaml_data["case"]:
pytest.skip(
f"test case {yaml_data['case']} not supported for {task} benchmark"
@@ -67,9 +67,9 @@ def test_implicit_relevance_of_multiple_fragments(
):
yaml_data = test_data_rag_interpretation
task = f"{inspect.currentframe().f_code.co_name.replace('test_', '')}"
-skip_if_already_run(
+'''skip_if_already_run(
model_name=model_name, task=task, md5_hash=yaml_data["hash"]
-)
+)'''
if "implicit" not in yaml_data["case"]:
pytest.skip(
f"test case {yaml_data['case']} not supported for {task} benchmark"
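Wrapping the skip_if_already_run(...) calls in triple-quoted strings turns each call into a bare string expression, so the function is never executed and previously recorded results no longer cause these benchmarks to be skipped. A sketch of making that opt-out explicit instead; the BENCHMARK_RERUN_ALL flag is hypothetical and not part of the commit:

```python
import os

from .benchmark_utils import skip_if_already_run

# hypothetical switch: set BENCHMARK_RERUN_ALL=1 to rerun finished benchmarks
RERUN_ALL = os.getenv("BENCHMARK_RERUN_ALL", "0") == "1"


def maybe_skip(model_name, task, md5_hash):
    if not RERUN_ALL:
        skip_if_already_run(model_name=model_name, task=task, md5_hash=md5_hash)
```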
9 changes: 9 additions & 0 deletions openAIKey.py
@@ -0,0 +1,9 @@
import os
from dotenv import load_dotenv


from openai import OpenAI
# delete dots if venv is in project env
cus_path = os.getcwd() + "../venv/bin/.env"
load_dotenv(cus_path)
print(os.getenv("OPENAI_API_KEY"))
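openAIKey.py prints the raw key to stdout, which is convenient for a quick check but leaks the secret into terminals and logs. A sketch that only reports whether the key was found, under the same assumed .env location as the script above:

```python
import os
from pathlib import Path

from dotenv import load_dotenv

# same assumed layout as the script above: .env inside the venv's bin directory
env_file = Path.cwd().parent / "venv" / "bin" / ".env"
load_dotenv(env_file)

api_key = os.getenv("OPENAI_API_KEY")
print("OPENAI_API_KEY found:", bool(api_key))  # report presence, not the value
```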
File renamed without changes.
