From 18136e786c976196ff929d670a18faf7308a4241 Mon Sep 17 00:00:00 2001
From: Jan Kessler
Date: Sat, 19 Apr 2025 16:52:16 +0200
Subject: [PATCH] fix import collision for types

---
 browsecomp_eval.py                   | 2 +-
 common.py                            | 2 +-
 drop_eval.py                         | 2 +-
 types.py => eval_types.py            | 0
 gpqa_eval.py                         | 2 +-
 humaneval_eval.py                    | 2 +-
 math_eval.py                         | 2 +-
 mgsm_eval.py                         | 2 +-
 mmlu_eval.py                         | 2 +-
 sampler/chat_completion_sampler.py   | 2 +-
 sampler/claude_sampler.py            | 2 +-
 sampler/o_chat_completion_sampler.py | 2 +-
 sampler/responses_sampler.py         | 2 +-
 simpleqa_eval.py                     | 2 +-
 14 files changed, 13 insertions(+), 13 deletions(-)
 rename types.py => eval_types.py (100%)

diff --git a/browsecomp_eval.py b/browsecomp_eval.py
index e246d52f..3c74399a 100644
--- a/browsecomp_eval.py
+++ b/browsecomp_eval.py
@@ -10,7 +10,7 @@ import re
 import pandas
 
 from . import common
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 # from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_model_predictions.py#L11
 QUERY_TEMPLATE = """
diff --git a/common.py b/common.py
index b6b4c0e1..9f192123 100644
--- a/common.py
+++ b/common.py
@@ -9,7 +9,7 @@
 import requests
 from tqdm import tqdm
 
-from .types import EvalResult, Message, SamplerBase, SingleEvalResult
+from .eval_types import EvalResult, Message, SamplerBase, SingleEvalResult
 
 QUERY_TEMPLATE_MULTICHOICE = """
 Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
diff --git a/drop_eval.py b/drop_eval.py
index 27918e5b..2f805ba1 100644
--- a/drop_eval.py
+++ b/drop_eval.py
@@ -16,7 +16,7 @@
 
 from . import common
 from .common import ANSWER_PATTERN, HTML_JINJA
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 """
 From here through _normalize_answer was originally copied from:
diff --git a/types.py b/eval_types.py
similarity index 100%
rename from types.py
rename to eval_types.py
diff --git a/gpqa_eval.py b/gpqa_eval.py
index 21c717ef..7e8e312b 100644
--- a/gpqa_eval.py
+++ b/gpqa_eval.py
@@ -11,7 +11,7 @@
 
 from . import common
 from .common import ANSWER_PATTERN_MULTICHOICE, HTML_JINJA, format_multichoice_question
-from .types import Eval, EvalResult, MessageList, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, MessageList, SamplerBase, SingleEvalResult
 
 
 class GPQAEval(Eval):
diff --git a/humaneval_eval.py b/humaneval_eval.py
index 75eab56a..a80706e3 100644
--- a/humaneval_eval.py
+++ b/humaneval_eval.py
@@ -20,7 +20,7 @@
 
 from . import common
 from .common import HTML_JINJA
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 
 def evaluate_functional_correctness(
diff --git a/math_eval.py b/math_eval.py
index 4328dcdf..43f09d57 100644
--- a/math_eval.py
+++ b/math_eval.py
@@ -12,7 +12,7 @@
 
 from . import common
 from .common import ANSWER_PATTERN, HTML_JINJA, check_equality
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 QUERY_TEMPLATE = """
 Solve the following math problem step by step. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
diff --git a/mgsm_eval.py b/mgsm_eval.py
index 674ac964..9a503e99 100644
--- a/mgsm_eval.py
+++ b/mgsm_eval.py
@@ -10,7 +10,7 @@
 
 from . import common
 from .mmlu_eval import HTML_JINJA
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"]
 LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"]
diff --git a/mmlu_eval.py b/mmlu_eval.py
index 9423c660..6065b818 100644
--- a/mmlu_eval.py
+++ b/mmlu_eval.py
@@ -18,7 +18,7 @@
     normalize_extracted_answer,
     normalize_response,
 )
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 subject2category = {
     "abstract_algebra": "stem",
diff --git a/sampler/chat_completion_sampler.py b/sampler/chat_completion_sampler.py
index d75ce918..356aba29 100644
--- a/sampler/chat_completion_sampler.py
+++ b/sampler/chat_completion_sampler.py
@@ -5,7 +5,7 @@
 import openai
 from openai import OpenAI
 
-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase
 
 OPENAI_SYSTEM_MESSAGE_API = "You are a helpful assistant."
 OPENAI_SYSTEM_MESSAGE_CHATGPT = (
diff --git a/sampler/claude_sampler.py b/sampler/claude_sampler.py
index cf61b441..383c5d3b 100644
--- a/sampler/claude_sampler.py
+++ b/sampler/claude_sampler.py
@@ -2,7 +2,7 @@
 
 import anthropic
 
-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase
 
 CLAUDE_SYSTEM_MESSAGE_LMSYS = (
     "The assistant is Claude, created by Anthropic. The current date is "
diff --git a/sampler/o_chat_completion_sampler.py b/sampler/o_chat_completion_sampler.py
index 718d02a8..ca42ee33 100644
--- a/sampler/o_chat_completion_sampler.py
+++ b/sampler/o_chat_completion_sampler.py
@@ -4,7 +4,7 @@
 import openai
 from openai import OpenAI
 
-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase
 
 
 class OChatCompletionSampler(SamplerBase):
diff --git a/sampler/responses_sampler.py b/sampler/responses_sampler.py
index 1e49b21c..8fb41aa4 100644
--- a/sampler/responses_sampler.py
+++ b/sampler/responses_sampler.py
@@ -6,7 +6,7 @@
 import openai
 from openai import OpenAI
 
-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase
 
 
 class ResponsesSampler(SamplerBase):
diff --git a/simpleqa_eval.py b/simpleqa_eval.py
index 2a1390a0..780aaeaf 100644
--- a/simpleqa_eval.py
+++ b/simpleqa_eval.py
@@ -8,7 +8,7 @@ import re
 import pandas
 
 from . import common
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 GRADER_TEMPLATE = """
 Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
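-- 
Note below the signature separator, so git am ignores it: the collision this
patch fixes is name shadowing of the standard library's types module. When the
repository root is first on sys.path (for example, when a script in the repo
is run directly), "import types" can resolve to the local types.py, and stdlib
consumers such as typing and dataclasses may then fail with confusing
ImportErrors. A minimal sketch of the shadowing; repro.py is a hypothetical
file placed at the repo root next to the old types.py, not part of this patch:

    # repro.py -- illustrative sketch only, not part of the patch.
    import sys

    print(sys.path[0])     # the script's directory, searched before the stdlib

    import types           # expected to pick up the local types.py from here

    print(types.__file__)  # before this patch: <repo>/types.py
                           # after this patch:  .../lib/python3.x/types.py

Renaming the module to eval_types.py and updating the relative imports, as in
the hunks above, removes the collision without changing any behavior.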