From 18136e786c976196ff929d670a18faf7308a4241 Mon Sep 17 00:00:00 2001
From: Jan Kessler
Date: Sat, 19 Apr 2025 16:52:16 +0200
Subject: [PATCH] fix import collision for types

---
 browsecomp_eval.py                   | 2 +-
 common.py                            | 2 +-
 drop_eval.py                         | 2 +-
 types.py => eval_types.py            | 0
 gpqa_eval.py                         | 2 +-
 humaneval_eval.py                    | 2 +-
 math_eval.py                         | 2 +-
 mgsm_eval.py                         | 2 +-
 mmlu_eval.py                         | 2 +-
 sampler/chat_completion_sampler.py   | 2 +-
 sampler/claude_sampler.py            | 2 +-
 sampler/o_chat_completion_sampler.py | 2 +-
 sampler/responses_sampler.py         | 2 +-
 simpleqa_eval.py                     | 2 +-
 14 files changed, 13 insertions(+), 13 deletions(-)
 rename types.py => eval_types.py (100%)

diff --git a/browsecomp_eval.py b/browsecomp_eval.py
index e246d52f..3c74399a 100644
--- a/browsecomp_eval.py
+++ b/browsecomp_eval.py
@@ -10,7 +10,7 @@ import re
 import pandas
 
 from . import common
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 # from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_model_predictions.py#L11
 QUERY_TEMPLATE = """
diff --git a/common.py b/common.py
index b6b4c0e1..9f192123 100644
--- a/common.py
+++ b/common.py
@@ -9,7 +9,7 @@
 import requests
 from tqdm import tqdm
 
-from .types import EvalResult, Message, SamplerBase, SingleEvalResult
+from .eval_types import EvalResult, Message, SamplerBase, SingleEvalResult
 
 QUERY_TEMPLATE_MULTICHOICE = """
 Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
diff --git a/drop_eval.py b/drop_eval.py
index 27918e5b..2f805ba1 100644
--- a/drop_eval.py
+++ b/drop_eval.py
@@ -16,7 +16,7 @@
 
 from . import common
 from .common import ANSWER_PATTERN, HTML_JINJA
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 """
 From here through _normalize_answer was originally copied from:
diff --git a/types.py b/eval_types.py
similarity index 100%
rename from types.py
rename to eval_types.py
diff --git a/gpqa_eval.py b/gpqa_eval.py
index 21c717ef..7e8e312b 100644
--- a/gpqa_eval.py
+++ b/gpqa_eval.py
@@ -11,7 +11,7 @@
 
 from . import common
 from .common import ANSWER_PATTERN_MULTICHOICE, HTML_JINJA, format_multichoice_question
-from .types import Eval, EvalResult, MessageList, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, MessageList, SamplerBase, SingleEvalResult
 
 
 class GPQAEval(Eval):
diff --git a/humaneval_eval.py b/humaneval_eval.py
index 75eab56a..a80706e3 100644
--- a/humaneval_eval.py
+++ b/humaneval_eval.py
@@ -20,7 +20,7 @@
 
 from . import common
 from .common import HTML_JINJA
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 
 def evaluate_functional_correctness(
diff --git a/math_eval.py b/math_eval.py
index 4328dcdf..43f09d57 100644
--- a/math_eval.py
+++ b/math_eval.py
@@ -12,7 +12,7 @@
 
 from . import common
 from .common import ANSWER_PATTERN, HTML_JINJA, check_equality
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 QUERY_TEMPLATE = """
 Solve the following math problem step by step. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
diff --git a/mgsm_eval.py b/mgsm_eval.py
index 674ac964..9a503e99 100644
--- a/mgsm_eval.py
+++ b/mgsm_eval.py
@@ -10,7 +10,7 @@
 
 from . import common
 from .mmlu_eval import HTML_JINJA
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"]
 LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"]
diff --git a/mmlu_eval.py b/mmlu_eval.py
index 9423c660..6065b818 100644
--- a/mmlu_eval.py
+++ b/mmlu_eval.py
@@ -18,7 +18,7 @@
     normalize_extracted_answer,
     normalize_response,
 )
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 subject2category = {
     "abstract_algebra": "stem",
diff --git a/sampler/chat_completion_sampler.py b/sampler/chat_completion_sampler.py
index d75ce918..356aba29 100644
--- a/sampler/chat_completion_sampler.py
+++ b/sampler/chat_completion_sampler.py
@@ -5,7 +5,7 @@
 import openai
 from openai import OpenAI
 
-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase
 
 OPENAI_SYSTEM_MESSAGE_API = "You are a helpful assistant."
 OPENAI_SYSTEM_MESSAGE_CHATGPT = (
diff --git a/sampler/claude_sampler.py b/sampler/claude_sampler.py
index cf61b441..383c5d3b 100644
--- a/sampler/claude_sampler.py
+++ b/sampler/claude_sampler.py
@@ -2,7 +2,7 @@
 
 import anthropic
 
-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase
 
 CLAUDE_SYSTEM_MESSAGE_LMSYS = (
     "The assistant is Claude, created by Anthropic. The current date is "
diff --git a/sampler/o_chat_completion_sampler.py b/sampler/o_chat_completion_sampler.py
index 718d02a8..ca42ee33 100644
--- a/sampler/o_chat_completion_sampler.py
+++ b/sampler/o_chat_completion_sampler.py
@@ -4,7 +4,7 @@
 import openai
 from openai import OpenAI
 
-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase
 
 
 class OChatCompletionSampler(SamplerBase):
diff --git a/sampler/responses_sampler.py b/sampler/responses_sampler.py
index 1e49b21c..8fb41aa4 100644
--- a/sampler/responses_sampler.py
+++ b/sampler/responses_sampler.py
@@ -6,7 +6,7 @@
 import openai
 from openai import OpenAI
 
-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase
 
 
 class ResponsesSampler(SamplerBase):
diff --git a/simpleqa_eval.py b/simpleqa_eval.py
index 2a1390a0..780aaeaf 100644
--- a/simpleqa_eval.py
+++ b/simpleqa_eval.py
@@ -8,7 +8,7 @@ import re
 import pandas
 
 from . import common
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 GRADER_TEMPLATE = """
 Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
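-- 
Note below the signature separator, so git am ignores it: the collision this
patch fixes is name shadowing of the standard library's types module. When the
repository root is first on sys.path (for example, when a script in the repo
is run directly), "import types" can resolve to the local types.py, and stdlib
consumers such as typing and dataclasses may then fail with confusing
ImportErrors. A minimal sketch of the shadowing; repro.py is a hypothetical
file placed at the repo root next to the old types.py, not part of this patch:

    # repro.py -- illustrative sketch only, not part of the patch.
    import sys

    print(sys.path[0])     # the script's directory, searched before the stdlib

    import types           # expected to pick up the local types.py from here

    print(types.__file__)  # before this patch: <repo>/types.py
                           # after this patch:  .../lib/python3.x/types.py

Renaming the module to eval_types.py and updating the relative imports, as in
the hunks above, removes the collision without changing any behavior.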