2 changes: 1 addition & 1 deletion browsecomp_eval.py
@@ -10,7 +10,7 @@
import re
import pandas
from . import common
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult

# from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_model_predictions.py#L11
QUERY_TEMPLATE = """
2 changes: 1 addition & 1 deletion common.py
@@ -9,7 +9,7 @@
import requests
from tqdm import tqdm

-from .types import EvalResult, Message, SamplerBase, SingleEvalResult
+from .eval_types import EvalResult, Message, SamplerBase, SingleEvalResult

QUERY_TEMPLATE_MULTICHOICE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
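The template's contract, a final line of the form 'Answer: $LETTER', is what the multiple-choice extraction in common.py keys on. As a hedged illustration only (ANSWER_PATTERN_MULTICHOICE is imported elsewhere in this diff, but the regex body and usage below are assumptions, not copied from the repo):

import re

# Assumed shape of the extraction pattern defined in common.py; illustrative only.
ANSWER_PATTERN_MULTICHOICE = r"(?i)Answer\s*:\s*([ABCD])"

response = "Thinking step by step...\nAnswer: C"
match = re.search(ANSWER_PATTERN_MULTICHOICE, response)
extracted_answer = match.group(1) if match else None  # "C"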
2 changes: 1 addition & 1 deletion drop_eval.py
@@ -16,7 +16,7 @@

from . import common
from .common import ANSWER_PATTERN, HTML_JINJA
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult

"""
From here through _normalize_answer was originally copied from:
types.py → eval_types.py
File renamed without changes.
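Every import change in this diff points at the renamed module. A minimal sketch of what eval_types.py plausibly exposes, assuming the rename kept the original definitions; the class names come from the imports above, but the fields are assumptions, not shown in this diff:

from dataclasses import dataclass, field
from typing import Any

Message = dict[str, Any]        # e.g. {"role": "user", "content": "..."}
MessageList = list[Message]

class SamplerBase:
    # A model wrapper: takes a conversation, returns the model's text response.
    def __call__(self, message_list: MessageList) -> str:
        raise NotImplementedError

@dataclass
class SingleEvalResult:
    # Result for one evaluated example.
    score: float | None
    metrics: dict[str, float] = field(default_factory=dict)
    html: str | None = None
    convo: MessageList | None = None

@dataclass
class EvalResult:
    # Aggregated result over a whole eval run.
    score: float | None
    metrics: dict[str, float] | None
    htmls: list[str]
    convos: list[MessageList]

class Eval:
    # Base class that each *_eval.py module subclasses.
    def __call__(self, sampler: SamplerBase) -> EvalResult:
        raise NotImplementedError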
2 changes: 1 addition & 1 deletion gpqa_eval.py
@@ -11,7 +11,7 @@

from . import common
from .common import ANSWER_PATTERN_MULTICHOICE, HTML_JINJA, format_multichoice_question
-from .types import Eval, EvalResult, MessageList, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, MessageList, SamplerBase, SingleEvalResult


class GPQAEval(Eval):
2 changes: 1 addition & 1 deletion humaneval_eval.py
@@ -20,7 +20,7 @@

from . import common
from .common import HTML_JINJA
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult


def evaluate_functional_correctness(
2 changes: 1 addition & 1 deletion math_eval.py
@@ -12,7 +12,7 @@

from . import common
from .common import ANSWER_PATTERN, HTML_JINJA, check_equality
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult

QUERY_TEMPLATE = """
Solve the following math problem step by step. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
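Here the final line is free-form ('Answer: $ANSWER') rather than a letter, so after extraction the answer is compared with the equality checker imported above instead of string matching. A hedged sketch of that flow; the regex body and the check_equality call shape are assumptions, not copied from the repo:

import re

ANSWER_PATTERN = r"(?i)Answer\s*:\s*(.+)$"   # assumed shape of the extraction regex

response_text = "Completing the square gives...\nAnswer: 3/4"
match = re.search(ANSWER_PATTERN, response_text, re.MULTILINE)
extracted_answer = match.group(1) if match else None

# check_equality (from common.py) is assumed to ask a grader model whether two
# expressions are mathematically equivalent; the call below is illustrative only.
# score = float(check_equality(equality_checker, "0.75", extracted_answer))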
2 changes: 1 addition & 1 deletion mgsm_eval.py
@@ -10,7 +10,7 @@

from . import common
from .mmlu_eval import HTML_JINJA
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult

ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"]
LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"]
2 changes: 1 addition & 1 deletion mmlu_eval.py
@@ -18,7 +18,7 @@
    normalize_extracted_answer,
    normalize_response,
)
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult

subject2category = {
    "abstract_algebra": "stem",
2 changes: 1 addition & 1 deletion sampler/chat_completion_sampler.py
@@ -5,7 +5,7 @@
import openai
from openai import OpenAI

-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase

OPENAI_SYSTEM_MESSAGE_API = "You are a helpful assistant."
OPENAI_SYSTEM_MESSAGE_CHATGPT = (
2 changes: 1 addition & 1 deletion sampler/claude_sampler.py
@@ -2,7 +2,7 @@

import anthropic

-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase

CLAUDE_SYSTEM_MESSAGE_LMSYS = (
    "The assistant is Claude, created by Anthropic. The current date is "
2 changes: 1 addition & 1 deletion sampler/o_chat_completion_sampler.py
@@ -4,7 +4,7 @@
import openai
from openai import OpenAI

-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase


class OChatCompletionSampler(SamplerBase):
2 changes: 1 addition & 1 deletion sampler/responses_sampler.py
@@ -6,7 +6,7 @@
import openai
from openai import OpenAI

-from ..types import MessageList, SamplerBase
+from ..eval_types import MessageList, SamplerBase


class ResponsesSampler(SamplerBase):
2 changes: 1 addition & 1 deletion simpleqa_eval.py
@@ -8,7 +8,7 @@
import re
import pandas
from . import common
-from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
+from .eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult

GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
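The grader template is truncated here. As a hedged sketch only, the three labels it defines could be mapped to a score along these lines; the mapping and the helper name score_from_grade are assumptions, not part of the diff:

# Assumed mapping from the grader's label to a numeric score; NOT_ATTEMPTED is
# typically tracked as its own rate rather than counted as a plain miss.
def score_from_grade(grader_response: str) -> float | None:
    grade = grader_response.strip().upper()
    if grade == "CORRECT":
        return 1.0
    if grade == "INCORRECT":
        return 0.0
    if grade == "NOT_ATTEMPTED":
        return None   # excluded from accuracy, reported separately
    return None       # unparseable grader output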