"""
name:
AA Omniscience

dataset:
ArtificialAnalysis/AA-Omniscience-Public

abstract:
A benchmark dataset designed to measure a model's ability both to recall factual
information accurately across domains and to correctly abstain when its knowledge
is insufficient. AA-Omniscience is characterized by its penalty for incorrect
guesses, which makes it distinct from both accuracy (the proportion of questions
answered correctly) and hallucination rate (the proportion of incorrect guesses
when the model does not know the answer), and highly relevant for users choosing
a model for their next domain-specific task.

paper:
https://arxiv.org/abs/2511.13029

languages:
english

starred:
true
"""

import re

import numpy as np
from inspect_ai.dataset import Sample
from inspect_ai.model import get_model
from inspect_ai.scorer import Metric, SampleScore, Score, Target, metric, scorer, stderr
from inspect_ai.solver import TaskState, generate

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

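# Evaluation flow: the evaluated model answers each question using
# OMNISCIENCE_ANSWER_PROMPT; an LLM judge then grades the answer with
# OMNISCIENCE_GRADER_TEMPLATE; the metrics below turn those grades into
# accuracy, hallucination rate, and the Omniscience Index.
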
OMNISCIENCE_ANSWER_PROMPT = """
You are answering questions about {domain}, and in particular {topic}.
You will be given a question, answer with JUST the answer (no explanation).
If you do not know the answer, or you need more context or tools to answer the question,
be clear about this - it is better that you say this than get the wrong answer.
""".strip()
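# {domain} and {topic} are filled from the dataset's "domain" and "topic" columns
# (see record_to_sample / aa_omniscience_prompt below).
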


OMNISCIENCE_GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "PARTIAL_ANSWER", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.

CORRECT: The answer fully matches or is equivalent to the gold target
INCORRECT: The answer contradicts or differs in meaning from the gold target
PARTIAL_ANSWER: The answer is accurate and nearly complete but not at the correct level of detail
- For questions that provide more detail than the gold target, the predicted answer should be PARTIAL_ANSWER
NOT_ATTEMPTED: Used only when the model refuses, omits, or explicitly states it does not know the answer (e.g. empty response), or needs more context or tools to answer the question

Please note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer.
  - For example, consider "How much did Company Y earn in 2024?" with gold target "100 million". The predicted answer "100.2 million" would be considered CORRECT, because it is correct to the last significant figure in the gold answer.
  - Note that this does NOT APPLY when the answer is a section, version or other numeric identifier, in this case the number must match EXACTLY
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
  - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question, this includes units of measurement.
  - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
  - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
  - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
  - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
  - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
- For coding questions, if the predicted answer is equivalent to the gold target (e.g. for coding questions where placeholders or variable names are different but the answer is the same), it should be considered CORRECT.

Here are examples across different topics demonstrating CORRECT, INCORRECT, PARTIAL_ANSWER, and NOT_ATTEMPTED grades:

Example 1 - CORRECT:

Question: What acronym denotes the IAD tool that focuses on ensembles of linked action situations?
Gold target: NAS
Predicted answer: The acronym is NAS, which stands for Network of Action Situations

This is CORRECT because the predicted answer contains all the information required by the gold target.

Example 2 - INCORRECT:

Question: In the Arkansas Supreme Court case Loveless v. Diehl (1963 decision), which county contained the 79-acre farm at issue?
Gold target: Faulkner County
Predicted answer: The 79-acre farm at issue was located in Polk County, Arkansas

This is INCORRECT because the predicted answer provides a county that contradicts the gold target.

Example 3 - INCORRECT:

Question: In which NumPy version (x.y.z) was support for passing an empty list ([]) to the domain parameter of numpy.polynomial.laguerre.Laguerre.fit first introduced?
Gold target: 1.5.0
Predicted answer: 1.5

This is INCORRECT because the answer is correct but incomplete - it only provides the major version number.

Example 4 - NOT_ATTEMPTED:

Question: Who originally proposed the "fake barns" (barn-façade) thought experiment in epistemology?
Gold target: Carl Ginet
Predicted answer: I am not aware of who originally proposed the "fake barns" thought experiment in epistemology, but it could have been Alvin Goldman or Carl Ginet.

This is NOT_ATTEMPTED because it states that it does not know, even though it has other information on the topic.

Example 5 - PARTIAL_ANSWER:

Question: In millions of dollars, how much profit did Company X make in USD in 2024?
Gold target: 28
Predicted answer: 28.1 million

This is PARTIAL_ANSWER because it does not contradict the gold target but provides more detail than is required.

Example 6 - CORRECT:

Question: In CPython 3.8 or later, which -X option value is equivalent to setting the PYTHONPYCACHEPREFIX environment variable?
Gold target: pycache_prefix=PATH
Predicted answer: pycache_prefix=DIR

This is CORRECT because although the two answers are not exactly the same, they are equivalent (the only difference is the name of the variable).

Example 7 - CORRECT:

Question: What format is used in Python f-strings for formatting decimal numbers?
Gold target: .Pf
Predicted answer: .<precision>f

This is CORRECT because although the two answers are not exactly the same, they are equivalent.

Here is a new example. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

Question: {question}
Gold target: {criterion}
Predicted answer: {answer}


Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: PARTIAL_ANSWER
D: NOT_ATTEMPTED

Just return the letters "A", "B", "C", or "D", with no text around it.
""".strip()

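# Judge grades: A = CORRECT, B = INCORRECT, C = PARTIAL_ANSWER, D = NOT_ATTEMPTED.
# The metrics below operate on these letters as recorded in each sample's Score value.
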
@metric
def hallucination_rate() -> Metric:
    """Proportion of incorrect answers among questions not answered correctly
    (incorrect or partial answers vs. abstentions).

    Returns:
        hallucination rate metric
    """

    def metric(scores: list[SampleScore]) -> float:
        incorrect = [score for score in scores if score.score.value in ["B", "C"]]
        not_attempted = [score for score in scores if score.score.value == "D"]

        if not incorrect and not not_attempted:
            return 0.0

        return len(incorrect) / (len(incorrect) + len(not_attempted))

    return metric


@metric
def accuracy() -> Metric:
    """Proportion of questions answered correctly.

    Returns:
        accuracy metric
    """

    def metric(scores: list[SampleScore]) -> float:
        correct = [score for score in scores if score.score.value == "A"]

        return len(correct) / len(scores)

    return metric


@metric
def omniscience_index() -> Metric:
    """Mean per-question score: +1 for correct, -1 for incorrect or partial
    answers, and 0 for abstentions.

    Returns:
        omniscience index metric
    """

    def metric(scores: list[SampleScore]) -> float:
        correct = [1 for score in scores if score.score.value == "A"]
        incorrect = [-1 for score in scores if score.score.value == "B"]
        partial = [-1 for score in scores if score.score.value == "C"]
        not_attempted = [0 for score in scores if score.score.value == "D"]

        return np.mean(correct + incorrect + partial + not_attempted).item()

    return metric

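# Worked example with hypothetical grades for four samples, ["A", "B", "C", "D"]:
# contributions are [+1, -1, -1, 0], so omniscience_index = -0.25, accuracy = 0.25,
# and hallucination_rate = 2 / (2 + 1) ≈ 0.67 (two wrong guesses, one abstention).
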
@scorer(metrics=[omniscience_index(), accuracy(), hallucination_rate(), stderr()])
def aa_omniscience_scorer():
    async def score(state: TaskState, target: Target):
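        # LLM judge used to grade the model's answer; any model identifier
        # supported by inspect_ai's get_model could be substituted here.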
        grader_model = get_model("hf-inference-providers/openai/gpt-oss-20b")
        answer = state.output.completion
        gold = target.text

        score_prompt = OMNISCIENCE_GRADER_TEMPLATE.format(
            question=state.input_text,
            answer=answer,
            criterion=gold,
        )

        result = await grader_model.generate(score_prompt)

        # extract the grade (the judge is asked to reply with a single letter)
        match = re.search(r"^[A-D]$", result.completion.strip())

        if not match:
            return Score(
                value="D", answer=answer, explanation=f"Grade not found in model output: {result.completion}"
            )

        return Score(value=match.group(0), answer=answer, explanation=result.completion)

    return score


def record_to_sample(record):
    query = (
        OMNISCIENCE_ANSWER_PROMPT.format(domain=record["domain"], topic=record["topic"]) + "\n\n" + record["question"]
    )
    target = record["answer"]
    return Sample(input=query, target=target, metadata={"domain": record["domain"], "topic": record["topic"]})


def aa_omniscience_prompt(record):
    query = OMNISCIENCE_ANSWER_PROMPT.format(domain=record["domain"], topic=record["topic"])
    query += "\n\n" + record["question"]

    return Doc(
        task_name="aa_omniscience",
        query=query,
        choices=[record["answer"]],
        gold_index=0,
    )
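
# The task config below wires both evaluation paths: Metrics.exact_match over the
# Doc prompt for lighteval's native pipeline, and the LLM-judge scorer over
# inspect-ai Samples (record_to_sample + aa_omniscience_scorer).
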

aa_omniscience = LightevalTaskConfig(
    name="aa_omniscience",
    prompt_function=aa_omniscience_prompt,
    hf_repo="ArtificialAnalysis/AA-Omniscience-Public",
    hf_subset="default",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=100,
    metrics=[Metrics.exact_match],
    stop_sequence=None,
    version=0,
    sample_fields=record_to_sample,
    solver=[generate(cache=True)],
    scorer=aa_omniscience_scorer(),
)
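
# Usage sketch (hypothetical invocation; the exact lighteval CLI flags and the
# suite prefix for custom tasks vary by version, so treat this as an assumption):
#   lighteval accelerate "model_name=<your-model>" "custom|aa_omniscience|0|0" \
#       --custom-tasks path/to/this_file.py
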

TASKS_TABLE = [aa_omniscience]