diff --git a/browsecomp_eval.py b/browsecomp_eval.py index e246d52f..1fbaebcf 100644 --- a/browsecomp_eval.py +++ b/browsecomp_eval.py @@ -89,7 +89,7 @@ def grade_sample(self, question: str, correct_answer: str, response: str) -> str grading_response = self.grader_model(prompt_messages) match = re.search(r"correct: (yes|no)", grading_response) - return match.group(0) if match else "no" # Default to "no" if no match + return match.group(1) if match else "no" # Default to "no" if no match def __call__(self, sampler: SamplerBase) -> EvalResult: def fn(row: dict):