From 58d24d1b65b95ed96d57805604aca7adca49861d Mon Sep 17 00:00:00 2001
From: John Yang
Date: Mon, 1 Apr 2024 12:43:59 -0400
Subject: [PATCH] Add install fail logging

---
 swebench/metrics/constants.py   | 29 +++++++++++++++++
 swebench/metrics/conversion.py  | 13 +++++---
 swebench/metrics/getters.py     | 30 +++++++-----------
 swebench/metrics/log_parsers.py |  9 +-----
 swebench/metrics/metrics.py     | 16 +++-------
 swebench/metrics/monitor.py     | 15 ++++++---
 swebench/metrics/report.py      | 55 ++++++++++++++++++++-------------
 7 files changed, 99 insertions(+), 68 deletions(-)
 create mode 100644 swebench/metrics/constants.py

diff --git a/swebench/metrics/constants.py b/swebench/metrics/constants.py
new file mode 100644
index 00000000..654435b0
--- /dev/null
+++ b/swebench/metrics/constants.py
@@ -0,0 +1,29 @@
+from enum import Enum
+
+# Evaluation Log Constants
+APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed"
+APPLY_PATCH_PASS = ">>>>> Applied Patch"
+INSTALL_FAIL = ">>>>> Init Failed"
+INSTALL_PASS = ">>>>> Init Succeeded"
+RESET_FAILED = ">>>>> Reset Failed"
+TESTS_ERROR = ">>>>> Tests Errored"
+TESTS_TIMEOUT = ">>>>> Tests Timed Out"
+
+# Result Categories
+FAIL_TO_PASS = "FAIL_TO_PASS"
+FAIL_TO_FAIL = "FAIL_TO_FAIL"
+PASS_TO_PASS = "PASS_TO_PASS"
+PASS_TO_FAIL = "PASS_TO_FAIL"
+
+# Test Status Enum
+class TestStatus(Enum):
+    FAILED = "FAILED"
+    PASSED = "PASSED"
+    SKIPPED = "SKIPPED"
+    ERROR = "ERROR"
+
+# Resolved Status Enum
+class ResolvedStatus(Enum):
+    NO = "RESOLVED_NO"
+    PARTIAL = "RESOLVED_PARTIAL"
+    FULL = "RESOLVED_FULL"
diff --git a/swebench/metrics/conversion.py b/swebench/metrics/conversion.py
index d546dff5..0d158039 100644
--- a/swebench/metrics/conversion.py
+++ b/swebench/metrics/conversion.py
@@ -1,17 +1,20 @@
 import json, os
 
-from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus
-from swebench.metrics.getters import (
-    get_file_name_from_lp,
-    get_repo_from_lp,
-    log_path_to_sms,
+from swebench.metrics.constants import (
     FAIL_TO_PASS,
     FAIL_TO_FAIL,
     PASS_TO_PASS,
     PASS_TO_FAIL,
+    TestStatus,
+)
+from swebench.metrics.getters import (
+    get_file_name_from_lp,
+    get_repo_from_lp,
+    log_path_to_sms,
     test_failed,
     test_passed,
 )
+from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER
 
 
 def convert_log_to_ground_truth(
diff --git a/swebench/metrics/getters.py b/swebench/metrics/getters.py
index b980475a..2c3ab3b6 100644
--- a/swebench/metrics/getters.py
+++ b/swebench/metrics/getters.py
@@ -1,22 +1,14 @@
 import re
 
+from swebench.metrics.constants import (
+    APPLY_PATCH_FAIL,
+    APPLY_PATCH_PASS,
+    RESET_FAILED,
+    TESTS_ERROR,
+    TESTS_TIMEOUT,
+)
 from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus
-
-
-# Evaluation Log Constants
-APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed"
-APPLY_PATCH_PASS = ">>>>> Applied Patch"
-INSTALL_FAIL = ">>>>> Init Failed"
-INSTALL_PASS = ">>>>> Init Succeeded"
-RESET_FAILED = ">>>>> Reset Failed"
-TESTS_TIMEOUT = ">>>>> Tests Timed Out"
-TESTS_ERROR = ">>>>> Tests Errored"
-
-# Result Categories
-FAIL_TO_PASS = "FAIL_TO_PASS"
-FAIL_TO_FAIL = "FAIL_TO_FAIL"
-PASS_TO_PASS = "PASS_TO_PASS"
-PASS_TO_FAIL = "PASS_TO_FAIL"
+from typing import Tuple
 
 
 def get_diffs(sm_1: dict, sm_2: dict) -> dict:
@@ -41,7 +33,7 @@ def get_diffs(sm_1: dict, sm_2: dict) -> dict:
     return diff_map
 
 
-def get_logs_eval(log_fp: str) -> (dict, bool):
+def get_logs_eval(log_fp: str) -> Tuple[dict, bool]:
     """
     Retrieve evaluation results for a task instance from its corresponding log file
 
@@ -65,7 +57,7 @@ def get_logs_eval(log_fp: str) -> (dict, bool):
     return log_parser(content), True
 
 
-def get_logs_gold(log_fp: str) -> (str, str):
+def get_logs_gold(log_fp: str) -> Tuple[str, str]:
     """
     Retrieve pre-patch, post-patch test logs from a validation log file
 
@@ -92,7 +84,7 @@
 get_repo_from_lp = lambda x: get_id_from_lp(x).rsplit("-", 1)[0].replace("__", "/")
 
 
-def log_path_to_sms(log_fp: str, log_parser) -> (list, bool):
+def log_path_to_sms(log_fp: str, log_parser) -> Tuple[list, bool]:
     """
     Wrapper for getting log data from log_parser file
 
diff --git a/swebench/metrics/log_parsers.py b/swebench/metrics/log_parsers.py
index cdb4f3fa..13869918 100644
--- a/swebench/metrics/log_parsers.py
+++ b/swebench/metrics/log_parsers.py
@@ -1,13 +1,6 @@
 import re
 
-from enum import Enum
-
-
-class TestStatus(Enum):
-    FAILED = "FAILED"
-    PASSED = "PASSED"
-    SKIPPED = "SKIPPED"
-    ERROR = "ERROR"
+from swebench.metrics.constants import TestStatus
 
 
 def parse_log_pytest(log: str) -> dict:
diff --git a/swebench/metrics/metrics.py b/swebench/metrics/metrics.py
index cfc4ba8c..eef133a1 100644
--- a/swebench/metrics/metrics.py
+++ b/swebench/metrics/metrics.py
@@ -1,17 +1,11 @@
-from enum import Enum
 from statistics import mean
 
-from swebench.metrics.getters import (
-    FAIL_TO_FAIL, FAIL_TO_PASS,
-    PASS_TO_FAIL, PASS_TO_PASS,
+from swebench.metrics.constants import (
+    FAIL_TO_PASS,
+    PASS_TO_PASS,
+    ResolvedStatus,
 )
 
-class ResolvedStatus(Enum):
-    NO = "RESOLVED_NO"
-    PARTIAL = "RESOLVED_PARTIAL"
-    FULL = "RESOLVED_FULL"
-
-
 def compute_fail_to_pass(report: dict) -> float:
     """
     Compute fail-to-pass metric. Accepts single report as argument.
@@ -94,4 +88,4 @@ def get_resolution_status(report: dict) -> str:
     elif f2p < 1 and f2p > 0 and p2p == 1:
         return ResolvedStatus.PARTIAL.value
     else:
-        return ResolvedStatus.NO.value
\ No newline at end of file
+        return ResolvedStatus.NO.value
diff --git a/swebench/metrics/monitor.py b/swebench/metrics/monitor.py
index 3aa2bc89..dd047ca4 100644
--- a/swebench/metrics/monitor.py
+++ b/swebench/metrics/monitor.py
@@ -1,16 +1,23 @@
 import glob
 import os
 
+from swebench.metrics.constants import (
+    APPLY_PATCH_FAIL,
+    APPLY_PATCH_PASS,
+    TESTS_TIMEOUT
+)
 from swebench.metrics.getters import (
-    log_path_to_sms, get_diffs, get_repo_from_lp,
-    APPLY_PATCH_FAIL, APPLY_PATCH_PASS, TESTS_TIMEOUT
+    log_path_to_sms,
+    get_diffs,
+    get_repo_from_lp,
 )
 from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER
+from typing import Tuple
 
 
 def monitor_validation(
     path_to_logs: str, log_prefix: str = None
-) -> (list, list, list, list):
+) -> Tuple[list, list, list, list]:
     """
     Check log files generated from a `check_instances` run to see how many
     instances were successfully installed and/or tested.
@@ -79,7 +86,7 @@ def monitor_validation(
     return failed_install, corrupt_test_patch, corrupt_patch, timeout, success
 
 
-def monitor_logs_same_diff(log_dir: str, repo: str = None) -> (list, list):
+def monitor_logs_same_diff(log_dir: str, repo: str = None) -> Tuple[list, list]:
     """
     Given a log directory and repo, return a list of logs where pre-test
     and post-test logs are same/different
diff --git a/swebench/metrics/report.py b/swebench/metrics/report.py
index f8e3962a..7856b5e5 100644
--- a/swebench/metrics/report.py
+++ b/swebench/metrics/report.py
@@ -1,25 +1,32 @@
 import glob, json, os
 
 from collections import Counter
-from swebench.metrics.getters import (
-    get_file_name_from_lp,
-    get_logs_eval,
-    get_id_from_lp,
+from swebench.harness.constants import (
+    INSTALL_FAIL,
+    KEY_INSTANCE_ID,
+)
+from swebench.metrics.constants import (
     FAIL_TO_FAIL,
     FAIL_TO_PASS,
     PASS_TO_FAIL,
     PASS_TO_PASS,
+)
+from swebench.metrics.getters import (
+    get_file_name_from_lp,
+    get_logs_eval,
+    get_id_from_lp,
     test_failed,
     test_passed,
 )
-from swebench.metrics.log_parsers import TestStatus
 from swebench.metrics.metrics import (
     compute_fail_to_pass_unweighted,
     compute_fail_to_pass_weighted,
     compute_pass_to_pass_unweighted,
     compute_pass_to_pass_weighted,
     get_resolution_status,
+    ResolvedStatus,
 )
+from typing import Tuple
 
 
 ### MARK - Eval Report Generation
@@ -119,7 +126,7 @@ def get_eval_reports_for_logs(
     swe_bench_tasks: str,
     callback: callable = None,
     verbose: bool = False,
-) -> (dict, dict):
+) -> Tuple[dict, dict]:
     """
     Wrapper for getting eval report for a list of evaluation log paths.
 
@@ -135,7 +142,7 @@ def get_eval_reports_for_logs(
     reports_patch_success = {}
     reports_patch_failure = {}
     eval_refs = json.load(open(swe_bench_tasks))
-    eval_refs = {t['instance_id']: t for t in eval_refs}
+    eval_refs = {t[KEY_INSTANCE_ID]: t for t in eval_refs}
 
     for eval_log in eval_logs:
         # Remove task instances that do not satisfy callback
@@ -194,7 +201,7 @@ def get_model_eval_summary(
     eval_dir: str,
     swe_bench_tasks: str,
     repo: str = None,
-):
+) -> dict:
     """
     Generate a summary of model evaluation results.
 
@@ -213,7 +220,7 @@ def get_model_eval_summary(
     # Filter by repo if provided
     criteria_eval_sm = None
     if repo is not None:
-        criteria_pred = lambda pred: repo in pred["instance_id"]
+        criteria_pred = lambda pred: repo in pred[KEY_INSTANCE_ID]
         criteria_eval_sm = lambda eval_log: repo in eval_log
 
     preds = [x for x in preds if criteria_pred(x)]
@@ -257,7 +264,7 @@ def get_model_eval_summary(
 
 def get_model_report(
     model: str, predictions_path: str, swe_bench_tasks: str, log_dir: str
-):
+) -> dict:
     """
     Generate a report of model evaluation results from predictions, task
     instances, and evaluation logs.
@@ -271,8 +278,8 @@ def get_model_report(
         report_map (dict): map of repo to report
     """
     eval_refs = json.load(open(swe_bench_tasks))
-    eval_refs = [{key: t[key] for key in ["instance_id", "FAIL_TO_PASS", "PASS_TO_PASS"]} for t in eval_refs]
-    eval_refs = {t['instance_id']: t for t in eval_refs}
+    eval_refs = [{key: t[key] for key in [KEY_INSTANCE_ID, FAIL_TO_PASS, PASS_TO_PASS]} for t in eval_refs]
+    eval_refs = {t[KEY_INSTANCE_ID]: t for t in eval_refs}
 
     # Get predictions
     predictions = []
@@ -286,37 +293,43 @@ def get_model_report(
 
     # Iterate through predictions
     for p in predictions:
-        repo = p["instance_id"].split(".")[0].rsplit("-", 1)[0].replace("__", "/")
+        repo = p[KEY_INSTANCE_ID].split(".")[0].rsplit("-", 1)[0].replace("__", "/")
         if repo not in report_map:
             report_map[repo] = {
                 "none": [],
                 "generated": [],
                 "with_logs": [],
+                "install_fail": [],
                 "applied": [],
                 "resolved": [],
             }
 
         # Check if the model patch exists
         if p["model_patch"] == None:
-            report_map[repo]["none"].append(p['instance_id'])
+            report_map[repo]["none"].append(p[KEY_INSTANCE_ID])
             continue
-        report_map[repo]["generated"].append(p['instance_id'])
+        report_map[repo]["generated"].append(p[KEY_INSTANCE_ID])
 
         # Get log file
-        log_path = os.path.join(log_dir, f"{p['instance_id']}.{model}.eval.log")
+        log_path = os.path.join(log_dir, f"{p[KEY_INSTANCE_ID]}.{model}.eval.log")
         if not os.path.exists(log_path):
             continue
-        report_map[repo]["with_logs"].append(p['instance_id'])
+        report_map[repo]["with_logs"].append(p[KEY_INSTANCE_ID])
+
+        # Check if install succeeded
+        if INSTALL_FAIL in open(log_path).read():
+            report_map[repo]["install_fail"].append(p[KEY_INSTANCE_ID])
+            continue
 
         # Get evaluation logs
        eval_sm, found = get_logs_eval(log_path)
         if not found:
             continue
-        report_map[repo]["applied"].append(p['instance_id'])
+        report_map[repo]["applied"].append(p[KEY_INSTANCE_ID])
 
-        report = get_eval_report(eval_sm, eval_refs[p["instance_id"]])
-        if get_resolution_status(report) == "RESOLVED_FULL":
-            report_map[repo]["resolved"].append(p['instance_id'])
+        report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]])
+        if get_resolution_status(report) == ResolvedStatus.FULL.value:
+            report_map[repo]["resolved"].append(p[KEY_INSTANCE_ID])
 
     return report_map