From d2b85ea1fdc89eb07cafdeec5b1061086615796c Mon Sep 17 00:00:00 2001 From: John Yang Date: Thu, 28 Mar 2024 12:38:22 -0400 Subject: [PATCH 1/6] Ported over necessary pypi setup files + reorganized directory --- .gitignore | 8 ++- LICENSE.md => LICENSE | 0 build_deploy.sh | 6 ++ pyproject.toml | 3 + setup.cfg | 3 + setup.py | 40 +++++++++++++ swebench/__init__.py | 60 +++++++++++++++++++ {collect => swebench/collect}/README.md | 0 .../collect}/build_dataset.py | 0 .../collect}/build_dataset_ft.py | 0 .../collect}/cleanup/delete_gh_workflows.py | 0 .../collect}/cleanup/remove_envs.py | 0 .../collect}/get_tasks_pipeline.py | 0 {collect => swebench/collect}/get_top_pypi.py | 0 .../collect}/make_lite/README.md | 0 .../collect}/make_lite/criteria.py | 0 .../collect}/make_lite/make_lite.py | 0 .../collect}/make_repo/call_make_repo.py | 0 .../collect}/make_repo/make_repo.sh | 0 {collect => swebench/collect}/print_pulls.py | 0 .../collect}/run_build_dataset_ft.sh | 0 .../collect}/run_get_tasks_pipeline.sh | 0 {collect => swebench/collect}/utils.py | 0 {harness => swebench/harness}/README.md | 0 {harness => swebench/harness}/constants.py | 0 .../harness}/context_manager.py | 0 .../harness}/engine_evaluation.py | 0 .../harness}/engine_validation.py | 0 .../harness}/run_evaluation.py | 0 .../harness}/run_evaluation.sh | 0 .../harness}/run_validation.sh | 0 {harness => swebench/harness}/utils.py | 0 {metrics => swebench/metrics}/README.md | 0 {metrics => swebench/metrics}/conversion.py | 0 {metrics => swebench/metrics}/getters.py | 0 {metrics => swebench/metrics}/log_parsers.py | 0 {metrics => swebench/metrics}/metrics.py | 0 {metrics => swebench/metrics}/monitor.py | 0 {metrics => swebench/metrics}/report.py | 0 {versioning => swebench/versioning}/README.md | 0 .../versioning}/constants.py | 0 .../extract_web/get_versions_astropy.py | 0 .../extract_web/get_versions_matplotlib.py | 0 .../extract_web/get_versions_pvlib-python.py | 0 .../extract_web/get_versions_pydicom.py | 0 .../extract_web/get_versions_sqlfluff.py | 0 .../extract_web/get_versions_xarray.py | 0 .../versioning}/get_versions.py | 0 .../versioning}/run_get_versions.sh | 0 {versioning => swebench/versioning}/utils.py | 0 50 files changed, 117 insertions(+), 3 deletions(-) rename LICENSE.md => LICENSE (100%) create mode 100755 build_deploy.sh create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 swebench/__init__.py rename {collect => swebench/collect}/README.md (100%) rename {collect => swebench/collect}/build_dataset.py (100%) rename {collect => swebench/collect}/build_dataset_ft.py (100%) rename {collect => swebench/collect}/cleanup/delete_gh_workflows.py (100%) rename {collect => swebench/collect}/cleanup/remove_envs.py (100%) rename {collect => swebench/collect}/get_tasks_pipeline.py (100%) rename {collect => swebench/collect}/get_top_pypi.py (100%) rename {collect => swebench/collect}/make_lite/README.md (100%) rename {collect => swebench/collect}/make_lite/criteria.py (100%) rename {collect => swebench/collect}/make_lite/make_lite.py (100%) rename {collect => swebench/collect}/make_repo/call_make_repo.py (100%) rename {collect => swebench/collect}/make_repo/make_repo.sh (100%) rename {collect => swebench/collect}/print_pulls.py (100%) rename {collect => swebench/collect}/run_build_dataset_ft.sh (100%) rename {collect => swebench/collect}/run_get_tasks_pipeline.sh (100%) rename {collect => swebench/collect}/utils.py (100%) rename {harness => swebench/harness}/README.md (100%) rename {harness => swebench/harness}/constants.py (100%) rename {harness => swebench/harness}/context_manager.py (100%) rename {harness => swebench/harness}/engine_evaluation.py (100%) rename {harness => swebench/harness}/engine_validation.py (100%) rename {harness => swebench/harness}/run_evaluation.py (100%) rename {harness => swebench/harness}/run_evaluation.sh (100%) rename {harness => swebench/harness}/run_validation.sh (100%) rename {harness => swebench/harness}/utils.py (100%) rename {metrics => swebench/metrics}/README.md (100%) rename {metrics => swebench/metrics}/conversion.py (100%) rename {metrics => swebench/metrics}/getters.py (100%) rename {metrics => swebench/metrics}/log_parsers.py (100%) rename {metrics => swebench/metrics}/metrics.py (100%) rename {metrics => swebench/metrics}/monitor.py (100%) rename {metrics => swebench/metrics}/report.py (100%) rename {versioning => swebench/versioning}/README.md (100%) rename {versioning => swebench/versioning}/constants.py (100%) rename {versioning => swebench/versioning}/extract_web/get_versions_astropy.py (100%) rename {versioning => swebench/versioning}/extract_web/get_versions_matplotlib.py (100%) rename {versioning => swebench/versioning}/extract_web/get_versions_pvlib-python.py (100%) rename {versioning => swebench/versioning}/extract_web/get_versions_pydicom.py (100%) rename {versioning => swebench/versioning}/extract_web/get_versions_sqlfluff.py (100%) rename {versioning => swebench/versioning}/extract_web/get_versions_xarray.py (100%) rename {versioning => swebench/versioning}/get_versions.py (100%) rename {versioning => swebench/versioning}/run_get_versions.sh (100%) rename {versioning => swebench/versioning}/utils.py (100%) diff --git a/.gitignore b/.gitignore index a3d7955e..5fd289a5 100644 --- a/.gitignore +++ b/.gitignore @@ -160,15 +160,17 @@ cython_debug/ #.idea/ # Custom -notebooks/ -*.patch -data/repos/copies +.api_key .keys .vscode/ *.jsonl *.jsonl.* +*.patch +*.DS_Store analysis/**/*.json analysis/**/scratch* analysis/benchmark/plots/ analysis/evaluation/*.csv analysis/evaluation/*.pdf +data/repos/copies +notebooks/ diff --git a/LICENSE.md b/LICENSE similarity index 100% rename from LICENSE.md rename to LICENSE diff --git a/build_deploy.sh b/build_deploy.sh new file mode 100755 index 00000000..5fe7943b --- /dev/null +++ b/build_deploy.sh @@ -0,0 +1,6 @@ +# !bin/bash + +python3 -m build + +python3 -m twine upload --skip-existing --repository pypi dist/* +# python3 -m twine upload --skip-existing --repository testpypi dist/* \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..fcff5099 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ['setuptools>=42'] +build-backend = 'setuptools.build_meta' \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..1198e3fe --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +[metadata] +version = attr: swebench.__version__ +license_files = LICENSE \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..73f771b8 --- /dev/null +++ b/setup.py @@ -0,0 +1,40 @@ +import setuptools + +with open('README.md', 'r', encoding='utf-8') as fh: + long_description = fh.read() + +setuptools.setup( + name='swebench', + author='John Yang', + author_email='byjohnyang@gmail.com', + description='The official SWE-bench package - a benchmark for evaluating LMs on software engineering', + keywords='nlp, benchmark, code', + long_description=long_description, + long_description_content_type='text/markdown', + url='https://swebench.com', + project_urls={ + 'Documentation': 'https://github.com/princeton-nlp/SWE-bench', + 'Bug Reports': 'http://github.com/princeton-nlp/SWE-bench/issues', + 'Source Code': 'http://github.com/princeton-nlp/SWE-bench', + 'Website': 'https://swebench.com', + }, + packages=setuptools.find_packages(), + classifiers=[ + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3 :: Only', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + ], + python_requires='>=3.8', + install_requires=[ + 'beautifulsoup4', + 'chardet', + 'ghapi', + 'GitPython', + 'python-dotenv', + 'requests', + 'rich', + ], + include_package_data=True, +) \ No newline at end of file diff --git a/swebench/__init__.py b/swebench/__init__.py new file mode 100644 index 00000000..095ebbee --- /dev/null +++ b/swebench/__init__.py @@ -0,0 +1,60 @@ +__version__ = "0.6.8.1" + +from swebench.collect.build_dataset import main as build_dataset +from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline +from swebench.collect.print_pulls import main as print_pulls + +from swebench.harness.constants import ( + KEY_INSTANCE_ID, + KEY_MODEL, + KEY_PREDICTION, + MAP_REPO_TO_TEST_FRAMEWORK, + MAP_VERSION_TO_INSTALL, +) + +from swebench.harness.run_evaluation import ( + main as run_evaluation, +) + +from swebench.harness.utils import ( + get_environment_yml, + get_instances, + get_requirements, +) + +from swebench.metrics.conversion import ( + convert_log_to_ground_truth +) + +from swebench.metrics.getters import ( + get_diffs, + get_logs_eval, + get_logs_gold, +) + +from swebench.metrics.log_parsers import ( + MAP_REPO_TO_PARSER, +) + +from swebench.metrics.metrics import ( + compute_fail_to_pass, + compute_fail_to_pass_unweighted, + compute_fail_to_pass_weighted, + compute_pass_to_pass, + compute_pass_to_pass_unweighted, + compute_pass_to_pass_weighted, + get_resolution_status, +) + +from swebench.metrics.monitor import ( + monitor_validation, + monitor_logs_same_diff, +) + +from swebench.metrics.report import ( + get_eval_report, + get_eval_reports_for_logs, + get_eval_reports_for_dir, + get_model_eval_summary, + get_model_report, +) \ No newline at end of file diff --git a/collect/README.md b/swebench/collect/README.md similarity index 100% rename from collect/README.md rename to swebench/collect/README.md diff --git a/collect/build_dataset.py b/swebench/collect/build_dataset.py similarity index 100% rename from collect/build_dataset.py rename to swebench/collect/build_dataset.py diff --git a/collect/build_dataset_ft.py b/swebench/collect/build_dataset_ft.py similarity index 100% rename from collect/build_dataset_ft.py rename to swebench/collect/build_dataset_ft.py diff --git a/collect/cleanup/delete_gh_workflows.py b/swebench/collect/cleanup/delete_gh_workflows.py similarity index 100% rename from collect/cleanup/delete_gh_workflows.py rename to swebench/collect/cleanup/delete_gh_workflows.py diff --git a/collect/cleanup/remove_envs.py b/swebench/collect/cleanup/remove_envs.py similarity index 100% rename from collect/cleanup/remove_envs.py rename to swebench/collect/cleanup/remove_envs.py diff --git a/collect/get_tasks_pipeline.py b/swebench/collect/get_tasks_pipeline.py similarity index 100% rename from collect/get_tasks_pipeline.py rename to swebench/collect/get_tasks_pipeline.py diff --git a/collect/get_top_pypi.py b/swebench/collect/get_top_pypi.py similarity index 100% rename from collect/get_top_pypi.py rename to swebench/collect/get_top_pypi.py diff --git a/collect/make_lite/README.md b/swebench/collect/make_lite/README.md similarity index 100% rename from collect/make_lite/README.md rename to swebench/collect/make_lite/README.md diff --git a/collect/make_lite/criteria.py b/swebench/collect/make_lite/criteria.py similarity index 100% rename from collect/make_lite/criteria.py rename to swebench/collect/make_lite/criteria.py diff --git a/collect/make_lite/make_lite.py b/swebench/collect/make_lite/make_lite.py similarity index 100% rename from collect/make_lite/make_lite.py rename to swebench/collect/make_lite/make_lite.py diff --git a/collect/make_repo/call_make_repo.py b/swebench/collect/make_repo/call_make_repo.py similarity index 100% rename from collect/make_repo/call_make_repo.py rename to swebench/collect/make_repo/call_make_repo.py diff --git a/collect/make_repo/make_repo.sh b/swebench/collect/make_repo/make_repo.sh similarity index 100% rename from collect/make_repo/make_repo.sh rename to swebench/collect/make_repo/make_repo.sh diff --git a/collect/print_pulls.py b/swebench/collect/print_pulls.py similarity index 100% rename from collect/print_pulls.py rename to swebench/collect/print_pulls.py diff --git a/collect/run_build_dataset_ft.sh b/swebench/collect/run_build_dataset_ft.sh similarity index 100% rename from collect/run_build_dataset_ft.sh rename to swebench/collect/run_build_dataset_ft.sh diff --git a/collect/run_get_tasks_pipeline.sh b/swebench/collect/run_get_tasks_pipeline.sh similarity index 100% rename from collect/run_get_tasks_pipeline.sh rename to swebench/collect/run_get_tasks_pipeline.sh diff --git a/collect/utils.py b/swebench/collect/utils.py similarity index 100% rename from collect/utils.py rename to swebench/collect/utils.py diff --git a/harness/README.md b/swebench/harness/README.md similarity index 100% rename from harness/README.md rename to swebench/harness/README.md diff --git a/harness/constants.py b/swebench/harness/constants.py similarity index 100% rename from harness/constants.py rename to swebench/harness/constants.py diff --git a/harness/context_manager.py b/swebench/harness/context_manager.py similarity index 100% rename from harness/context_manager.py rename to swebench/harness/context_manager.py diff --git a/harness/engine_evaluation.py b/swebench/harness/engine_evaluation.py similarity index 100% rename from harness/engine_evaluation.py rename to swebench/harness/engine_evaluation.py diff --git a/harness/engine_validation.py b/swebench/harness/engine_validation.py similarity index 100% rename from harness/engine_validation.py rename to swebench/harness/engine_validation.py diff --git a/harness/run_evaluation.py b/swebench/harness/run_evaluation.py similarity index 100% rename from harness/run_evaluation.py rename to swebench/harness/run_evaluation.py diff --git a/harness/run_evaluation.sh b/swebench/harness/run_evaluation.sh similarity index 100% rename from harness/run_evaluation.sh rename to swebench/harness/run_evaluation.sh diff --git a/harness/run_validation.sh b/swebench/harness/run_validation.sh similarity index 100% rename from harness/run_validation.sh rename to swebench/harness/run_validation.sh diff --git a/harness/utils.py b/swebench/harness/utils.py similarity index 100% rename from harness/utils.py rename to swebench/harness/utils.py diff --git a/metrics/README.md b/swebench/metrics/README.md similarity index 100% rename from metrics/README.md rename to swebench/metrics/README.md diff --git a/metrics/conversion.py b/swebench/metrics/conversion.py similarity index 100% rename from metrics/conversion.py rename to swebench/metrics/conversion.py diff --git a/metrics/getters.py b/swebench/metrics/getters.py similarity index 100% rename from metrics/getters.py rename to swebench/metrics/getters.py diff --git a/metrics/log_parsers.py b/swebench/metrics/log_parsers.py similarity index 100% rename from metrics/log_parsers.py rename to swebench/metrics/log_parsers.py diff --git a/metrics/metrics.py b/swebench/metrics/metrics.py similarity index 100% rename from metrics/metrics.py rename to swebench/metrics/metrics.py diff --git a/metrics/monitor.py b/swebench/metrics/monitor.py similarity index 100% rename from metrics/monitor.py rename to swebench/metrics/monitor.py diff --git a/metrics/report.py b/swebench/metrics/report.py similarity index 100% rename from metrics/report.py rename to swebench/metrics/report.py diff --git a/versioning/README.md b/swebench/versioning/README.md similarity index 100% rename from versioning/README.md rename to swebench/versioning/README.md diff --git a/versioning/constants.py b/swebench/versioning/constants.py similarity index 100% rename from versioning/constants.py rename to swebench/versioning/constants.py diff --git a/versioning/extract_web/get_versions_astropy.py b/swebench/versioning/extract_web/get_versions_astropy.py similarity index 100% rename from versioning/extract_web/get_versions_astropy.py rename to swebench/versioning/extract_web/get_versions_astropy.py diff --git a/versioning/extract_web/get_versions_matplotlib.py b/swebench/versioning/extract_web/get_versions_matplotlib.py similarity index 100% rename from versioning/extract_web/get_versions_matplotlib.py rename to swebench/versioning/extract_web/get_versions_matplotlib.py diff --git a/versioning/extract_web/get_versions_pvlib-python.py b/swebench/versioning/extract_web/get_versions_pvlib-python.py similarity index 100% rename from versioning/extract_web/get_versions_pvlib-python.py rename to swebench/versioning/extract_web/get_versions_pvlib-python.py diff --git a/versioning/extract_web/get_versions_pydicom.py b/swebench/versioning/extract_web/get_versions_pydicom.py similarity index 100% rename from versioning/extract_web/get_versions_pydicom.py rename to swebench/versioning/extract_web/get_versions_pydicom.py diff --git a/versioning/extract_web/get_versions_sqlfluff.py b/swebench/versioning/extract_web/get_versions_sqlfluff.py similarity index 100% rename from versioning/extract_web/get_versions_sqlfluff.py rename to swebench/versioning/extract_web/get_versions_sqlfluff.py diff --git a/versioning/extract_web/get_versions_xarray.py b/swebench/versioning/extract_web/get_versions_xarray.py similarity index 100% rename from versioning/extract_web/get_versions_xarray.py rename to swebench/versioning/extract_web/get_versions_xarray.py diff --git a/versioning/get_versions.py b/swebench/versioning/get_versions.py similarity index 100% rename from versioning/get_versions.py rename to swebench/versioning/get_versions.py diff --git a/versioning/run_get_versions.sh b/swebench/versioning/run_get_versions.sh similarity index 100% rename from versioning/run_get_versions.sh rename to swebench/versioning/run_get_versions.sh diff --git a/versioning/utils.py b/swebench/versioning/utils.py similarity index 100% rename from versioning/utils.py rename to swebench/versioning/utils.py From 11349c09dceebd2c0124e45f6f36b69884ece1ce Mon Sep 17 00:00:00 2001 From: John Yang Date: Thu, 28 Mar 2024 12:49:35 -0400 Subject: [PATCH 2/6] Update all swebench package paths --- swebench/__init__.py | 2 +- swebench/collect/build_dataset.py | 3 +-- swebench/collect/get_tasks_pipeline.py | 4 ++-- swebench/collect/print_pulls.py | 2 +- swebench/harness/context_manager.py | 8 ++++---- swebench/harness/engine_evaluation.py | 12 ++++++------ swebench/harness/engine_validation.py | 4 ++-- swebench/harness/run_evaluation.py | 10 +++++----- swebench/harness/utils.py | 8 ++++---- swebench/metrics/conversion.py | 4 ++-- swebench/metrics/getters.py | 2 +- swebench/metrics/metrics.py | 5 ++++- swebench/metrics/monitor.py | 5 ++--- swebench/metrics/report.py | 6 +++--- 14 files changed, 38 insertions(+), 37 deletions(-) diff --git a/swebench/__init__.py b/swebench/__init__.py index 095ebbee..bff3814d 100644 --- a/swebench/__init__.py +++ b/swebench/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.8.1" +__version__ = "0.6.8.2" from swebench.collect.build_dataset import main as build_dataset from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline diff --git a/swebench/collect/build_dataset.py b/swebench/collect/build_dataset.py index c6e4ac6a..17d78fc4 100755 --- a/swebench/collect/build_dataset.py +++ b/swebench/collect/build_dataset.py @@ -6,8 +6,7 @@ import os from typing import Optional - -from utils import Repo, extract_patches, extract_problem_statement_and_hints +from swebench.collect.utils import Repo, extract_patches, extract_problem_statement_and_hints logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/swebench/collect/get_tasks_pipeline.py b/swebench/collect/get_tasks_pipeline.py index 2f0fb961..0de0f5c0 100755 --- a/swebench/collect/get_tasks_pipeline.py +++ b/swebench/collect/get_tasks_pipeline.py @@ -6,9 +6,9 @@ import traceback from dotenv import load_dotenv -from build_dataset import main as build_dataset -from print_pulls import main as print_pulls from multiprocessing import Pool +from swebench.collect.build_dataset import main as build_dataset +from swebench.collect.print_pulls import main as print_pulls load_dotenv() diff --git a/swebench/collect/print_pulls.py b/swebench/collect/print_pulls.py index ddd5c8d3..3444936c 100755 --- a/swebench/collect/print_pulls.py +++ b/swebench/collect/print_pulls.py @@ -9,7 +9,7 @@ from typing import Optional from fastcore.xtras import obj2dict -from utils import Repo +from swebench.collect.utils import Repo logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/swebench/harness/context_manager.py b/swebench/harness/context_manager.py index ae5e804f..596bf7f8 100644 --- a/swebench/harness/context_manager.py +++ b/swebench/harness/context_manager.py @@ -1,6 +1,6 @@ import logging, os, platform, subprocess -from constants import ( +from swebench.harness.constants import ( APPLY_PATCH_FAIL, APPLY_PATCH_PASS, INSTALL_FAIL, @@ -17,15 +17,15 @@ TESTS_TIMEOUT, TESTS_ERROR, ) -from tempfile import TemporaryDirectory -from traceback import format_exc -from utils import ( +from swebench.harness.utils import ( clone_repo, get_conda_env_names, get_environment_yml, get_requirements, get_test_directives, ) +from tempfile import TemporaryDirectory +from traceback import format_exc logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" diff --git a/swebench/harness/engine_evaluation.py b/swebench/harness/engine_evaluation.py index d6d085b9..db68d8c8 100644 --- a/swebench/harness/engine_evaluation.py +++ b/swebench/harness/engine_evaluation.py @@ -1,21 +1,21 @@ import argparse, os, re -from constants import ( +from multiprocessing import Pool, cpu_count +from swebench.harness.constants import ( APPLY_PATCH_FAIL, KEY_INSTANCE_ID, KEY_MODEL, KEY_PREDICTION, ) -from context_manager import TaskEnvContextManager -from engine_validation import setup_testbed -from multiprocessing import Pool, cpu_count -from tqdm.auto import tqdm -from utils import ( +from swebench.harness.context_manager import TaskEnvContextManager +from swebench.harness.engine_validation import setup_testbed +from swebench.harness.utils import ( extract_minimal_patch, get_instances, split_instances, DotDict ) +from tqdm.auto import tqdm def overwrite_ablation(tcm: TaskEnvContextManager, task_instance: dict): diff --git a/swebench/harness/engine_validation.py b/swebench/harness/engine_validation.py index 3ed1028b..b9864594 100644 --- a/swebench/harness/engine_validation.py +++ b/swebench/harness/engine_validation.py @@ -1,8 +1,8 @@ import argparse, os -from context_manager import TaskEnvContextManager, TestbedContextManager from multiprocessing import Pool, cpu_count -from utils import get_instances, split_instances, DotDict +from swebench.harness.context_manager import TaskEnvContextManager, TestbedContextManager +from swebench.harness.utils import get_instances, split_instances, DotDict SKIP_INSTANCES = {"pytest-dev/pytest": ["6387", "7956", "3805"]} diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py index 29a0387f..c6ec100d 100755 --- a/swebench/harness/run_evaluation.py +++ b/swebench/harness/run_evaluation.py @@ -9,15 +9,15 @@ import os import shutil -from constants import ( +from datasets import load_dataset +from multiprocessing import Pool +from swebench.harness.constants import ( KEY_INSTANCE_ID, KEY_MODEL, KEY_PREDICTION, ) -from datasets import load_dataset -from engine_evaluation import main as eval_engine -from multiprocessing import Pool -from utils import get_instances +from swebench.harness.engine_evaluation import main as eval_engine +from swebench.harness.utils import get_instances logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py index 00e7784d..d03584e9 100644 --- a/swebench/harness/utils.py +++ b/swebench/harness/utils.py @@ -4,15 +4,15 @@ import requests import subprocess -from constants import ( +from datetime import datetime +from dotenv import load_dotenv +from git import Repo +from swebench.harness.constants import ( MAP_REPO_TO_REQS_PATHS, MAP_REPO_TO_ENV_YML_PATHS, SWE_BENCH_URL_RAW, NON_TEST_EXTS, ) -from datetime import datetime -from dotenv import load_dotenv -from git import Repo load_dotenv() diff --git a/swebench/metrics/conversion.py b/swebench/metrics/conversion.py index 10d81fc6..d546dff5 100644 --- a/swebench/metrics/conversion.py +++ b/swebench/metrics/conversion.py @@ -1,7 +1,7 @@ import json, os -from log_parsers import MAP_REPO_TO_PARSER, TestStatus -from getters import ( +from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus +from swebench.metrics.getters import ( get_file_name_from_lp, get_repo_from_lp, log_path_to_sms, diff --git a/swebench/metrics/getters.py b/swebench/metrics/getters.py index cdd190a4..b980475a 100644 --- a/swebench/metrics/getters.py +++ b/swebench/metrics/getters.py @@ -1,6 +1,6 @@ import re -from log_parsers import MAP_REPO_TO_PARSER, TestStatus +from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus # Evaluation Log Constants diff --git a/swebench/metrics/metrics.py b/swebench/metrics/metrics.py index 6f817883..cfc4ba8c 100644 --- a/swebench/metrics/metrics.py +++ b/swebench/metrics/metrics.py @@ -1,6 +1,9 @@ from enum import Enum -from getters import FAIL_TO_FAIL, FAIL_TO_PASS, PASS_TO_FAIL, PASS_TO_PASS from statistics import mean +from swebench.metrics.getters import ( + FAIL_TO_FAIL, FAIL_TO_PASS, + PASS_TO_FAIL, PASS_TO_PASS, +) class ResolvedStatus(Enum): diff --git a/swebench/metrics/monitor.py b/swebench/metrics/monitor.py index 37e66b7d..3aa2bc89 100644 --- a/swebench/metrics/monitor.py +++ b/swebench/metrics/monitor.py @@ -1,12 +1,11 @@ import glob import os - -from log_parsers import MAP_REPO_TO_PARSER -from getters import ( +from swebench.metrics.getters import ( log_path_to_sms, get_diffs, get_repo_from_lp, APPLY_PATCH_FAIL, APPLY_PATCH_PASS, TESTS_TIMEOUT ) +from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER def monitor_validation( diff --git a/swebench/metrics/report.py b/swebench/metrics/report.py index a0ab304e..f8e3962a 100644 --- a/swebench/metrics/report.py +++ b/swebench/metrics/report.py @@ -1,7 +1,7 @@ import glob, json, os from collections import Counter -from getters import ( +from swebench.metrics.getters import ( get_file_name_from_lp, get_logs_eval, get_id_from_lp, @@ -12,8 +12,8 @@ test_failed, test_passed, ) -from log_parsers import TestStatus -from metrics import ( +from swebench.metrics.log_parsers import TestStatus +from swebench.metrics.metrics import ( compute_fail_to_pass_unweighted, compute_fail_to_pass_weighted, compute_pass_to_pass_unweighted, From 1fcd6d5f216163bd1bc70a000f3e94598161271a Mon Sep 17 00:00:00 2001 From: John Yang Date: Thu, 28 Mar 2024 13:23:39 -0400 Subject: [PATCH 3/6] Add missing inits + fix minor internal import issue --- swebench/__init__.py | 18 +++++++++++++++++- swebench/collect/__init__.py | 0 swebench/harness/__init__.py | 0 swebench/metrics/__init__.py | 0 swebench/versioning/__init__.py | 0 swebench/versioning/get_versions.py | 5 ++--- 6 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 swebench/collect/__init__.py create mode 100644 swebench/harness/__init__.py create mode 100644 swebench/metrics/__init__.py create mode 100644 swebench/versioning/__init__.py diff --git a/swebench/__init__.py b/swebench/__init__.py index bff3814d..284de1cf 100644 --- a/swebench/__init__.py +++ b/swebench/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.8.2" +__version__ = "0.6.8.4" from swebench.collect.build_dataset import main as build_dataset from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline @@ -57,4 +57,20 @@ get_eval_reports_for_dir, get_model_eval_summary, get_model_report, +) + +from swebench.versioning.constants import ( + MAP_REPO_TO_VERSION_PATHS, + MAP_REPO_TO_VERSION_PATTERNS, +) + +from swebench.versioning.get_versions import ( + get_version, + map_version_to_task_instances, + get_versions_from_build, + get_versions_from_web, +) + +from swebench.versioning.utils import ( + split_instances, ) \ No newline at end of file diff --git a/swebench/collect/__init__.py b/swebench/collect/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swebench/harness/__init__.py b/swebench/harness/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swebench/metrics/__init__.py b/swebench/metrics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swebench/versioning/__init__.py b/swebench/versioning/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swebench/versioning/get_versions.py b/swebench/versioning/get_versions.py index 5b0825eb..cfc3f8a0 100644 --- a/swebench/versioning/get_versions.py +++ b/swebench/versioning/get_versions.py @@ -2,13 +2,12 @@ from multiprocessing import Pool, Manager -sys.path.append("../harness") -from constants import ( +from swebench.versioning.constants import ( SWE_BENCH_URL_RAW, MAP_REPO_TO_VERSION_PATHS, MAP_REPO_TO_VERSION_PATTERNS, ) -from utils import get_instances, split_instances +from swebench.versioning.utils import get_instances, split_instances logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" From 0c58da1f24efc1477a7b2efe683f5dd8cf998419 Mon Sep 17 00:00:00 2001 From: John Yang Date: Sun, 31 Mar 2024 23:12:13 -0400 Subject: [PATCH 4/6] Add conda link resolution logic to harness --- swebench/harness/constants.py | 106 +++++++++++++++++++++++++- swebench/harness/context_manager.py | 88 ++++++++++++++------- swebench/harness/engine_validation.py | 16 ++-- swebench/harness/utils.py | 12 ++- 4 files changed, 182 insertions(+), 40 deletions(-) diff --git a/swebench/harness/constants.py b/swebench/harness/constants.py index 874dec86..b46f9811 100644 --- a/swebench/harness/constants.py +++ b/swebench/harness/constants.py @@ -3,6 +3,9 @@ "python": "3.6", "packages": "numpy scipy cython pytest pandas matplotlib", "install": "pip install -v --no-use-pep517 --no-build-isolation -e .", + "arch_specific_packages": { + "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make", + } } for k in ["0.20", "0.21", "0.22"] } @@ -12,6 +15,9 @@ "python": "3.7", "packages": "numpy scipy cython pytest pandas matplotlib", "install": "pip install -v --no-use-pep517 --no-build-isolation -e .", + "arch_specific_packages": { + "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make", + } } for k in ["0.23", "0.24"] } @@ -22,6 +28,9 @@ "python": "3.9", "packages": "numpy scipy cython pytest pandas matplotlib joblib threadpoolctl", "install": "pip install -v --no-use-pep517 --no-build-isolation -e .", + "arch_specific_packages": { + "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make", + } } for k in ["1.0", "1.1", "1.2", "1.3", "1.4"] } @@ -50,11 +59,13 @@ "python": "3.9", "packages": "requirements.txt", "install": "pip install -e .", + "pip_packages": "Werkzeug==2.2.2", }, "2.1": { "python": "3.10", "packages": "requirements.txt", "install": "pip install -e .", + "pip_packages": "Werkzeug==2.2.2", }, } MAP_VERSION_TO_INSTALL_FLASK.update( @@ -74,6 +85,7 @@ "python": "3.11", "packages": "requirements.txt", "install": "pip install -e .", + "pip_packages": "Werkzeug==2.2.2" } for k in ["2.2", "2.3"] } @@ -162,9 +174,27 @@ MAP_VERSION_TO_INSTALL_MATPLOTLIB = { k: { - "python": "3.9", + "python": "3.11", "packages": "environment.yml", "install": "python -m pip install -e .", + "pip_packages": " ".join([ + "contourpy==1.1.0", + "cycler==0.11.0", + "fonttools==4.42.1", + "kiwisolver==1.4.5", + "numpy==1.25.2", + "packaging==23.1", + "pillow==10.0.0", + "pyparsing==3.0.9", + "python-dateutil==2.8.2", + "six==1.16.0", + "setuptools==68.1.2", + "setuptools-scm==7.1.0", + "typing-extensions==4.7.1", + ]), + "arch_specific_packages": { + "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make", + } } for k in ["3.5", "3.6", "3.7"] } @@ -174,6 +204,9 @@ "python": "3.8", "packages": "requirements.txt", "install": "python -m pip install -e .", + "arch_specific_packages": { + "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make", + } } for k in ["3.1", "3.2", "3.3", "3.4"] } @@ -184,6 +217,9 @@ "python": "3.7", "packages": "requirements.txt", "install": "python -m pip install -e .", + "arch_specific_packages": { + "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make", + } } for k in ["3.0"] } @@ -193,6 +229,9 @@ k: { "python": "3.5", "install": "python setup.py build; python setup.py install", + "arch_specific_packages": { + "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make", + } } for k in ["2.0", "2.1", "2.2", "1.0", "1.1", "1.2", "1.3", "1.4", "1.5"] } @@ -204,15 +243,50 @@ "pip_packages": "tox", "install": "pip install -e .[test]", "pre_install": ["sed -i 's/pytest/pytest -rA/' tox.ini"], + "arch_specific_packages": { + "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make", + "x86_64": "gxx_linux-64 gcc_linux-64 make", + } } for k in ["1.5", "1.6", "1.7", "1.8", "2.0", "2.1", "2.2", "2.3", "2.4", "3.0"] + \ ["3.1", "3.2", "3.3", "3.4", "3.5", "4.0", "4.1", "4.2", "4.3", "4.4"] + \ ["4.5", "5.0", "5.1", "5.2", "5.3", "6.0", "6.2", "7.0", "7.1", "7.2"] } -for k in ["3.0", "3.1", "3.2", "3.3", "3.4", "3.5", "4.0"]: +for k in ["3.0", "3.1", "3.2", "3.3", "3.4", "3.5", "4.0", "4.1", "4.2", "4.3", "4.4"]: MAP_VERSION_TO_INSTALL_SPHINX[k][ "pre_install" - ].append("sed -i 's/Jinja2>=2.3/Jinja2<3.1/' setup.py") + ].extend([ + "sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py", + "sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp<=1.0.7/' setup.py", + "sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp<=1.0.5/' setup.py", + "sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp<=1.0.6/' setup.py", + "sed -i 's/alabaster>=0.7,<0.8/alabaster>=0.7,<0.7.12/' setup.py", + 'sed -i "s/\'packaging\',/\'packaging\', \'markupsafe<=2.0.1\',/" setup.py', + ]) + if k in ["4.2", "4.3", "4.4"]: + MAP_VERSION_TO_INSTALL_SPHINX[k]["pre_install"].extend([ + "sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py", + "sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py", + ]) + elif k == "4.1": + MAP_VERSION_TO_INSTALL_SPHINX[k]["pre_install"].extend([ + ( + "grep -q 'sphinxcontrib-htmlhelp>=2.0.0' setup.py && " + "sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py || " + "sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py" + ), + ( + "grep -q 'sphinxcontrib-serializinghtml>=1.1.5' setup.py && " + "sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py || " + "sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py" + ) + ]) + else: + MAP_VERSION_TO_INSTALL_SPHINX[k]["pre_install"].extend([ + "sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py", + "sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py", + ]) + MAP_VERSION_TO_INSTALL_ASTROPY = { k: {"python": "3.9", "install": "pip install -e .[test]"} @@ -247,6 +321,10 @@ k: {"python": "3.9", "packages": "requirements.txt", "install": "pip install -e ."} for k in ["2.10", "2.11", "2.13", "2.14", "2.15", "2.16", "2.17", "2.8", "2.9", "3.0"] } +MAP_VERSION_TO_INSTALL_PYLINT.update({ + k: {**MAP_VERSION_TO_INSTALL_PYLINT[k], "pip_packages": " ".join([ + "astroid==3.0.0a7" + ])} for k in ['3.0']}) MAP_VERSION_TO_INSTALL_XARRAY = { k: { @@ -472,4 +550,24 @@ # Constants - Miscellaneous NON_TEST_EXTS = [".json", ".png", "csv", ".txt", ".md", ".jpg", ".jpeg", ".pkl", ".yml", ".yaml", ".toml"] -SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/" \ No newline at end of file +SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/" + +# Constants - Repo/Version Mapped to Appropriate Conda Link +MAP_REPO_VERSION_TO_CONDA_LINK = { + "django/django": { + "1.11": "py311_23.9.0-0", + }, + "matplotlib/matplotlib": { + "3.1": "py311_23.9.0-0", + "3.2": "py311_23.9.0-0", + "3.3": "py311_23.9.0-0", + "3.4": "py311_23.9.0-0", + "3.0": "py311_23.10.0-1", + }, + "mwaskom/seaborn": {"0.11": None, "0.12": None, "0.13": None}, + "sympy/sympy": { + "1.0": "py39_23.9.0-0", + }, +} + +DEFAULT_CONDA_LINK = "py39_23.10.0-1" \ No newline at end of file diff --git a/swebench/harness/context_manager.py b/swebench/harness/context_manager.py index 596bf7f8..db20af2b 100644 --- a/swebench/harness/context_manager.py +++ b/swebench/harness/context_manager.py @@ -3,6 +3,7 @@ from swebench.harness.constants import ( APPLY_PATCH_FAIL, APPLY_PATCH_PASS, + DEFAULT_CONDA_LINK, INSTALL_FAIL, INSTALL_PASS, INSTALL_TIMEOUT, @@ -10,6 +11,7 @@ KEY_MODEL, MAP_REPO_TO_INSTALL, MAP_REPO_TO_TEST_FRAMEWORK, + MAP_REPO_VERSION_TO_CONDA_LINK, MAP_VERSION_TO_INSTALL, RESET_FAILED, TESTS_FAILED, @@ -76,6 +78,7 @@ def __init__( Args: task_instances (list): List of task instances log_dir (str): Path to log directory + conda_link(str): URL to conda installation to use path_conda (str): Path to conda installation testbed (str): Path to testbed directory verbose (bool): Whether to show logs @@ -186,16 +189,35 @@ def __enter__(self): # Download Miniconda installer if self.conda_link is not None: cmd_line_install_link = self.conda_link - elif platform.system() == "Darwin": - cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-MacOSX-x86_64.sh" - if is_osx_64: - cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-MacOSX-arm64.sh" - elif platform.system() == "Linux": - cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-x86_64.sh" - if platform.machine() == "aarch64": - cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-aarch64.sh" else: - raise ValueError("Unknown computer platform " + platform.system()) + cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-" + + # Adjust version for evaluation by repo/version + key, versions = list(self.setup_refs.items())[0] + if len(self.setup_refs) == 1 and len(versions) == 1: + owner, repo = key.split("/") + version = list(versions.keys())[0] + logger_testbed.info(f"[Testbed] {repo}/{version} instances in a single process") + conda_id = MAP_REPO_VERSION_TO_CONDA_LINK.get(repo, {}).get(version, DEFAULT_CONDA_LINK) + cmd_line_install_link += conda_id + logger_testbed.info(f"[Testbed] {repo}/{version} using Miniconda link: {cmd_line_install_link}") + else: + cmd_line_install_link += DEFAULT_CONDA_LINK + logger_testbed.info(f"[Testbed] Multiple repos/versions; using Miniconda link: {cmd_line_install_link}") + + if platform.system() == "Darwin": + if is_osx_64: + cmd_line_install_link += "-MacOSX-arm64.sh" + else: + cmd_line_install_link += "-MacOSX-x86_64.sh" + elif platform.system() == "Linux": + if platform.machine() == "aarch64": + cmd_line_install_link += "-Linux-aarch64.sh" + else: + cmd_line_install_link += "-Linux-x86_64.sh" + else: + raise ValueError("Unknown computer platform " + platform.system()) + download_cmd = [ "wget", cmd_line_install_link, @@ -205,7 +227,7 @@ def __enter__(self): self.exec(download_cmd) # Install Miniconda - install_cmd = ["bash", miniconda_sh, "-b", "-u", "-p", self.path_conda] + install_cmd = ["bash", miniconda_sh, "-b", "-u", "-p", self.path_conda, "&&", "conda", "init", "--all"] self.exec(install_cmd) if is_osx_64: condabin = os.path.join(self.path_conda, "bin", "conda") @@ -219,14 +241,10 @@ def __enter__(self): # Set up conda executables, get existing environments self.path_conda = os.path.abspath(self.path_conda) conda_bin_path = os.path.join(self.path_conda, "bin") - shellenv = os.environ.copy() - shellenv["PATH"] = conda_bin_path + os.pathsep + shellenv["PATH"] - self.exec.subprocess_args["env"] = shellenv path_activate = os.path.join(self.path_conda, "bin", "activate") - exec_type = "mamba" if "mamba" in self.path_conda else "conda" - exec_cmd = os.path.join(self.path_conda, "bin", exec_type) - env_list = get_conda_env_names(exec_cmd, shellenv) + exec_cmd = os.path.join(self.path_conda, "bin", "conda") + env_list = get_conda_env_names(exec_cmd) # Set up testbed (environment, github repo) for each repo for repo, version_to_setup_ref in self.setup_refs.items(): @@ -284,18 +302,20 @@ def __enter__(self): # Install dependencies path_to_reqs = get_requirements(setup_ref_instance, self.testbed) - cmd = f"source {path_activate} {env_name} && echo 'activate successful' && pip install -r {path_to_reqs}" + cmd = f". {path_activate} {env_name} && echo 'activate successful' && pip install -r {path_to_reqs}" logger_testbed.info( f"[Testbed] Installing dependencies for {env_name}; Command: {cmd}" ) self.exec(cmd, shell=True) os.remove(path_to_reqs) elif pkgs == "environment.yml": - # Create environment from yml - path_to_reqs = get_environment_yml( - setup_ref_instance, env_name, self.testbed - ) if "no_use_env" in install and install["no_use_env"]: + # Create environment from yml + path_to_reqs = get_environment_yml( + setup_ref_instance, env_name, + save_path=self.testbed + ) + # `conda create` based installation cmd = f"{exec_cmd} create -c conda-forge -n {env_name} python={install['python']} -y" logger_testbed.info( @@ -310,6 +330,13 @@ def __enter__(self): ) self.exec(cmd.split(" ")) else: + # Create environment from yml + path_to_reqs = get_environment_yml( + setup_ref_instance, env_name, + save_path=self.testbed, + python_version=install["python"] + ) + # `conda env create` based installation cmd = f"{exec_cmd} env create --file {path_to_reqs}" logger_testbed.info( @@ -326,10 +353,19 @@ def __enter__(self): f"[Testbed] Creating environment {env_name}; Command: {cmd}" ) self.exec(cmd.split(" ")) + + arch = platform.machine() + arch_specific_packages = install.get("arch_specific_packages", {}).get(arch, "") + if arch_specific_packages: + cmd = f". {path_activate} {env_name} && conda install {arch_specific_packages} -y" + logger_testbed.info( + f"[Testbed] Installing arch-specific packages for {env_name}; Command: {cmd}" + ) + self.exec(cmd, shell=True) # Install additional packages if specified if "pip_packages" in install: - cmd = f"source {path_activate} {env_name} && pip install {install['pip_packages']}" + cmd = f". {path_activate} {env_name} && pip install {install['pip_packages']}" logger_testbed.info( f"[Testbed] Installing pip packages for {env_name}; Command: {cmd}" ) @@ -419,6 +455,7 @@ def __init__( logger_taskenv.propagate = verbose self.instance = instance self.conda_path = conda_path + self.conda_cache_dir = os.path.join(self.conda_path, "cache") self.cwd = os.getcwd() self.is_eval = is_eval self.testbed = testbed @@ -440,21 +477,18 @@ def __init__( self.log_file = os.path.join(log_dir, log_file_name) self.cmd_activate = ( - f"source {os.path.join(self.conda_path, 'bin', 'activate')} " + f". {os.path.join(self.conda_path, 'bin', 'activate')} " + f"{self.venv} && echo 'activate successful'" ) self.timeout = timeout - shellenv = os.environ.copy() - condabinpath = os.path.join(self.conda_path, "bin") - shellenv["PATH"] = condabinpath + os.pathsep + shellenv["PATH"] self.exec = ExecWrapper( subprocess_args={ "check": True, "shell": False, "capture_output": True, "text": True, - "env": shellenv, + "env": {"CONDA_PKGS_DIRS": self.conda_cache_dir}, } ) diff --git a/swebench/harness/engine_validation.py b/swebench/harness/engine_validation.py index b9864594..06cfe1f8 100644 --- a/swebench/harness/engine_validation.py +++ b/swebench/harness/engine_validation.py @@ -77,14 +77,14 @@ def setup_testbed(data: dict): Args: data: Dict containing task instances and other data - conda_link: URL to conda installation to use - task_instances: List of task instances - log_dir: Path to log directory - path_conda: Path to miniconda3 or anaconda installation - testbed: Path to testbed directory - temp_dir: Path to temporary directory for storing virtual envs - timeout: Timeout (seconds) for testing script execution - verbose: Verbose mode + conda_link: URL to conda installation to use + task_instances: List of task instances + log_dir: Path to log directory + path_conda: Path to miniconda3 or anaconda installation + testbed: Path to testbed directory + temp_dir: Path to temporary directory for storing virtual envs + timeout: Timeout (seconds) for testing script execution + verbose: Verbose mode """ data_dict = DotDict(data) with TestbedContextManager( diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py index 3e5f02b0..235acd46 100644 --- a/swebench/harness/utils.py +++ b/swebench/harness/utils.py @@ -57,7 +57,12 @@ def get_conda_env_names(conda_source: str, env: dict = None) -> list: return env_names -def get_environment_yml(instance: dict, env_name: str, save_path: str = None) -> str: +def get_environment_yml( + instance: dict, + env_name: str, + save_path: str = None, + python_version: str = None, + ) -> str: """ Get environment.yml for given task instance @@ -94,6 +99,11 @@ def get_environment_yml(instance: dict, env_name: str, save_path: str = None) -> if line.startswith("name:"): cleaned.append(f"name: {env_name}") continue + if line.startswith("dependencies:"): + cleaned.append(line) + if python_version is not None: + cleaned.append(f" - python={python_version}") + continue cleaned.append(line) # Return environment.yml as string if no save path given From 68e89ef8d099ca5c23a8fd5681e3f990cf729fd6 Mon Sep 17 00:00:00 2001 From: John Yang Date: Sun, 31 Mar 2024 23:17:29 -0400 Subject: [PATCH 5/6] Add `datasets` as dependency --- setup.py | 1 + swebench/__init__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 73f771b8..2c29a347 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ install_requires=[ 'beautifulsoup4', 'chardet', + 'datasets', 'ghapi', 'GitPython', 'python-dotenv', diff --git a/swebench/__init__.py b/swebench/__init__.py index 284de1cf..d1f4a745 100644 --- a/swebench/__init__.py +++ b/swebench/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.8.4" +__version__ = "1.0.1" from swebench.collect.build_dataset import main as build_dataset from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline From 58d24d1b65b95ed96d57805604aca7adca49861d Mon Sep 17 00:00:00 2001 From: John Yang Date: Mon, 1 Apr 2024 12:43:59 -0400 Subject: [PATCH 6/6] Add install fail logging --- swebench/metrics/constants.py | 29 +++++++++++++++++ swebench/metrics/conversion.py | 13 +++++--- swebench/metrics/getters.py | 30 +++++++----------- swebench/metrics/log_parsers.py | 9 +----- swebench/metrics/metrics.py | 16 +++------- swebench/metrics/monitor.py | 15 ++++++--- swebench/metrics/report.py | 55 ++++++++++++++++++++------------- 7 files changed, 99 insertions(+), 68 deletions(-) create mode 100644 swebench/metrics/constants.py diff --git a/swebench/metrics/constants.py b/swebench/metrics/constants.py new file mode 100644 index 00000000..654435b0 --- /dev/null +++ b/swebench/metrics/constants.py @@ -0,0 +1,29 @@ +from enum import Enum + +# Evaluation Log Constants +APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed" +APPLY_PATCH_PASS = ">>>>> Applied Patch" +INSTALL_FAIL = ">>>>> Init Failed" +INSTALL_PASS = ">>>>> Init Succeeded" +RESET_FAILED = ">>>>> Reset Failed" +TESTS_ERROR = ">>>>> Tests Errored" +TESTS_TIMEOUT = ">>>>> Tests Timed Out" + +# Result Categories +FAIL_TO_PASS = "FAIL_TO_PASS" +FAIL_TO_FAIL = "FAIL_TO_FAIL" +PASS_TO_PASS = "PASS_TO_PASS" +PASS_TO_FAIL = "PASS_TO_FAIL" + +# Test Status Enum +class TestStatus(Enum): + FAILED = "FAILED" + PASSED = "PASSED" + SKIPPED = "SKIPPED" + ERROR = "ERROR" + +# Resolved Status Enum +class ResolvedStatus(Enum): + NO = "RESOLVED_NO" + PARTIAL = "RESOLVED_PARTIAL" + FULL = "RESOLVED_FULL" diff --git a/swebench/metrics/conversion.py b/swebench/metrics/conversion.py index d546dff5..0d158039 100644 --- a/swebench/metrics/conversion.py +++ b/swebench/metrics/conversion.py @@ -1,17 +1,20 @@ import json, os -from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus -from swebench.metrics.getters import ( - get_file_name_from_lp, - get_repo_from_lp, - log_path_to_sms, +from swebench.metrics.constants import ( FAIL_TO_PASS, FAIL_TO_FAIL, PASS_TO_PASS, PASS_TO_FAIL, + TestStatus, +) +from swebench.metrics.getters import ( + get_file_name_from_lp, + get_repo_from_lp, + log_path_to_sms, test_failed, test_passed, ) +from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER def convert_log_to_ground_truth( diff --git a/swebench/metrics/getters.py b/swebench/metrics/getters.py index b980475a..2c3ab3b6 100644 --- a/swebench/metrics/getters.py +++ b/swebench/metrics/getters.py @@ -1,22 +1,14 @@ import re +from swebench.metrics.constants import ( + APPLY_PATCH_FAIL, + APPLY_PATCH_PASS, + RESET_FAILED, + TESTS_ERROR, + TESTS_TIMEOUT, +) from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus - - -# Evaluation Log Constants -APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed" -APPLY_PATCH_PASS = ">>>>> Applied Patch" -INSTALL_FAIL = ">>>>> Init Failed" -INSTALL_PASS = ">>>>> Init Succeeded" -RESET_FAILED = ">>>>> Reset Failed" -TESTS_TIMEOUT = ">>>>> Tests Timed Out" -TESTS_ERROR = ">>>>> Tests Errored" - -# Result Categories -FAIL_TO_PASS = "FAIL_TO_PASS" -FAIL_TO_FAIL = "FAIL_TO_FAIL" -PASS_TO_PASS = "PASS_TO_PASS" -PASS_TO_FAIL = "PASS_TO_FAIL" +from typing import Tuple def get_diffs(sm_1: dict, sm_2: dict) -> dict: @@ -41,7 +33,7 @@ def get_diffs(sm_1: dict, sm_2: dict) -> dict: return diff_map -def get_logs_eval(log_fp: str) -> (dict, bool): +def get_logs_eval(log_fp: str) -> Tuple[dict, bool]: """ Retrieve evaluation results for a task instance from its corresponding log file @@ -65,7 +57,7 @@ def get_logs_eval(log_fp: str) -> (dict, bool): return log_parser(content), True -def get_logs_gold(log_fp: str) -> (str, str): +def get_logs_gold(log_fp: str) -> Tuple[str, str]: """ Retrieve pre-patch, post-patch test logs from a validation log file @@ -92,7 +84,7 @@ def get_logs_gold(log_fp: str) -> (str, str): get_repo_from_lp = lambda x: get_id_from_lp(x).rsplit("-", 1)[0].replace("__", "/") -def log_path_to_sms(log_fp: str, log_parser) -> (list, bool): +def log_path_to_sms(log_fp: str, log_parser) -> Tuple[list, bool]: """ Wrapper for getting log data from log_parser file diff --git a/swebench/metrics/log_parsers.py b/swebench/metrics/log_parsers.py index cdb4f3fa..13869918 100644 --- a/swebench/metrics/log_parsers.py +++ b/swebench/metrics/log_parsers.py @@ -1,13 +1,6 @@ import re -from enum import Enum - - -class TestStatus(Enum): - FAILED = "FAILED" - PASSED = "PASSED" - SKIPPED = "SKIPPED" - ERROR = "ERROR" +from swebench.metrics.constants import TestStatus def parse_log_pytest(log: str) -> dict: diff --git a/swebench/metrics/metrics.py b/swebench/metrics/metrics.py index cfc4ba8c..eef133a1 100644 --- a/swebench/metrics/metrics.py +++ b/swebench/metrics/metrics.py @@ -1,17 +1,11 @@ -from enum import Enum from statistics import mean -from swebench.metrics.getters import ( - FAIL_TO_FAIL, FAIL_TO_PASS, - PASS_TO_FAIL, PASS_TO_PASS, +from swebench.metrics.constants import ( + FAIL_TO_PASS, + PASS_TO_PASS, + ResolvedStatus, ) -class ResolvedStatus(Enum): - NO = "RESOLVED_NO" - PARTIAL = "RESOLVED_PARTIAL" - FULL = "RESOLVED_FULL" - - def compute_fail_to_pass(report: dict) -> float: """ Compute fail-to-pass metric. Accepts single report as argument. @@ -94,4 +88,4 @@ def get_resolution_status(report: dict) -> str: elif f2p < 1 and f2p > 0 and p2p == 1: return ResolvedStatus.PARTIAL.value else: - return ResolvedStatus.NO.value \ No newline at end of file + return ResolvedStatus.NO.value diff --git a/swebench/metrics/monitor.py b/swebench/metrics/monitor.py index 3aa2bc89..dd047ca4 100644 --- a/swebench/metrics/monitor.py +++ b/swebench/metrics/monitor.py @@ -1,16 +1,23 @@ import glob import os +from swebench.metrics.constants import ( + APPLY_PATCH_FAIL, + APPLY_PATCH_PASS, + TESTS_TIMEOUT +) from swebench.metrics.getters import ( - log_path_to_sms, get_diffs, get_repo_from_lp, - APPLY_PATCH_FAIL, APPLY_PATCH_PASS, TESTS_TIMEOUT + log_path_to_sms, + get_diffs, + get_repo_from_lp, ) from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER +from typing import Tuple def monitor_validation( path_to_logs: str, log_prefix: str = None -) -> (list, list, list, list): +) -> Tuple[list, list, list, list]: """ Check log files generated from a `check_instances` run to see how many instances were successfully installed and/or tested. @@ -79,7 +86,7 @@ def monitor_validation( return failed_install, corrupt_test_patch, corrupt_patch, timeout, success -def monitor_logs_same_diff(log_dir: str, repo: str = None) -> (list, list): +def monitor_logs_same_diff(log_dir: str, repo: str = None) -> Tuple[list, list]: """ Given a log directory and repo, return a list of logs where pre-test and post-test logs are same/different diff --git a/swebench/metrics/report.py b/swebench/metrics/report.py index f8e3962a..7856b5e5 100644 --- a/swebench/metrics/report.py +++ b/swebench/metrics/report.py @@ -1,25 +1,32 @@ import glob, json, os from collections import Counter -from swebench.metrics.getters import ( - get_file_name_from_lp, - get_logs_eval, - get_id_from_lp, +from swebench.harness.constants import ( + INSTALL_FAIL, + KEY_INSTANCE_ID, +) +from swebench.metrics.constants import ( FAIL_TO_FAIL, FAIL_TO_PASS, PASS_TO_FAIL, PASS_TO_PASS, +) +from swebench.metrics.getters import ( + get_file_name_from_lp, + get_logs_eval, + get_id_from_lp, test_failed, test_passed, ) -from swebench.metrics.log_parsers import TestStatus from swebench.metrics.metrics import ( compute_fail_to_pass_unweighted, compute_fail_to_pass_weighted, compute_pass_to_pass_unweighted, compute_pass_to_pass_weighted, get_resolution_status, + ResolvedStatus, ) +from typing import Tuple ### MARK - Eval Report Generation @@ -119,7 +126,7 @@ def get_eval_reports_for_logs( swe_bench_tasks: str, callback: callable = None, verbose: bool = False, -) -> (dict, dict): +) -> Tuple[dict, dict]: """ Wrapper for getting eval report for a list of evaluation log paths. @@ -135,7 +142,7 @@ def get_eval_reports_for_logs( reports_patch_success = {} reports_patch_failure = {} eval_refs = json.load(open(swe_bench_tasks)) - eval_refs = {t['instance_id']: t for t in eval_refs} + eval_refs = {t[KEY_INSTANCE_ID]: t for t in eval_refs} for eval_log in eval_logs: # Remove task instances that do not satisfy callback @@ -194,7 +201,7 @@ def get_model_eval_summary( eval_dir: str, swe_bench_tasks: str, repo: str = None, -): +) -> dict: """ Generate a summary of model evaluation results. @@ -213,7 +220,7 @@ def get_model_eval_summary( # Filter by repo if provided criteria_eval_sm = None if repo is not None: - criteria_pred = lambda pred: repo in pred["instance_id"] + criteria_pred = lambda pred: repo in pred[KEY_INSTANCE_ID] criteria_eval_sm = lambda eval_log: repo in eval_log preds = [x for x in preds if criteria_pred(x)] @@ -257,7 +264,7 @@ def get_model_eval_summary( def get_model_report( model: str, predictions_path: str, swe_bench_tasks: str, log_dir: str -): +) -> dict: """ Generate a report of model evaluation results from predictions, task instances, and evaluation logs. @@ -271,8 +278,8 @@ def get_model_report( report_map (dict): map of repo to report """ eval_refs = json.load(open(swe_bench_tasks)) - eval_refs = [{key: t[key] for key in ["instance_id", "FAIL_TO_PASS", "PASS_TO_PASS"]} for t in eval_refs] - eval_refs = {t['instance_id']: t for t in eval_refs} + eval_refs = [{key: t[key] for key in [KEY_INSTANCE_ID, FAIL_TO_PASS, PASS_TO_PASS]} for t in eval_refs] + eval_refs = {t[KEY_INSTANCE_ID]: t for t in eval_refs} # Get predictions predictions = [] @@ -286,37 +293,43 @@ def get_model_report( # Iterate through predictions for p in predictions: - repo = p["instance_id"].split(".")[0].rsplit("-", 1)[0].replace("__", "/") + repo = p[KEY_INSTANCE_ID].split(".")[0].rsplit("-", 1)[0].replace("__", "/") if repo not in report_map: report_map[repo] = { "none": [], "generated": [], "with_logs": [], + "install_fail": [], "applied": [], "resolved": [], } # Check if the model patch exists if p["model_patch"] == None: - report_map[repo]["none"].append(p['instance_id']) + report_map[repo]["none"].append(p[KEY_INSTANCE_ID]) continue - report_map[repo]["generated"].append(p['instance_id']) + report_map[repo]["generated"].append(p[KEY_INSTANCE_ID]) # Get log file - log_path = os.path.join(log_dir, f"{p['instance_id']}.{model}.eval.log") + log_path = os.path.join(log_dir, f"{p[KEY_INSTANCE_ID]}.{model}.eval.log") if not os.path.exists(log_path): continue - report_map[repo]["with_logs"].append(p['instance_id']) + report_map[repo]["with_logs"].append(p[KEY_INSTANCE_ID]) + + # Check if install succeeded + if INSTALL_FAIL in open(log_path).read(): + report_map[repo]["install_fail"].append(p[KEY_INSTANCE_ID]) + continue # Get evaluation logs eval_sm, found = get_logs_eval(log_path) if not found: continue - report_map[repo]["applied"].append(p['instance_id']) + report_map[repo]["applied"].append(p[KEY_INSTANCE_ID]) - report = get_eval_report(eval_sm, eval_refs[p["instance_id"]]) - if get_resolution_status(report) == "RESOLVED_FULL": - report_map[repo]["resolved"].append(p['instance_id']) + report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]]) + if get_resolution_status(report) == ResolvedStatus.FULL.value: + report_map[repo]["resolved"].append(p[KEY_INSTANCE_ID]) return report_map