From d2b85ea1fdc89eb07cafdeec5b1061086615796c Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Thu, 28 Mar 2024 12:38:22 -0400
Subject: [PATCH 1/6] Ported over necessary pypi setup files + reorganized
 directory

---
 .gitignore                                    |  8 ++-
 LICENSE.md => LICENSE                         |  0
 build_deploy.sh                               |  6 ++
 pyproject.toml                                |  3 +
 setup.cfg                                     |  3 +
 setup.py                                      | 40 +++++++++++++
 swebench/__init__.py                          | 60 +++++++++++++++++++
 {collect => swebench/collect}/README.md       |  0
 .../collect}/build_dataset.py                 |  0
 .../collect}/build_dataset_ft.py              |  0
 .../collect}/cleanup/delete_gh_workflows.py   |  0
 .../collect}/cleanup/remove_envs.py           |  0
 .../collect}/get_tasks_pipeline.py            |  0
 {collect => swebench/collect}/get_top_pypi.py |  0
 .../collect}/make_lite/README.md              |  0
 .../collect}/make_lite/criteria.py            |  0
 .../collect}/make_lite/make_lite.py           |  0
 .../collect}/make_repo/call_make_repo.py      |  0
 .../collect}/make_repo/make_repo.sh           |  0
 {collect => swebench/collect}/print_pulls.py  |  0
 .../collect}/run_build_dataset_ft.sh          |  0
 .../collect}/run_get_tasks_pipeline.sh        |  0
 {collect => swebench/collect}/utils.py        |  0
 {harness => swebench/harness}/README.md       |  0
 {harness => swebench/harness}/constants.py    |  0
 .../harness}/context_manager.py               |  0
 .../harness}/engine_evaluation.py             |  0
 .../harness}/engine_validation.py             |  0
 .../harness}/run_evaluation.py                |  0
 .../harness}/run_evaluation.sh                |  0
 .../harness}/run_validation.sh                |  0
 {harness => swebench/harness}/utils.py        |  0
 {metrics => swebench/metrics}/README.md       |  0
 {metrics => swebench/metrics}/conversion.py   |  0
 {metrics => swebench/metrics}/getters.py      |  0
 {metrics => swebench/metrics}/log_parsers.py  |  0
 {metrics => swebench/metrics}/metrics.py      |  0
 {metrics => swebench/metrics}/monitor.py      |  0
 {metrics => swebench/metrics}/report.py       |  0
 {versioning => swebench/versioning}/README.md |  0
 .../versioning}/constants.py                  |  0
 .../extract_web/get_versions_astropy.py       |  0
 .../extract_web/get_versions_matplotlib.py    |  0
 .../extract_web/get_versions_pvlib-python.py  |  0
 .../extract_web/get_versions_pydicom.py       |  0
 .../extract_web/get_versions_sqlfluff.py      |  0
 .../extract_web/get_versions_xarray.py        |  0
 .../versioning}/get_versions.py               |  0
 .../versioning}/run_get_versions.sh           |  0
 {versioning => swebench/versioning}/utils.py  |  0
 50 files changed, 117 insertions(+), 3 deletions(-)
 rename LICENSE.md => LICENSE (100%)
 create mode 100755 build_deploy.sh
 create mode 100644 pyproject.toml
 create mode 100644 setup.cfg
 create mode 100644 setup.py
 create mode 100644 swebench/__init__.py
 rename {collect => swebench/collect}/README.md (100%)
 rename {collect => swebench/collect}/build_dataset.py (100%)
 rename {collect => swebench/collect}/build_dataset_ft.py (100%)
 rename {collect => swebench/collect}/cleanup/delete_gh_workflows.py (100%)
 rename {collect => swebench/collect}/cleanup/remove_envs.py (100%)
 rename {collect => swebench/collect}/get_tasks_pipeline.py (100%)
 rename {collect => swebench/collect}/get_top_pypi.py (100%)
 rename {collect => swebench/collect}/make_lite/README.md (100%)
 rename {collect => swebench/collect}/make_lite/criteria.py (100%)
 rename {collect => swebench/collect}/make_lite/make_lite.py (100%)
 rename {collect => swebench/collect}/make_repo/call_make_repo.py (100%)
 rename {collect => swebench/collect}/make_repo/make_repo.sh (100%)
 rename {collect => swebench/collect}/print_pulls.py (100%)
 rename {collect => swebench/collect}/run_build_dataset_ft.sh (100%)
 rename {collect => swebench/collect}/run_get_tasks_pipeline.sh (100%)
 rename {collect => swebench/collect}/utils.py (100%)
 rename {harness => swebench/harness}/README.md (100%)
 rename {harness => swebench/harness}/constants.py (100%)
 rename {harness => swebench/harness}/context_manager.py (100%)
 rename {harness => swebench/harness}/engine_evaluation.py (100%)
 rename {harness => swebench/harness}/engine_validation.py (100%)
 rename {harness => swebench/harness}/run_evaluation.py (100%)
 rename {harness => swebench/harness}/run_evaluation.sh (100%)
 rename {harness => swebench/harness}/run_validation.sh (100%)
 rename {harness => swebench/harness}/utils.py (100%)
 rename {metrics => swebench/metrics}/README.md (100%)
 rename {metrics => swebench/metrics}/conversion.py (100%)
 rename {metrics => swebench/metrics}/getters.py (100%)
 rename {metrics => swebench/metrics}/log_parsers.py (100%)
 rename {metrics => swebench/metrics}/metrics.py (100%)
 rename {metrics => swebench/metrics}/monitor.py (100%)
 rename {metrics => swebench/metrics}/report.py (100%)
 rename {versioning => swebench/versioning}/README.md (100%)
 rename {versioning => swebench/versioning}/constants.py (100%)
 rename {versioning => swebench/versioning}/extract_web/get_versions_astropy.py (100%)
 rename {versioning => swebench/versioning}/extract_web/get_versions_matplotlib.py (100%)
 rename {versioning => swebench/versioning}/extract_web/get_versions_pvlib-python.py (100%)
 rename {versioning => swebench/versioning}/extract_web/get_versions_pydicom.py (100%)
 rename {versioning => swebench/versioning}/extract_web/get_versions_sqlfluff.py (100%)
 rename {versioning => swebench/versioning}/extract_web/get_versions_xarray.py (100%)
 rename {versioning => swebench/versioning}/get_versions.py (100%)
 rename {versioning => swebench/versioning}/run_get_versions.sh (100%)
 rename {versioning => swebench/versioning}/utils.py (100%)

diff --git a/.gitignore b/.gitignore
index a3d7955e..5fd289a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,15 +160,17 @@ cython_debug/
 #.idea/
 
 # Custom
-notebooks/
-*.patch
-data/repos/copies
+.api_key
 .keys
 .vscode/
 *.jsonl
 *.jsonl.*
+*.patch
+*.DS_Store
 analysis/**/*.json
 analysis/**/scratch*
 analysis/benchmark/plots/
 analysis/evaluation/*.csv
 analysis/evaluation/*.pdf
+data/repos/copies
+notebooks/
diff --git a/LICENSE.md b/LICENSE
similarity index 100%
rename from LICENSE.md
rename to LICENSE
diff --git a/build_deploy.sh b/build_deploy.sh
new file mode 100755
index 00000000..5fe7943b
--- /dev/null
+++ b/build_deploy.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e  # abort before the upload step if the build fails
+python3 -m build
+
+python3 -m twine upload --skip-existing --repository pypi dist/*
+# python3 -m twine upload --skip-existing --repository testpypi dist/*
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..fcff5099
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ['setuptools>=42']
+build-backend = 'setuptools.build_meta'
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..1198e3fe
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,3 @@
+[metadata]
+version = attr: swebench.__version__
+license_files = LICENSE
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..73f771b8
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,40 @@
+import setuptools
+
+with open('README.md', 'r', encoding='utf-8') as fh:
+    long_description = fh.read()
+# The version is not passed here: setup.cfg reads it from swebench.__version__.
+setuptools.setup(
+    name='swebench',
+    author='John Yang',
+    author_email='byjohnyang@gmail.com',
+    description='The official SWE-bench package - a benchmark for evaluating LMs on software engineering',
+    keywords='nlp, benchmark, code',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    url='https://swebench.com',
+    project_urls={
+        'Documentation': 'https://github.com/princeton-nlp/SWE-bench',
+        'Bug Reports': 'https://github.com/princeton-nlp/SWE-bench/issues',
+        'Source Code': 'https://github.com/princeton-nlp/SWE-bench',
+        'Website': 'https://swebench.com',
+    },
+    packages=setuptools.find_packages(),
+    classifiers=[
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3 :: Only',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+    ],
+    python_requires='>=3.8',
+    install_requires=[  # NOTE(review): package modules also import datasets, tqdm and fastcore - confirm they get installed
+        'beautifulsoup4',
+        'chardet',
+        'ghapi',
+        'GitPython',
+        'python-dotenv',
+        'requests',
+        'rich',
+    ],
+    include_package_data=True,
+)
\ No newline at end of file
diff --git a/swebench/__init__.py b/swebench/__init__.py
new file mode 100644
index 00000000..095ebbee
--- /dev/null
+++ b/swebench/__init__.py
@@ -0,0 +1,60 @@
+__version__ = "0.6.8.1"
+
+from swebench.collect.build_dataset import main as build_dataset
+from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline
+from swebench.collect.print_pulls import main as print_pulls
+
+from swebench.harness.constants import (
+    KEY_INSTANCE_ID,
+    KEY_MODEL,
+    KEY_PREDICTION,
+    MAP_REPO_TO_TEST_FRAMEWORK,
+    MAP_VERSION_TO_INSTALL,
+)
+
+from swebench.harness.run_evaluation import (
+  main as run_evaluation,
+)
+
+from swebench.harness.utils import (
+    get_environment_yml,
+    get_instances,
+    get_requirements,
+)
+
+from swebench.metrics.conversion import (
+    convert_log_to_ground_truth
+)
+
+from swebench.metrics.getters import (
+    get_diffs,
+    get_logs_eval,
+    get_logs_gold,
+)
+
+from swebench.metrics.log_parsers import (
+    MAP_REPO_TO_PARSER,
+)
+
+from swebench.metrics.metrics import (
+    compute_fail_to_pass,
+    compute_fail_to_pass_unweighted,
+    compute_fail_to_pass_weighted,
+    compute_pass_to_pass,
+    compute_pass_to_pass_unweighted,
+    compute_pass_to_pass_weighted,
+    get_resolution_status,
+)
+
+from swebench.metrics.monitor import (
+    monitor_validation,
+    monitor_logs_same_diff,
+)
+
+from swebench.metrics.report import (
+    get_eval_report,
+    get_eval_reports_for_logs,
+    get_eval_reports_for_dir,
+    get_model_eval_summary,
+    get_model_report,
+)
\ No newline at end of file
diff --git a/collect/README.md b/swebench/collect/README.md
similarity index 100%
rename from collect/README.md
rename to swebench/collect/README.md
diff --git a/collect/build_dataset.py b/swebench/collect/build_dataset.py
similarity index 100%
rename from collect/build_dataset.py
rename to swebench/collect/build_dataset.py
diff --git a/collect/build_dataset_ft.py b/swebench/collect/build_dataset_ft.py
similarity index 100%
rename from collect/build_dataset_ft.py
rename to swebench/collect/build_dataset_ft.py
diff --git a/collect/cleanup/delete_gh_workflows.py b/swebench/collect/cleanup/delete_gh_workflows.py
similarity index 100%
rename from collect/cleanup/delete_gh_workflows.py
rename to swebench/collect/cleanup/delete_gh_workflows.py
diff --git a/collect/cleanup/remove_envs.py b/swebench/collect/cleanup/remove_envs.py
similarity index 100%
rename from collect/cleanup/remove_envs.py
rename to swebench/collect/cleanup/remove_envs.py
diff --git a/collect/get_tasks_pipeline.py b/swebench/collect/get_tasks_pipeline.py
similarity index 100%
rename from collect/get_tasks_pipeline.py
rename to swebench/collect/get_tasks_pipeline.py
diff --git a/collect/get_top_pypi.py b/swebench/collect/get_top_pypi.py
similarity index 100%
rename from collect/get_top_pypi.py
rename to swebench/collect/get_top_pypi.py
diff --git a/collect/make_lite/README.md b/swebench/collect/make_lite/README.md
similarity index 100%
rename from collect/make_lite/README.md
rename to swebench/collect/make_lite/README.md
diff --git a/collect/make_lite/criteria.py b/swebench/collect/make_lite/criteria.py
similarity index 100%
rename from collect/make_lite/criteria.py
rename to swebench/collect/make_lite/criteria.py
diff --git a/collect/make_lite/make_lite.py b/swebench/collect/make_lite/make_lite.py
similarity index 100%
rename from collect/make_lite/make_lite.py
rename to swebench/collect/make_lite/make_lite.py
diff --git a/collect/make_repo/call_make_repo.py b/swebench/collect/make_repo/call_make_repo.py
similarity index 100%
rename from collect/make_repo/call_make_repo.py
rename to swebench/collect/make_repo/call_make_repo.py
diff --git a/collect/make_repo/make_repo.sh b/swebench/collect/make_repo/make_repo.sh
similarity index 100%
rename from collect/make_repo/make_repo.sh
rename to swebench/collect/make_repo/make_repo.sh
diff --git a/collect/print_pulls.py b/swebench/collect/print_pulls.py
similarity index 100%
rename from collect/print_pulls.py
rename to swebench/collect/print_pulls.py
diff --git a/collect/run_build_dataset_ft.sh b/swebench/collect/run_build_dataset_ft.sh
similarity index 100%
rename from collect/run_build_dataset_ft.sh
rename to swebench/collect/run_build_dataset_ft.sh
diff --git a/collect/run_get_tasks_pipeline.sh b/swebench/collect/run_get_tasks_pipeline.sh
similarity index 100%
rename from collect/run_get_tasks_pipeline.sh
rename to swebench/collect/run_get_tasks_pipeline.sh
diff --git a/collect/utils.py b/swebench/collect/utils.py
similarity index 100%
rename from collect/utils.py
rename to swebench/collect/utils.py
diff --git a/harness/README.md b/swebench/harness/README.md
similarity index 100%
rename from harness/README.md
rename to swebench/harness/README.md
diff --git a/harness/constants.py b/swebench/harness/constants.py
similarity index 100%
rename from harness/constants.py
rename to swebench/harness/constants.py
diff --git a/harness/context_manager.py b/swebench/harness/context_manager.py
similarity index 100%
rename from harness/context_manager.py
rename to swebench/harness/context_manager.py
diff --git a/harness/engine_evaluation.py b/swebench/harness/engine_evaluation.py
similarity index 100%
rename from harness/engine_evaluation.py
rename to swebench/harness/engine_evaluation.py
diff --git a/harness/engine_validation.py b/swebench/harness/engine_validation.py
similarity index 100%
rename from harness/engine_validation.py
rename to swebench/harness/engine_validation.py
diff --git a/harness/run_evaluation.py b/swebench/harness/run_evaluation.py
similarity index 100%
rename from harness/run_evaluation.py
rename to swebench/harness/run_evaluation.py
diff --git a/harness/run_evaluation.sh b/swebench/harness/run_evaluation.sh
similarity index 100%
rename from harness/run_evaluation.sh
rename to swebench/harness/run_evaluation.sh
diff --git a/harness/run_validation.sh b/swebench/harness/run_validation.sh
similarity index 100%
rename from harness/run_validation.sh
rename to swebench/harness/run_validation.sh
diff --git a/harness/utils.py b/swebench/harness/utils.py
similarity index 100%
rename from harness/utils.py
rename to swebench/harness/utils.py
diff --git a/metrics/README.md b/swebench/metrics/README.md
similarity index 100%
rename from metrics/README.md
rename to swebench/metrics/README.md
diff --git a/metrics/conversion.py b/swebench/metrics/conversion.py
similarity index 100%
rename from metrics/conversion.py
rename to swebench/metrics/conversion.py
diff --git a/metrics/getters.py b/swebench/metrics/getters.py
similarity index 100%
rename from metrics/getters.py
rename to swebench/metrics/getters.py
diff --git a/metrics/log_parsers.py b/swebench/metrics/log_parsers.py
similarity index 100%
rename from metrics/log_parsers.py
rename to swebench/metrics/log_parsers.py
diff --git a/metrics/metrics.py b/swebench/metrics/metrics.py
similarity index 100%
rename from metrics/metrics.py
rename to swebench/metrics/metrics.py
diff --git a/metrics/monitor.py b/swebench/metrics/monitor.py
similarity index 100%
rename from metrics/monitor.py
rename to swebench/metrics/monitor.py
diff --git a/metrics/report.py b/swebench/metrics/report.py
similarity index 100%
rename from metrics/report.py
rename to swebench/metrics/report.py
diff --git a/versioning/README.md b/swebench/versioning/README.md
similarity index 100%
rename from versioning/README.md
rename to swebench/versioning/README.md
diff --git a/versioning/constants.py b/swebench/versioning/constants.py
similarity index 100%
rename from versioning/constants.py
rename to swebench/versioning/constants.py
diff --git a/versioning/extract_web/get_versions_astropy.py b/swebench/versioning/extract_web/get_versions_astropy.py
similarity index 100%
rename from versioning/extract_web/get_versions_astropy.py
rename to swebench/versioning/extract_web/get_versions_astropy.py
diff --git a/versioning/extract_web/get_versions_matplotlib.py b/swebench/versioning/extract_web/get_versions_matplotlib.py
similarity index 100%
rename from versioning/extract_web/get_versions_matplotlib.py
rename to swebench/versioning/extract_web/get_versions_matplotlib.py
diff --git a/versioning/extract_web/get_versions_pvlib-python.py b/swebench/versioning/extract_web/get_versions_pvlib-python.py
similarity index 100%
rename from versioning/extract_web/get_versions_pvlib-python.py
rename to swebench/versioning/extract_web/get_versions_pvlib-python.py
diff --git a/versioning/extract_web/get_versions_pydicom.py b/swebench/versioning/extract_web/get_versions_pydicom.py
similarity index 100%
rename from versioning/extract_web/get_versions_pydicom.py
rename to swebench/versioning/extract_web/get_versions_pydicom.py
diff --git a/versioning/extract_web/get_versions_sqlfluff.py b/swebench/versioning/extract_web/get_versions_sqlfluff.py
similarity index 100%
rename from versioning/extract_web/get_versions_sqlfluff.py
rename to swebench/versioning/extract_web/get_versions_sqlfluff.py
diff --git a/versioning/extract_web/get_versions_xarray.py b/swebench/versioning/extract_web/get_versions_xarray.py
similarity index 100%
rename from versioning/extract_web/get_versions_xarray.py
rename to swebench/versioning/extract_web/get_versions_xarray.py
diff --git a/versioning/get_versions.py b/swebench/versioning/get_versions.py
similarity index 100%
rename from versioning/get_versions.py
rename to swebench/versioning/get_versions.py
diff --git a/versioning/run_get_versions.sh b/swebench/versioning/run_get_versions.sh
similarity index 100%
rename from versioning/run_get_versions.sh
rename to swebench/versioning/run_get_versions.sh
diff --git a/versioning/utils.py b/swebench/versioning/utils.py
similarity index 100%
rename from versioning/utils.py
rename to swebench/versioning/utils.py

From 11349c09dceebd2c0124e45f6f36b69884ece1ce Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Thu, 28 Mar 2024 12:49:35 -0400
Subject: [PATCH 2/6] Update all swebench package paths

---
 swebench/__init__.py                   |  2 +-
 swebench/collect/build_dataset.py      |  3 +--
 swebench/collect/get_tasks_pipeline.py |  4 ++--
 swebench/collect/print_pulls.py        |  2 +-
 swebench/harness/context_manager.py    |  8 ++++----
 swebench/harness/engine_evaluation.py  | 12 ++++++------
 swebench/harness/engine_validation.py  |  4 ++--
 swebench/harness/run_evaluation.py     | 10 +++++-----
 swebench/harness/utils.py              |  8 ++++----
 swebench/metrics/conversion.py         |  4 ++--
 swebench/metrics/getters.py            |  2 +-
 swebench/metrics/metrics.py            |  5 ++++-
 swebench/metrics/monitor.py            |  5 ++---
 swebench/metrics/report.py             |  6 +++---
 14 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/swebench/__init__.py b/swebench/__init__.py
index 095ebbee..bff3814d 100644
--- a/swebench/__init__.py
+++ b/swebench/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.6.8.1"
+__version__ = "0.6.8.2"
 
 from swebench.collect.build_dataset import main as build_dataset
 from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline
diff --git a/swebench/collect/build_dataset.py b/swebench/collect/build_dataset.py
index c6e4ac6a..17d78fc4 100755
--- a/swebench/collect/build_dataset.py
+++ b/swebench/collect/build_dataset.py
@@ -6,8 +6,7 @@
 import os
 from typing import Optional
 
-
-from utils import Repo, extract_patches, extract_problem_statement_and_hints
+from swebench.collect.utils import Repo, extract_patches, extract_problem_statement_and_hints
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
diff --git a/swebench/collect/get_tasks_pipeline.py b/swebench/collect/get_tasks_pipeline.py
index 2f0fb961..0de0f5c0 100755
--- a/swebench/collect/get_tasks_pipeline.py
+++ b/swebench/collect/get_tasks_pipeline.py
@@ -6,9 +6,9 @@
 import traceback
 
 from dotenv import load_dotenv
-from build_dataset import main as build_dataset
-from print_pulls import main as print_pulls
 from multiprocessing import Pool
+from swebench.collect.build_dataset import main as build_dataset
+from swebench.collect.print_pulls import main as print_pulls
 
 
 load_dotenv()
diff --git a/swebench/collect/print_pulls.py b/swebench/collect/print_pulls.py
index ddd5c8d3..3444936c 100755
--- a/swebench/collect/print_pulls.py
+++ b/swebench/collect/print_pulls.py
@@ -9,7 +9,7 @@
 from typing import Optional
 
 from fastcore.xtras import obj2dict
-from utils import Repo
+from swebench.collect.utils import Repo
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
diff --git a/swebench/harness/context_manager.py b/swebench/harness/context_manager.py
index ae5e804f..596bf7f8 100644
--- a/swebench/harness/context_manager.py
+++ b/swebench/harness/context_manager.py
@@ -1,6 +1,6 @@
 import logging, os, platform, subprocess
 
-from constants import (
+from swebench.harness.constants import (
     APPLY_PATCH_FAIL,
     APPLY_PATCH_PASS,
     INSTALL_FAIL,
@@ -17,15 +17,15 @@
     TESTS_TIMEOUT,
     TESTS_ERROR,
 )
-from tempfile import TemporaryDirectory
-from traceback import format_exc
-from utils import (
+from swebench.harness.utils import (
     clone_repo,
     get_conda_env_names,
     get_environment_yml,
     get_requirements,
     get_test_directives,
 )
+from tempfile import TemporaryDirectory
+from traceback import format_exc
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
diff --git a/swebench/harness/engine_evaluation.py b/swebench/harness/engine_evaluation.py
index d6d085b9..db68d8c8 100644
--- a/swebench/harness/engine_evaluation.py
+++ b/swebench/harness/engine_evaluation.py
@@ -1,21 +1,21 @@
 import argparse, os, re
 
-from constants import (
+from multiprocessing import Pool, cpu_count
+from swebench.harness.constants import (
     APPLY_PATCH_FAIL,
     KEY_INSTANCE_ID,
     KEY_MODEL,
     KEY_PREDICTION,
 )
-from context_manager import TaskEnvContextManager
-from engine_validation import setup_testbed
-from multiprocessing import Pool, cpu_count
-from tqdm.auto import tqdm
-from utils import (
+from swebench.harness.context_manager import TaskEnvContextManager
+from swebench.harness.engine_validation import setup_testbed
+from swebench.harness.utils import (
     extract_minimal_patch,
     get_instances,
     split_instances,
     DotDict
 )
+from tqdm.auto import tqdm
 
 
 def overwrite_ablation(tcm: TaskEnvContextManager, task_instance: dict):
diff --git a/swebench/harness/engine_validation.py b/swebench/harness/engine_validation.py
index 3ed1028b..b9864594 100644
--- a/swebench/harness/engine_validation.py
+++ b/swebench/harness/engine_validation.py
@@ -1,8 +1,8 @@
 import argparse, os
 
-from context_manager import TaskEnvContextManager, TestbedContextManager
 from multiprocessing import Pool, cpu_count
-from utils import get_instances, split_instances, DotDict
+from swebench.harness.context_manager import TaskEnvContextManager, TestbedContextManager
+from swebench.harness.utils import get_instances, split_instances, DotDict
 
 
 SKIP_INSTANCES = {"pytest-dev/pytest": ["6387", "7956", "3805"]}
diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py
index 29a0387f..c6ec100d 100755
--- a/swebench/harness/run_evaluation.py
+++ b/swebench/harness/run_evaluation.py
@@ -9,15 +9,15 @@
 import os
 import shutil
 
-from constants import (
+from datasets import load_dataset
+from multiprocessing import Pool
+from swebench.harness.constants import (
     KEY_INSTANCE_ID,
     KEY_MODEL,
     KEY_PREDICTION,
 )
-from datasets import load_dataset
-from engine_evaluation import main as eval_engine
-from multiprocessing import Pool
-from utils import get_instances
+from swebench.harness.engine_evaluation import main as eval_engine
+from swebench.harness.utils import get_instances
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py
index 00e7784d..d03584e9 100644
--- a/swebench/harness/utils.py
+++ b/swebench/harness/utils.py
@@ -4,15 +4,15 @@
 import requests
 import subprocess
 
-from constants import (
+from datetime import datetime
+from dotenv import load_dotenv
+from git import Repo
+from swebench.harness.constants import (
     MAP_REPO_TO_REQS_PATHS,
     MAP_REPO_TO_ENV_YML_PATHS,
     SWE_BENCH_URL_RAW,
     NON_TEST_EXTS,
 )
-from datetime import datetime
-from dotenv import load_dotenv
-from git import Repo
 
 
 load_dotenv()
diff --git a/swebench/metrics/conversion.py b/swebench/metrics/conversion.py
index 10d81fc6..d546dff5 100644
--- a/swebench/metrics/conversion.py
+++ b/swebench/metrics/conversion.py
@@ -1,7 +1,7 @@
 import json, os
 
-from log_parsers import MAP_REPO_TO_PARSER, TestStatus
-from getters import (
+from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus
+from swebench.metrics.getters import (
     get_file_name_from_lp,
     get_repo_from_lp,
     log_path_to_sms,
diff --git a/swebench/metrics/getters.py b/swebench/metrics/getters.py
index cdd190a4..b980475a 100644
--- a/swebench/metrics/getters.py
+++ b/swebench/metrics/getters.py
@@ -1,6 +1,6 @@
 import re
 
-from log_parsers import MAP_REPO_TO_PARSER, TestStatus
+from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus
 
 
 # Evaluation Log Constants
diff --git a/swebench/metrics/metrics.py b/swebench/metrics/metrics.py
index 6f817883..cfc4ba8c 100644
--- a/swebench/metrics/metrics.py
+++ b/swebench/metrics/metrics.py
@@ -1,6 +1,9 @@
 from enum import Enum
-from getters import FAIL_TO_FAIL, FAIL_TO_PASS, PASS_TO_FAIL, PASS_TO_PASS
 from statistics import mean
+from swebench.metrics.getters import (
+    FAIL_TO_FAIL, FAIL_TO_PASS,
+    PASS_TO_FAIL, PASS_TO_PASS,
+)
 
 
 class ResolvedStatus(Enum):
diff --git a/swebench/metrics/monitor.py b/swebench/metrics/monitor.py
index 37e66b7d..3aa2bc89 100644
--- a/swebench/metrics/monitor.py
+++ b/swebench/metrics/monitor.py
@@ -1,12 +1,11 @@
 import glob
 import os
 
-
-from log_parsers import MAP_REPO_TO_PARSER
-from getters import (
+from swebench.metrics.getters import (
     log_path_to_sms, get_diffs, get_repo_from_lp,
     APPLY_PATCH_FAIL, APPLY_PATCH_PASS, TESTS_TIMEOUT
 )
+from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER
 
 
 def monitor_validation(
diff --git a/swebench/metrics/report.py b/swebench/metrics/report.py
index a0ab304e..f8e3962a 100644
--- a/swebench/metrics/report.py
+++ b/swebench/metrics/report.py
@@ -1,7 +1,7 @@
 import glob, json, os
 
 from collections import Counter
-from getters import (
+from swebench.metrics.getters import (
     get_file_name_from_lp,
     get_logs_eval,
     get_id_from_lp,
@@ -12,8 +12,8 @@
     test_failed,
     test_passed,
 )
-from log_parsers import TestStatus
-from metrics import (
+from swebench.metrics.log_parsers import TestStatus
+from swebench.metrics.metrics import (
     compute_fail_to_pass_unweighted,
     compute_fail_to_pass_weighted,
     compute_pass_to_pass_unweighted,

From 1fcd6d5f216163bd1bc70a000f3e94598161271a Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Thu, 28 Mar 2024 13:23:39 -0400
Subject: [PATCH 3/6] Add missing inits + fix minor internal import issue

---
 swebench/__init__.py                | 18 +++++++++++++++++-
 swebench/collect/__init__.py        |  0
 swebench/harness/__init__.py        |  0
 swebench/metrics/__init__.py        |  0
 swebench/versioning/__init__.py     |  0
 swebench/versioning/get_versions.py |  5 ++---
 6 files changed, 19 insertions(+), 4 deletions(-)
 create mode 100644 swebench/collect/__init__.py
 create mode 100644 swebench/harness/__init__.py
 create mode 100644 swebench/metrics/__init__.py
 create mode 100644 swebench/versioning/__init__.py

diff --git a/swebench/__init__.py b/swebench/__init__.py
index bff3814d..284de1cf 100644
--- a/swebench/__init__.py
+++ b/swebench/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.6.8.2"
+__version__ = "0.6.8.4"
 
 from swebench.collect.build_dataset import main as build_dataset
 from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline
@@ -57,4 +57,20 @@
     get_eval_reports_for_dir,
     get_model_eval_summary,
     get_model_report,
+)
+
+from swebench.versioning.constants import (
+    MAP_REPO_TO_VERSION_PATHS,
+    MAP_REPO_TO_VERSION_PATTERNS,
+)
+
+from swebench.versioning.get_versions import (
+    get_version,
+    map_version_to_task_instances,
+    get_versions_from_build,
+    get_versions_from_web,
+)
+
+from swebench.versioning.utils import (
+    split_instances,
 )
\ No newline at end of file
diff --git a/swebench/collect/__init__.py b/swebench/collect/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/swebench/harness/__init__.py b/swebench/harness/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/swebench/metrics/__init__.py b/swebench/metrics/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/swebench/versioning/__init__.py b/swebench/versioning/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/swebench/versioning/get_versions.py b/swebench/versioning/get_versions.py
index 5b0825eb..cfc3f8a0 100644
--- a/swebench/versioning/get_versions.py
+++ b/swebench/versioning/get_versions.py
@@ -2,13 +2,12 @@
 
 from multiprocessing import Pool, Manager
 
-sys.path.append("../harness")
-from constants import (
+from swebench.versioning.constants import (
     SWE_BENCH_URL_RAW,
     MAP_REPO_TO_VERSION_PATHS,
     MAP_REPO_TO_VERSION_PATTERNS,
 )
-from utils import get_instances, split_instances
+from swebench.versioning.utils import get_instances, split_instances
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"

From 0c58da1f24efc1477a7b2efe683f5dd8cf998419 Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Sun, 31 Mar 2024 23:12:13 -0400
Subject: [PATCH 4/6] Add conda link resolution logic to harness

---
 swebench/harness/constants.py         | 106 +++++++++++++++++++++++++-
 swebench/harness/context_manager.py   |  88 ++++++++++++++-------
 swebench/harness/engine_validation.py |  16 ++--
 swebench/harness/utils.py             |  12 ++-
 4 files changed, 182 insertions(+), 40 deletions(-)

diff --git a/swebench/harness/constants.py b/swebench/harness/constants.py
index 874dec86..b46f9811 100644
--- a/swebench/harness/constants.py
+++ b/swebench/harness/constants.py
@@ -3,6 +3,9 @@
         "python": "3.6",
         "packages": "numpy scipy cython pytest pandas matplotlib",
         "install": "pip install -v --no-use-pep517 --no-build-isolation -e .",
+        "arch_specific_packages": {
+            "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make",
+        }
     }
     for k in ["0.20", "0.21", "0.22"]
 }
@@ -12,6 +15,9 @@
             "python": "3.7",
             "packages": "numpy scipy cython pytest pandas matplotlib",
             "install": "pip install -v --no-use-pep517 --no-build-isolation -e .",
+            "arch_specific_packages": {
+                "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make",
+            }
         }
         for k in ["0.23", "0.24"]
     }
@@ -22,6 +28,9 @@
             "python": "3.9",
             "packages": "numpy scipy cython pytest pandas matplotlib joblib threadpoolctl",
             "install": "pip install -v --no-use-pep517 --no-build-isolation -e .",
+            "arch_specific_packages": {
+                "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make",
+            }
         }
         for k in ["1.0", "1.1", "1.2", "1.3", "1.4"]
     }
@@ -50,11 +59,13 @@
         "python": "3.9",
         "packages": "requirements.txt",
         "install": "pip install -e .",
+        "pip_packages": "Werkzeug==2.2.2",
     },
     "2.1": {
         "python": "3.10",
         "packages": "requirements.txt",
         "install": "pip install -e .",
+        "pip_packages": "Werkzeug==2.2.2",
     },
 }
 MAP_VERSION_TO_INSTALL_FLASK.update(
@@ -74,6 +85,7 @@
             "python": "3.11",
             "packages": "requirements.txt",
             "install": "pip install -e .",
+            "pip_packages": "Werkzeug==2.2.2"
         }
         for k in ["2.2", "2.3"]
     }
@@ -162,9 +174,27 @@
 
 MAP_VERSION_TO_INSTALL_MATPLOTLIB = {
     k: {
-        "python": "3.9",
+        "python": "3.11",
         "packages": "environment.yml",
         "install": "python -m pip install -e .",
+        "pip_packages": " ".join([
+            "contourpy==1.1.0",
+            "cycler==0.11.0",
+            "fonttools==4.42.1",
+            "kiwisolver==1.4.5",
+            "numpy==1.25.2",
+            "packaging==23.1",
+            "pillow==10.0.0",
+            "pyparsing==3.0.9",
+            "python-dateutil==2.8.2",
+            "six==1.16.0",
+            "setuptools==68.1.2",
+            "setuptools-scm==7.1.0",
+            "typing-extensions==4.7.1",
+        ]),
+        "arch_specific_packages": {
+            "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make",
+        }
     }
     for k in ["3.5", "3.6", "3.7"]
 }
@@ -174,6 +204,9 @@
             "python": "3.8",
             "packages": "requirements.txt",
             "install": "python -m pip install -e .",
+            "arch_specific_packages": {
+                "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make",
+            }
         }
         for k in ["3.1", "3.2", "3.3", "3.4"]
     }
@@ -184,6 +217,9 @@
             "python": "3.7",
             "packages": "requirements.txt",
             "install": "python -m pip install -e .",
+            "arch_specific_packages": {
+                "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make",
+            }
         }
         for k in ["3.0"]
     }
@@ -193,6 +229,9 @@
         k: {
             "python": "3.5",
             "install": "python setup.py build; python setup.py install",
+            "arch_specific_packages": {
+                "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make",
+            }
         }
         for k in ["2.0", "2.1", "2.2", "1.0", "1.1", "1.2", "1.3", "1.4", "1.5"]
     }
@@ -204,15 +243,50 @@
         "pip_packages": "tox",
         "install": "pip install -e .[test]",
         "pre_install": ["sed -i 's/pytest/pytest -rA/' tox.ini"],
+        "arch_specific_packages": {
+            "aarch64": "gxx_linux-aarch64 gcc_linux-aarch64 make",
+            "x86_64": "gxx_linux-64 gcc_linux-64 make",
+        }
     } for k in
         ["1.5", "1.6", "1.7", "1.8", "2.0", "2.1", "2.2", "2.3", "2.4", "3.0"] + \
         ["3.1", "3.2", "3.3", "3.4", "3.5", "4.0", "4.1", "4.2", "4.3", "4.4"] + \
         ["4.5", "5.0", "5.1", "5.2", "5.3", "6.0", "6.2", "7.0", "7.1", "7.2"]
 }
-for k in ["3.0", "3.1", "3.2", "3.3", "3.4", "3.5", "4.0"]:
+for k in ["3.0", "3.1", "3.2", "3.3", "3.4", "3.5", "4.0", "4.1", "4.2", "4.3", "4.4"]:
     MAP_VERSION_TO_INSTALL_SPHINX[k][
         "pre_install"
-    ].append("sed -i 's/Jinja2>=2.3/Jinja2<3.1/' setup.py")
+    ].extend([
+        "sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
+        "sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp<=1.0.7/' setup.py",
+        "sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp<=1.0.5/' setup.py",
+        "sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp<=1.0.6/' setup.py",
+        "sed -i 's/alabaster>=0.7,<0.8/alabaster>=0.7,<0.7.12/' setup.py",
+        'sed -i "s/\'packaging\',/\'packaging\', \'markupsafe<=2.0.1\',/" setup.py',
+    ])
+    if k in ["4.2", "4.3", "4.4"]:
+        MAP_VERSION_TO_INSTALL_SPHINX[k]["pre_install"].extend([
+            "sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py",
+            "sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py",
+        ])
+    elif k == "4.1":
+        MAP_VERSION_TO_INSTALL_SPHINX[k]["pre_install"].extend([
+            (
+                "grep -q 'sphinxcontrib-htmlhelp>=2.0.0' setup.py && "
+                "sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py || "
+                "sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py"
+            ),
+            (
+                "grep -q 'sphinxcontrib-serializinghtml>=1.1.5' setup.py && "
+                "sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py || "
+                "sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py"
+            )
+        ])
+    else:
+        MAP_VERSION_TO_INSTALL_SPHINX[k]["pre_install"].extend([
+            "sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py",
+            "sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py",
+        ])
+
 
 MAP_VERSION_TO_INSTALL_ASTROPY = {
     k: {"python": "3.9", "install": "pip install -e .[test]"}
@@ -247,6 +321,10 @@
     k: {"python": "3.9", "packages": "requirements.txt", "install": "pip install -e ."}
     for k in ["2.10", "2.11", "2.13", "2.14", "2.15", "2.16", "2.17", "2.8", "2.9", "3.0"]
 }
+MAP_VERSION_TO_INSTALL_PYLINT.update({
+    k: {**MAP_VERSION_TO_INSTALL_PYLINT[k], "pip_packages": " ".join([
+        "astroid==3.0.0a7"
+    ])} for k in ['3.0']})
 
 MAP_VERSION_TO_INSTALL_XARRAY = {
     k: {
@@ -472,4 +550,24 @@
 
 # Constants - Miscellaneous
 NON_TEST_EXTS = [".json", ".png", "csv", ".txt", ".md", ".jpg", ".jpeg", ".pkl", ".yml", ".yaml", ".toml"]
-SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/"
\ No newline at end of file
+SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/"
+
+# Constants - Repo/Version Mapped to Appropriate Conda Link
+MAP_REPO_VERSION_TO_CONDA_LINK = {
+    "django/django": {
+        "1.11": "py311_23.9.0-0",
+    },
+    "matplotlib/matplotlib": {
+        "3.1": "py311_23.9.0-0",
+        "3.2": "py311_23.9.0-0",
+        "3.3": "py311_23.9.0-0",
+        "3.4": "py311_23.9.0-0",
+        "3.0": "py311_23.10.0-1",
+    },
+    "mwaskom/seaborn": {"0.11": None, "0.12": None, "0.13": None},
+    "sympy/sympy": {
+        "1.0": "py39_23.9.0-0",
+    },
+}
+
+DEFAULT_CONDA_LINK = "py39_23.10.0-1"
\ No newline at end of file
diff --git a/swebench/harness/context_manager.py b/swebench/harness/context_manager.py
index 596bf7f8..db20af2b 100644
--- a/swebench/harness/context_manager.py
+++ b/swebench/harness/context_manager.py
@@ -3,6 +3,7 @@
 from swebench.harness.constants import (
     APPLY_PATCH_FAIL,
     APPLY_PATCH_PASS,
+    DEFAULT_CONDA_LINK,
     INSTALL_FAIL,
     INSTALL_PASS,
     INSTALL_TIMEOUT,
@@ -10,6 +11,7 @@
     KEY_MODEL,
     MAP_REPO_TO_INSTALL,
     MAP_REPO_TO_TEST_FRAMEWORK,
+    MAP_REPO_VERSION_TO_CONDA_LINK,
     MAP_VERSION_TO_INSTALL,
     RESET_FAILED,
     TESTS_FAILED,
@@ -76,6 +78,7 @@ def __init__(
         Args:
             task_instances (list): List of task instances
             log_dir (str): Path to log directory
+            conda_link (str): URL to conda installation to use
             path_conda (str): Path to conda installation
             testbed (str): Path to testbed directory
             verbose (bool): Whether to show logs
@@ -186,16 +189,35 @@ def __enter__(self):
             # Download Miniconda installer
             if self.conda_link is not None:
                 cmd_line_install_link = self.conda_link
-            elif platform.system() == "Darwin":
-                cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-MacOSX-x86_64.sh"
-                if is_osx_64:
-                    cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-MacOSX-arm64.sh"
-            elif platform.system() == "Linux":
-                cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-x86_64.sh"
-                if platform.machine() == "aarch64":
-                    cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-aarch64.sh"
             else:
-                raise ValueError("Unknown computer platform " + platform.system())
+                cmd_line_install_link = "https://repo.anaconda.com/miniconda/Miniconda3-"
+
+                # Pick the Miniconda release pinned for this repo/version; map keys are "owner/repo", None/missing -> default
+                key, versions = list(self.setup_refs.items())[0]
+                if len(self.setup_refs) == 1 and len(versions) == 1:
+                    owner, repo = key.split("/")
+                    version = list(versions.keys())[0]
+                    logger_testbed.info(f"[Testbed] {repo}/{version} instances in a single process")
+                    conda_id = MAP_REPO_VERSION_TO_CONDA_LINK.get(key, {}).get(version) or DEFAULT_CONDA_LINK
+                    cmd_line_install_link += conda_id
+                    logger_testbed.info(f"[Testbed] {repo}/{version} using Miniconda link: {cmd_line_install_link}")
+                else:
+                    cmd_line_install_link += DEFAULT_CONDA_LINK
+                    logger_testbed.info(f"[Testbed] Multiple repos/versions; using Miniconda link: {cmd_line_install_link}")
+
+                if platform.system() == "Darwin":
+                    if is_osx_64:
+                        cmd_line_install_link += "-MacOSX-arm64.sh"
+                    else:
+                        cmd_line_install_link += "-MacOSX-x86_64.sh"
+                elif platform.system() == "Linux":
+                    if platform.machine() == "aarch64":
+                        cmd_line_install_link += "-Linux-aarch64.sh"
+                    else:
+                        cmd_line_install_link += "-Linux-x86_64.sh"
+                else:
+                    raise ValueError("Unknown computer platform " + platform.system())
+
             download_cmd = [
                 "wget",
                 cmd_line_install_link,
@@ -205,7 +227,7 @@ def __enter__(self):
             self.exec(download_cmd)
 
             # Install Miniconda
-            install_cmd = ["bash", miniconda_sh, "-b", "-u", "-p", self.path_conda]
+            install_cmd = ["bash", miniconda_sh, "-b", "-u", "-p", self.path_conda, "&&", "conda", "init", "--all"]
             self.exec(install_cmd)
             if is_osx_64:
                 condabin = os.path.join(self.path_conda, "bin", "conda")
@@ -219,14 +241,10 @@ def __enter__(self):
         # Set up conda executables, get existing environments
         self.path_conda = os.path.abspath(self.path_conda)
         conda_bin_path = os.path.join(self.path_conda, "bin")
-        shellenv = os.environ.copy()
-        shellenv["PATH"] = conda_bin_path + os.pathsep + shellenv["PATH"]
-        self.exec.subprocess_args["env"] = shellenv
 
         path_activate = os.path.join(self.path_conda, "bin", "activate")
-        exec_type = "mamba" if "mamba" in self.path_conda else "conda"
-        exec_cmd = os.path.join(self.path_conda, "bin", exec_type)
-        env_list = get_conda_env_names(exec_cmd, shellenv)
+        exec_cmd = os.path.join(self.path_conda, "bin", "conda")
+        env_list = get_conda_env_names(exec_cmd)
 
         # Set up testbed (environment, github repo) for each repo
         for repo, version_to_setup_ref in self.setup_refs.items():
@@ -284,18 +302,20 @@ def __enter__(self):
 
                     # Install dependencies
                     path_to_reqs = get_requirements(setup_ref_instance, self.testbed)
-                    cmd = f"source {path_activate} {env_name} && echo 'activate successful' && pip install -r {path_to_reqs}"
+                    cmd = f". {path_activate} {env_name} && echo 'activate successful' && pip install -r {path_to_reqs}"
                     logger_testbed.info(
                         f"[Testbed] Installing dependencies for {env_name}; Command: {cmd}"
                     )
                     self.exec(cmd, shell=True)
                     os.remove(path_to_reqs)
                 elif pkgs == "environment.yml":
-                    # Create environment from yml
-                    path_to_reqs = get_environment_yml(
-                        setup_ref_instance, env_name, self.testbed
-                    )
                     if "no_use_env" in install and install["no_use_env"]:
+                        # Create environment from yml
+                        path_to_reqs = get_environment_yml(
+                            setup_ref_instance, env_name,
+                            save_path=self.testbed
+                        )
+
                         # `conda create` based installation
                         cmd = f"{exec_cmd} create -c conda-forge -n {env_name} python={install['python']} -y"
                         logger_testbed.info(
@@ -310,6 +330,13 @@ def __enter__(self):
                         )
                         self.exec(cmd.split(" "))
                     else:
+                        # Create environment from yml
+                        path_to_reqs = get_environment_yml(
+                            setup_ref_instance, env_name,
+                            save_path=self.testbed,
+                            python_version=install["python"]
+                        )
+
                         # `conda env create` based installation
                         cmd = f"{exec_cmd} env create --file {path_to_reqs}"
                         logger_testbed.info(
@@ -326,10 +353,19 @@ def __enter__(self):
                         f"[Testbed] Creating environment {env_name}; Command: {cmd}"
                     )
                     self.exec(cmd.split(" "))
+                
+                arch = platform.machine()
+                arch_specific_packages = install.get("arch_specific_packages", {}).get(arch, "")
+                if arch_specific_packages:
+                    cmd = f". {path_activate} {env_name} && conda install {arch_specific_packages} -y"
+                    logger_testbed.info(
+                        f"[Testbed] Installing arch-specific packages for {env_name}; Command: {cmd}"
+                    )
+                    self.exec(cmd, shell=True)
 
                 # Install additional packages if specified
                 if "pip_packages" in install:
-                    cmd = f"source {path_activate} {env_name} && pip install {install['pip_packages']}"
+                    cmd = f". {path_activate} {env_name} && pip install {install['pip_packages']}"
                     logger_testbed.info(
                         f"[Testbed] Installing pip packages for {env_name}; Command: {cmd}"
                     )
@@ -419,6 +455,7 @@ def __init__(
         logger_taskenv.propagate = verbose
         self.instance = instance
         self.conda_path = conda_path
+        self.conda_cache_dir = os.path.join(self.conda_path, "cache")
         self.cwd = os.getcwd()
         self.is_eval = is_eval
         self.testbed = testbed
@@ -440,21 +477,18 @@ def __init__(
         self.log_file = os.path.join(log_dir, log_file_name)
 
         self.cmd_activate = (
-            f"source {os.path.join(self.conda_path, 'bin', 'activate')} "
+            f". {os.path.join(self.conda_path, 'bin', 'activate')} "
             + f"{self.venv} && echo 'activate successful'"
         )
         self.timeout = timeout
 
-        shellenv = os.environ.copy()
-        condabinpath = os.path.join(self.conda_path, "bin")
-        shellenv["PATH"] = condabinpath + os.pathsep + shellenv["PATH"]
         self.exec = ExecWrapper(
             subprocess_args={
                 "check": True,
                 "shell": False,
                 "capture_output": True,
                 "text": True,
-                "env": shellenv,
+                "env": {"CONDA_PKGS_DIRS": self.conda_cache_dir},
             }
         )
 
diff --git a/swebench/harness/engine_validation.py b/swebench/harness/engine_validation.py
index b9864594..06cfe1f8 100644
--- a/swebench/harness/engine_validation.py
+++ b/swebench/harness/engine_validation.py
@@ -77,14 +77,14 @@ def setup_testbed(data: dict):
 
     Args:
         data: Dict containing task instances and other data
-            conda_link: URL to conda installation to use
-            task_instances: List of task instances
-            log_dir: Path to log directory
-            path_conda: Path to miniconda3 or anaconda installation
-            testbed: Path to testbed directory
-            temp_dir: Path to temporary directory for storing virtual envs
-            timeout: Timeout (seconds) for testing script execution
-            verbose: Verbose mode
+        conda_link: URL to conda installation to use
+        task_instances: List of task instances
+        log_dir: Path to log directory
+        path_conda: Path to miniconda3 or anaconda installation
+        testbed: Path to testbed directory
+        temp_dir: Path to temporary directory for storing virtual envs
+        timeout: Timeout (seconds) for testing script execution
+        verbose: Verbose mode
     """
     data_dict = DotDict(data)
     with TestbedContextManager(
diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py
index 3e5f02b0..235acd46 100644
--- a/swebench/harness/utils.py
+++ b/swebench/harness/utils.py
@@ -57,7 +57,12 @@ def get_conda_env_names(conda_source: str, env: dict = None) -> list:
     return env_names
 
 
-def get_environment_yml(instance: dict, env_name: str, save_path: str = None) -> str:
+def get_environment_yml(
+        instance: dict,
+        env_name: str,
+        save_path: str = None,
+        python_version: str = None,
+    ) -> str:
     """
     Get environment.yml for given task instance
 
@@ -94,6 +99,11 @@ def get_environment_yml(instance: dict, env_name: str, save_path: str = None) ->
         if line.startswith("name:"):
             cleaned.append(f"name: {env_name}")
             continue
+        if line.startswith("dependencies:"):
+            cleaned.append(line)
+            if python_version is not None:
+                cleaned.append(f"  - python={python_version}")
+            continue
         cleaned.append(line)
 
     # Return environment.yml as string if no save path given

From 68e89ef8d099ca5c23a8fd5681e3f990cf729fd6 Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Sun, 31 Mar 2024 23:17:29 -0400
Subject: [PATCH 5/6] Add `datasets` as dependency

---
 setup.py             | 1 +
 swebench/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 73f771b8..2c29a347 100644
--- a/setup.py
+++ b/setup.py
@@ -30,6 +30,7 @@
     install_requires=[
         'beautifulsoup4',
         'chardet',
+        'datasets',
         'ghapi',
         'GitPython',
         'python-dotenv',
diff --git a/swebench/__init__.py b/swebench/__init__.py
index 284de1cf..d1f4a745 100644
--- a/swebench/__init__.py
+++ b/swebench/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.6.8.4"
+__version__ = "1.0.1"
 
 from swebench.collect.build_dataset import main as build_dataset
 from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline

From 58d24d1b65b95ed96d57805604aca7adca49861d Mon Sep 17 00:00:00 2001
From: John Yang <byjohnyang@gmail.com>
Date: Mon, 1 Apr 2024 12:43:59 -0400
Subject: [PATCH 6/6] Add install fail logging

---
 swebench/metrics/constants.py   | 29 +++++++++++++++++
 swebench/metrics/conversion.py  | 13 +++++---
 swebench/metrics/getters.py     | 30 +++++++-----------
 swebench/metrics/log_parsers.py |  9 +-----
 swebench/metrics/metrics.py     | 16 +++-------
 swebench/metrics/monitor.py     | 15 ++++++---
 swebench/metrics/report.py      | 55 ++++++++++++++++++++-------------
 7 files changed, 99 insertions(+), 68 deletions(-)
 create mode 100644 swebench/metrics/constants.py

diff --git a/swebench/metrics/constants.py b/swebench/metrics/constants.py
new file mode 100644
index 00000000..654435b0
--- /dev/null
+++ b/swebench/metrics/constants.py
@@ -0,0 +1,29 @@
+from enum import Enum
+
+# Evaluation Log Constants
+APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed"
+APPLY_PATCH_PASS = ">>>>> Applied Patch"
+INSTALL_FAIL = ">>>>> Init Failed"
+INSTALL_PASS = ">>>>> Init Succeeded"
+RESET_FAILED = ">>>>> Reset Failed"
+TESTS_ERROR = ">>>>> Tests Errored"
+TESTS_TIMEOUT = ">>>>> Tests Timed Out"
+
+# Result Categories
+FAIL_TO_PASS = "FAIL_TO_PASS"
+FAIL_TO_FAIL = "FAIL_TO_FAIL"
+PASS_TO_PASS = "PASS_TO_PASS"
+PASS_TO_FAIL = "PASS_TO_FAIL"
+
+# Test Status Enum
+class TestStatus(Enum):
+    FAILED = "FAILED"
+    PASSED = "PASSED"
+    SKIPPED = "SKIPPED"
+    ERROR = "ERROR"
+
+# Resolved Status Enum
+class ResolvedStatus(Enum):
+    NO = "RESOLVED_NO"
+    PARTIAL = "RESOLVED_PARTIAL"
+    FULL = "RESOLVED_FULL"
diff --git a/swebench/metrics/conversion.py b/swebench/metrics/conversion.py
index d546dff5..0d158039 100644
--- a/swebench/metrics/conversion.py
+++ b/swebench/metrics/conversion.py
@@ -1,17 +1,20 @@
 import json, os
 
-from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus
-from swebench.metrics.getters import (
-    get_file_name_from_lp,
-    get_repo_from_lp,
-    log_path_to_sms,
+from swebench.metrics.constants import (
     FAIL_TO_PASS,
     FAIL_TO_FAIL,
     PASS_TO_PASS,
     PASS_TO_FAIL,
+    TestStatus,
+)
+from swebench.metrics.getters import (
+    get_file_name_from_lp,
+    get_repo_from_lp,
+    log_path_to_sms,
     test_failed,
     test_passed,
 )
+from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER
 
 
 def convert_log_to_ground_truth(
diff --git a/swebench/metrics/getters.py b/swebench/metrics/getters.py
index b980475a..2c3ab3b6 100644
--- a/swebench/metrics/getters.py
+++ b/swebench/metrics/getters.py
@@ -1,22 +1,14 @@
 import re
 
+from swebench.metrics.constants import (
+    APPLY_PATCH_FAIL,
+    APPLY_PATCH_PASS,
+    RESET_FAILED,
+    TESTS_ERROR,
+    TESTS_TIMEOUT,
+)
 from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER, TestStatus
-
-
-# Evaluation Log Constants
-APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed"
-APPLY_PATCH_PASS = ">>>>> Applied Patch"
-INSTALL_FAIL = ">>>>> Init Failed"
-INSTALL_PASS = ">>>>> Init Succeeded"
-RESET_FAILED = ">>>>> Reset Failed"
-TESTS_TIMEOUT = ">>>>> Tests Timed Out"
-TESTS_ERROR = ">>>>> Tests Errored"
-
-# Result Categories
-FAIL_TO_PASS = "FAIL_TO_PASS"
-FAIL_TO_FAIL = "FAIL_TO_FAIL"
-PASS_TO_PASS = "PASS_TO_PASS"
-PASS_TO_FAIL = "PASS_TO_FAIL"
+from typing import Tuple
 
 
 def get_diffs(sm_1: dict, sm_2: dict) -> dict:
@@ -41,7 +33,7 @@ def get_diffs(sm_1: dict, sm_2: dict) -> dict:
     return diff_map
 
 
-def get_logs_eval(log_fp: str) -> (dict, bool):
+def get_logs_eval(log_fp: str) -> Tuple[dict, bool]:
     """
     Retrieve evaluation results for a task instance from its corresponding log file
 
@@ -65,7 +57,7 @@ def get_logs_eval(log_fp: str) -> (dict, bool):
         return log_parser(content), True
 
 
-def get_logs_gold(log_fp: str) -> (str, str):
+def get_logs_gold(log_fp: str) -> Tuple[str, str]:
     """
     Retrieve pre-patch, post-patch test logs from a validation log file
 
@@ -92,7 +84,7 @@ def get_logs_gold(log_fp: str) -> (str, str):
 get_repo_from_lp = lambda x: get_id_from_lp(x).rsplit("-", 1)[0].replace("__", "/")
 
 
-def log_path_to_sms(log_fp: str, log_parser) -> (list, bool):
+def log_path_to_sms(log_fp: str, log_parser) -> Tuple[list, bool]:
     """
     Wrapper for getting log data from log_parser file
 
diff --git a/swebench/metrics/log_parsers.py b/swebench/metrics/log_parsers.py
index cdb4f3fa..13869918 100644
--- a/swebench/metrics/log_parsers.py
+++ b/swebench/metrics/log_parsers.py
@@ -1,13 +1,6 @@
 import re
 
-from enum import Enum
-
-
-class TestStatus(Enum):
-    FAILED = "FAILED"
-    PASSED = "PASSED"
-    SKIPPED = "SKIPPED"
-    ERROR = "ERROR"
+from swebench.metrics.constants import TestStatus
 
 
 def parse_log_pytest(log: str) -> dict:
diff --git a/swebench/metrics/metrics.py b/swebench/metrics/metrics.py
index cfc4ba8c..eef133a1 100644
--- a/swebench/metrics/metrics.py
+++ b/swebench/metrics/metrics.py
@@ -1,17 +1,11 @@
-from enum import Enum
 from statistics import mean
-from swebench.metrics.getters import (
-    FAIL_TO_FAIL, FAIL_TO_PASS,
-    PASS_TO_FAIL, PASS_TO_PASS,
+from swebench.metrics.constants import (
+    FAIL_TO_PASS,
+    PASS_TO_PASS,
+    ResolvedStatus,
 )
 
 
-class ResolvedStatus(Enum):
-    NO = "RESOLVED_NO"
-    PARTIAL = "RESOLVED_PARTIAL"
-    FULL = "RESOLVED_FULL"
-
-
 def compute_fail_to_pass(report: dict) -> float:
     """
     Compute fail-to-pass metric. Accepts single report as argument.
@@ -94,4 +88,4 @@ def get_resolution_status(report: dict) -> str:
     elif f2p < 1 and f2p > 0 and p2p == 1:
         return ResolvedStatus.PARTIAL.value
     else:
-        return ResolvedStatus.NO.value
\ No newline at end of file
+        return ResolvedStatus.NO.value
diff --git a/swebench/metrics/monitor.py b/swebench/metrics/monitor.py
index 3aa2bc89..dd047ca4 100644
--- a/swebench/metrics/monitor.py
+++ b/swebench/metrics/monitor.py
@@ -1,16 +1,23 @@
 import glob
 import os
 
+from swebench.metrics.constants import (
+    APPLY_PATCH_FAIL,
+    APPLY_PATCH_PASS,
+    TESTS_TIMEOUT
+)
 from swebench.metrics.getters import (
-    log_path_to_sms, get_diffs, get_repo_from_lp,
-    APPLY_PATCH_FAIL, APPLY_PATCH_PASS, TESTS_TIMEOUT
+    log_path_to_sms,
+    get_diffs,
+    get_repo_from_lp,
 )
 from swebench.metrics.log_parsers import MAP_REPO_TO_PARSER
+from typing import Tuple
 
 
 def monitor_validation(
     path_to_logs: str, log_prefix: str = None
-) -> (list, list, list, list):
+) -> Tuple[list, list, list, list, list]:
     """
     Check log files generated from a `check_instances` run to see how many instances were successfully
     installed and/or tested.
@@ -79,7 +86,7 @@ def monitor_validation(
     return failed_install, corrupt_test_patch, corrupt_patch, timeout, success
 
 
-def monitor_logs_same_diff(log_dir: str, repo: str = None) -> (list, list):
+def monitor_logs_same_diff(log_dir: str, repo: str = None) -> Tuple[list, list]:
     """
     Given a log directory and repo, return a list of logs where pre-test
     and post-test logs are same/different
diff --git a/swebench/metrics/report.py b/swebench/metrics/report.py
index f8e3962a..7856b5e5 100644
--- a/swebench/metrics/report.py
+++ b/swebench/metrics/report.py
@@ -1,25 +1,32 @@
 import glob, json, os
 
 from collections import Counter
-from swebench.metrics.getters import (
-    get_file_name_from_lp,
-    get_logs_eval,
-    get_id_from_lp,
+from swebench.harness.constants import (
+    INSTALL_FAIL,
+    KEY_INSTANCE_ID,
+)
+from swebench.metrics.constants import (
     FAIL_TO_FAIL,
     FAIL_TO_PASS,
     PASS_TO_FAIL,
     PASS_TO_PASS,
+)
+from swebench.metrics.getters import (
+    get_file_name_from_lp,
+    get_logs_eval,
+    get_id_from_lp,
     test_failed,
     test_passed,
 )
-from swebench.metrics.log_parsers import TestStatus
 from swebench.metrics.metrics import (
     compute_fail_to_pass_unweighted,
     compute_fail_to_pass_weighted,
     compute_pass_to_pass_unweighted,
     compute_pass_to_pass_weighted,
     get_resolution_status,
+    ResolvedStatus,
 )
+from typing import Tuple
 
 
 ### MARK - Eval Report Generation
@@ -119,7 +126,7 @@ def get_eval_reports_for_logs(
     swe_bench_tasks: str,
     callback: callable = None,
     verbose: bool = False,
-) -> (dict, dict):
+) -> Tuple[dict, dict]:
     """
     Wrapper for getting eval report for a list of evaluation log paths.
 
@@ -135,7 +142,7 @@ def get_eval_reports_for_logs(
     reports_patch_success = {}
     reports_patch_failure = {}
     eval_refs = json.load(open(swe_bench_tasks))
-    eval_refs = {t['instance_id']: t for t in eval_refs}
+    eval_refs = {t[KEY_INSTANCE_ID]: t for t in eval_refs}
 
     for eval_log in eval_logs:
         # Remove task instances that do not satisfy callback
@@ -194,7 +201,7 @@ def get_model_eval_summary(
     eval_dir: str,
     swe_bench_tasks: str,
     repo: str = None,
-):
+) -> dict:
     """
     Generate a summary of model evaluation results.
 
@@ -213,7 +220,7 @@ def get_model_eval_summary(
     # Filter by repo if provided
     criteria_eval_sm = None
     if repo is not None:
-        criteria_pred = lambda pred: repo in pred["instance_id"]
+        criteria_pred = lambda pred: repo in pred[KEY_INSTANCE_ID]
         criteria_eval_sm = lambda eval_log: repo in eval_log
         preds = [x for x in preds if criteria_pred(x)]
 
@@ -257,7 +264,7 @@ def get_model_eval_summary(
 
 def get_model_report(
     model: str, predictions_path: str, swe_bench_tasks: str, log_dir: str
-):
+) -> dict:
     """
     Generate a report of model evaluation results from predictions, task instances,
     and evaluation logs.
@@ -271,8 +278,8 @@ def get_model_report(
         report_map (dict): map of repo to report
     """
     eval_refs = json.load(open(swe_bench_tasks))
-    eval_refs = [{key: t[key] for key in ["instance_id", "FAIL_TO_PASS", "PASS_TO_PASS"]} for t in eval_refs]
-    eval_refs = {t['instance_id']: t for t in eval_refs}
+    eval_refs = [{key: t[key] for key in [KEY_INSTANCE_ID, FAIL_TO_PASS, PASS_TO_PASS]} for t in eval_refs]
+    eval_refs = {t[KEY_INSTANCE_ID]: t for t in eval_refs}
 
     # Get predictions
     predictions = []
@@ -286,37 +293,43 @@ def get_model_report(
 
     # Iterate through predictions
     for p in predictions:
-        repo = p["instance_id"].split(".")[0].rsplit("-", 1)[0].replace("__", "/")
+        repo = p[KEY_INSTANCE_ID].split(".")[0].rsplit("-", 1)[0].replace("__", "/")
         if repo not in report_map:
             report_map[repo] = {
                 "none": [],
                 "generated": [],
                 "with_logs": [],
+                "install_fail": [],
                 "applied": [],
                 "resolved": [],
             }
 
         # Check if the model patch exists
         if p["model_patch"] == None:
-            report_map[repo]["none"].append(p['instance_id'])
+            report_map[repo]["none"].append(p[KEY_INSTANCE_ID])
             continue
-        report_map[repo]["generated"].append(p['instance_id'])
+        report_map[repo]["generated"].append(p[KEY_INSTANCE_ID])
 
         # Get log file
-        log_path = os.path.join(log_dir, f"{p['instance_id']}.{model}.eval.log")
+        log_path = os.path.join(log_dir, f"{p[KEY_INSTANCE_ID]}.{model}.eval.log")
         if not os.path.exists(log_path):
             continue
-        report_map[repo]["with_logs"].append(p['instance_id'])
+        report_map[repo]["with_logs"].append(p[KEY_INSTANCE_ID])
+
+        # Check if install succeeded
+        if INSTALL_FAIL in open(log_path).read():
+            report_map[repo]["install_fail"].append(p[KEY_INSTANCE_ID])
+            continue
 
         # Get evaluation logs
         eval_sm, found = get_logs_eval(log_path)
 
         if not found:
             continue
-        report_map[repo]["applied"].append(p['instance_id'])
+        report_map[repo]["applied"].append(p[KEY_INSTANCE_ID])
 
-        report = get_eval_report(eval_sm, eval_refs[p["instance_id"]])
-        if get_resolution_status(report) == "RESOLVED_FULL":
-            report_map[repo]["resolved"].append(p['instance_id'])
+        report = get_eval_report(eval_sm, eval_refs[p[KEY_INSTANCE_ID]])
+        if get_resolution_status(report) == ResolvedStatus.FULL.value:
+            report_map[repo]["resolved"].append(p[KEY_INSTANCE_ID])
 
     return report_map