Skip to content

Add E2B execution backend #106

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion commit0/harness/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def main(
repo_name = example["repo"].split("/")[-1]
if split != "all" and repo_name not in SPLIT[split]:
continue
spec = make_spec(example, dataset_type)
spec = make_spec(example, dataset_type, absolute=True)
specs.append(spec)

client = docker.from_env()
Expand Down
6 changes: 5 additions & 1 deletion commit0/harness/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,11 @@ def items(self) -> ItemsView[str, object]:
PASS_TO_FAIL = "PASS_TO_FAIL"

# Evaluation backends
EVAL_BACKENDS = ["local", "modal"]
EVAL_BACKENDS = ["local", "modal", "e2b"]
# Use absolute for docker and modal. Backends with sudo access
ABSOLUTE_REPO_DIR = "/testbed"
# Use relative for e2b, with no sudo access
RELATIVE_REPO_DIR = "testbed"

# available commands
COMMANDS = [
Expand Down
4 changes: 2 additions & 2 deletions commit0/harness/docker_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def build_base_images(

"""
# Get the base images to build from the dataset
test_specs = get_specs_from_dataset(dataset, dataset_type)
test_specs = get_specs_from_dataset(dataset, dataset_type, absolute=True)
base_images = {
x.base_image_key: (x.base_dockerfile, x.platform) for x in test_specs
}
Expand Down Expand Up @@ -166,7 +166,7 @@ def get_repo_configs_to_build(

"""
image_scripts = dict()
test_specs = get_specs_from_dataset(dataset, dataset_type)
test_specs = get_specs_from_dataset(dataset, dataset_type, absolute=True)

for test_spec in test_specs:
# Check if the base image exists
Expand Down
68 changes: 68 additions & 0 deletions commit0/harness/execution_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import modal
import modal.io_streams
from enum import auto
from e2b_code_interpreter import Sandbox
from strenum import StrEnum
from pathlib import Path
import time
Expand All @@ -33,6 +34,7 @@
class ExecutionBackend(StrEnum):
LOCAL = auto()
MODAL = auto()
E2B = auto()


class ExecutionContext(ABC):
Expand Down Expand Up @@ -219,3 +221,69 @@ def __exit__(
exctb: Optional[TracebackType],
) -> None:
close_logger(self.logger)


class E2B(ExecutionContext):
    """Execution context that runs the evaluation inside an E2B cloud sandbox.

    Unlike the Docker/Modal backends, an E2B sandbox has no sudo access, so
    all paths used here are relative to the sandbox home directory
    (e.g. ``testbed/`` rather than ``/testbed``).
    """

    def __init__(
        self,
        spec: Spec,
        logger: logging.Logger,
        timeout: int,
        num_cpus: int,
        log_dir: Path,
        files_to_copy: Optional[Files] = None,
        files_to_collect: Optional[list[str]] = None,
        rebuild_image: bool = False,
    ):
        """Create the sandbox, run the repo setup script, and stage eval files.

        Args:
            spec: Test specification (provides ``setup_script``).
            logger: Per-run logger; closed in ``__exit__``.
            timeout: Sandbox lifetime in seconds — E2B kills the sandbox when
                this elapses, which is how a timeout is detected later.
            num_cpus: Unused by E2B; kept for interface parity with Docker/Modal.
            log_dir: Local directory where collected files are written.
            files_to_copy: Host files to upload into the sandbox home dir.
            files_to_collect: Sandbox files (under ``testbed/``) to download.
            rebuild_image: Unused by E2B; kept for interface parity.
        """
        super().__init__(
            spec,
            logger,
            timeout,
            num_cpus,
            log_dir,
            files_to_copy=files_to_copy,
            files_to_collect=files_to_collect,
        )

        # Sandbox-level timeout: E2B tears the sandbox down after `timeout`s.
        self.sb = Sandbox(timeout=timeout)
        # `uv` is needed by the setup script to build the repo's virtualenv.
        self.sb.commands.run("curl -LsSf https://astral.sh/uv/install.sh | sh")

        # Set up the sandbox environment (clone repo, create venv, ...).
        self.sb.files.write("setup.sh", spec.setup_script)
        self.sb.commands.run("bash setup.sh")

        # Stage eval script / patch. Only the file *name* is kept because the
        # sandbox user cannot write to "/" (no sudo).
        if files_to_copy:
            for _, f in files_to_copy.items():
                with open(f["src"], "r") as fp:  # type: ignore
                    content = fp.read()
                self.sb.files.write(f["dest"].name, content)  # type: ignore

    def exec_run_with_timeout(self, command: str) -> tuple[str, bool, float]:
        """Execute `command` in the sandbox.

        Returns ``(stderr, timed_out, runtime_seconds)``. The per-command
        timeout is disabled (``timeout=0``) so the sandbox-level timeout set
        in ``__init__`` is the effective limit: when it fires, E2B kills the
        whole sandbox, and that is what we check for afterwards.
        """
        # TODO: switch to a proper per-command timeout once supported.
        start_time = time.time()
        result = self.sb.commands.run(command, timeout=0)
        # NOTE(review): if the sandbox died mid-run, these reads will raise —
        # consider guarding the collection step; confirm desired behavior.
        if self.files_to_collect is not None:
            for fname in self.files_to_collect:
                with (self.log_dir / fname).open("w") as f:
                    f.write(self.sb.files.read(f"testbed/{fname}"))
        # Bug fix: a sandbox that is still alive means the command completed
        # normally; a dead sandbox means the sandbox-level timeout killed it.
        # The previous `timed_out = self.sb.is_running()` reported every
        # successful run as a timeout.
        timed_out = not self.sb.is_running()
        end_time = time.time()
        return result.stderr, timed_out, end_time - start_time

    def __exit__(
        self,
        exctype: Optional[Type[BaseException]],
        excinst: Optional[BaseException],
        exctb: Optional[TracebackType],
    ) -> None:
        """Tear down the sandbox and close the per-run logger."""
        self.sb.kill()
        close_logger(self.logger)
28 changes: 22 additions & 6 deletions commit0/harness/run_pytest_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
ExecutionBackend,
Docker,
Modal,
E2B,
)


Expand All @@ -52,6 +53,7 @@ def main(
dataset_name, split=dataset_split
) # type: ignore
dataset_name = dataset_name.lower()
absolute = backend != "e2b"
spec = None
example = None
repo_name = None
Expand All @@ -76,7 +78,7 @@ def main(
if repo_name in os.path.basename(repo_or_repo_dir) or repo_or_repo_dir.endswith(
repo_name
):
spec = make_spec(example, dataset_type)
spec = make_spec(example, dataset_type, absolute)
break
assert spec is not None, "No spec available"
assert example is not None, "No example available"
Expand Down Expand Up @@ -187,19 +189,28 @@ def main(

backend = backend.upper()
if ExecutionBackend(backend) == ExecutionBackend.MODAL:
logger.info("Runnning on Modal")
logger.info("Running on Modal")
execution_context = Modal
elif ExecutionBackend(backend) == ExecutionBackend.LOCAL:
logger.info("Runnning locally")
logger.info("Running locally")
execution_context = Docker
elif ExecutionBackend(backend) == ExecutionBackend.E2B:
logger.info("Running E2B")
execution_context = E2B
else:
raise ValueError(
f"Evaluation must be from {', '.join(EVAL_BACKENDS)}, but {backend} is provided."
)

files_to_copy = Files(
eval_script={"src": eval_file, "dest": Path("/eval.sh")},
patch={"src": patch_file, "dest": Path("/patch.diff")},
eval_script={
"src": eval_file,
"dest": Path("/eval.sh" if absolute else "eval.sh"),
},
patch={
"src": patch_file,
"dest": Path("/patch.diff" if absolute else "patch.diff"),
},
)
files_to_collect = [
"report.json",
Expand All @@ -209,6 +220,11 @@ def main(
if coverage:
files_to_collect.append("coverage.json")

eval_command = (
"/bin/bash /eval.sh"
if ExecutionBackend(backend) != ExecutionBackend.E2B
else "/bin/bash eval.sh"
)
try:
with execution_context(
spec,
Expand All @@ -221,7 +237,7 @@ def main(
rebuild_image,
) as context:
output, timed_out, total_runtime = context.exec_run_with_timeout(
"/bin/bash /eval.sh"
eval_command
)
logger.info(output)
if timed_out:
Expand Down
18 changes: 14 additions & 4 deletions commit0/harness/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from typing import Union, cast, Optional

from commit0.harness.constants import (
ABSOLUTE_REPO_DIR,
RELATIVE_REPO_DIR,
RepoInstance,
SimpleInstance,
)
Expand All @@ -17,6 +19,7 @@
class Spec(ABC):
"""A dataclass that represents a test specification for a single instance of SWE-bench."""

absolute: bool
repo: str
# repo dir on docker
repo_directory: str
Expand Down Expand Up @@ -164,11 +167,12 @@ def make_repo_script_list(self) -> list[str]:

def make_eval_script_list(self) -> list[str]:
"""Run the tests."""
diff_path = "/patch.diff" if self.absolute else "../patch.diff"
eval_script_list = [
f"cd {self.repo_directory}",
"source .venv/bin/activate",
f"git reset --hard {self.instance['base_commit']}",
"git apply --allow-empty -v /patch.diff",
f"git apply --allow-empty -v {diff_path}",
"git status",
f"{self.instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors{{coverage}} {{test_ids}} > test_output.txt 2>&1",
"echo $? > pytest_exit_code.txt",
Expand Down Expand Up @@ -306,39 +310,45 @@ def make_eval_script_list(self) -> list[str]:
def get_specs_from_dataset(
    dataset: Union[list[Union[RepoInstance, SimpleInstance]], list[Spec]],
    dataset_type: str,
    absolute: bool,
) -> list[Spec]:
    """Convert a list of instances into Spec objects (idempotent).

    If the list already holds Spec objects it is returned unchanged;
    otherwise each instance is converted via ``make_spec``. ``absolute``
    selects absolute repo paths (Docker/Modal) vs relative ones (E2B).
    """
    # Already converted — hand the list straight back.
    if isinstance(dataset[0], Spec):
        return cast(list[Spec], dataset)
    instances = cast(list["RepoInstance"], dataset)
    return [make_spec(instance, dataset_type, absolute) for instance in instances]


def make_spec(instance: Union[RepoInstance, SimpleInstance], dataset_type: str) -> Spec:
def make_spec(
instance: Union[RepoInstance, SimpleInstance], dataset_type: str, absolute: bool
) -> Spec:
repo_directory = ABSOLUTE_REPO_DIR if absolute else RELATIVE_REPO_DIR
if isinstance(instance, Spec):
return instance
repo_directory = "/testbed"
if dataset_type == "commit0":
return Commit0Spec(
repo=instance["instance_id"],
repo_directory=repo_directory,
instance=instance,
absolute=absolute,
)
elif dataset_type == "swebench":
return SWEBenchSpec(
repo=instance["instance_id"],
repo_directory=repo_directory,
instance=instance,
absolute=absolute,
)
elif dataset_type == "simple":
return SimpleSpec(
repo="simple", # all benchmarks with mere function writing will share the simple docker image
repo_directory=repo_directory,
instance=instance,
absolute=absolute,
)
else:
raise NotImplementedError(
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ dependencies = [
"datasets==3.0.1",
"modal>=0.66.26",
"strenum>=0.4.15",
"e2b-code-interpreter>=1.0.4",
"python-dotenv>=1.0.1",
]
classifiers = [
"License :: OSI Approved :: MIT License",
Expand Down
36 changes: 36 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading