Skip to content

Add E2B execution backend #106

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion commit0/harness/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def main(
repo_name = example["repo"].split("/")[-1]
if split != "all" and repo_name not in SPLIT[split]:
continue
spec = make_spec(example, dataset_type)
spec = make_spec(example, dataset_type, absolute=True)
specs.append(spec)

client = docker.from_env()
Expand Down
6 changes: 5 additions & 1 deletion commit0/harness/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,11 @@ def items(self) -> ItemsView[str, object]:
PASS_TO_FAIL = "PASS_TO_FAIL"

# Evaluation backends
EVAL_BACKENDS = ["local", "modal"]
EVAL_BACKENDS = ["local", "modal", "e2b"]
# Use absolute for docker and modal. Backends with sudo access
ABSOLUTE_REPO_DIR = "/testbed"
# Use relative for e2b, with no sudo access
RELATIVE_REPO_DIR = "testbed"

# available commands
COMMANDS = [
Expand Down
4 changes: 2 additions & 2 deletions commit0/harness/docker_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def build_base_images(

"""
# Get the base images to build from the dataset
test_specs = get_specs_from_dataset(dataset, dataset_type)
test_specs = get_specs_from_dataset(dataset, dataset_type, absolute=True)
base_images = {
x.base_image_key: (x.base_dockerfile, x.platform) for x in test_specs
}
Expand Down Expand Up @@ -166,7 +166,7 @@ def get_repo_configs_to_build(

"""
image_scripts = dict()
test_specs = get_specs_from_dataset(dataset, dataset_type)
test_specs = get_specs_from_dataset(dataset, dataset_type, absolute=True)

for test_spec in test_specs:
# Check if the base image exists
Expand Down
68 changes: 68 additions & 0 deletions commit0/harness/execution_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import modal
import modal.io_streams
from enum import auto
from e2b_code_interpreter import Sandbox
from strenum import StrEnum
from pathlib import Path
import time
Expand All @@ -33,6 +34,7 @@
class ExecutionBackend(StrEnum):
LOCAL = auto()
MODAL = auto()
E2B = auto()


class ExecutionContext(ABC):
Expand Down Expand Up @@ -219,3 +221,69 @@ def __exit__(
exctb: Optional[TracebackType],
) -> None:
close_logger(self.logger)


class E2B(ExecutionContext):
    """Execution context that runs the evaluation inside an E2B cloud sandbox.

    Unlike the Docker/Modal backends, an E2B sandbox has no sudo access, so
    all paths used here are relative to the sandbox home directory
    (e.g. ``testbed/`` rather than ``/testbed``).
    """

    def __init__(
        self,
        spec: Spec,
        logger: logging.Logger,
        timeout: int,
        num_cpus: int,
        log_dir: Path,
        files_to_copy: Optional[Files] = None,
        files_to_collect: Optional[list[str]] = None,
        rebuild_image: bool = False,
    ):
        """Create the sandbox, run the repo setup script, and stage eval files.

        Args:
            spec: Test specification (provides ``setup_script``).
            logger: Per-run logger; closed in ``__exit__``.
            timeout: Sandbox lifetime in seconds — E2B kills the sandbox when
                this elapses, which is how a timeout is detected later.
            num_cpus: Unused by E2B; kept for interface parity with Docker/Modal.
            log_dir: Local directory where collected files are written.
            files_to_copy: Host files to upload into the sandbox home dir.
            files_to_collect: Sandbox files (under ``testbed/``) to download.
            rebuild_image: Unused by E2B; kept for interface parity.
        """
        super().__init__(
            spec,
            logger,
            timeout,
            num_cpus,
            log_dir,
            files_to_copy=files_to_copy,
            files_to_collect=files_to_collect,
        )

        # Sandbox-level timeout: E2B tears the sandbox down after `timeout`s.
        self.sb = Sandbox(timeout=timeout)
        # `uv` is needed by the setup script to build the repo's virtualenv.
        self.sb.commands.run("curl -LsSf https://astral.sh/uv/install.sh | sh")

        # Set up the sandbox environment (clone repo, create venv, ...).
        self.sb.files.write("setup.sh", spec.setup_script)
        self.sb.commands.run("bash setup.sh")

        # Stage eval script / patch. Only the file *name* is kept because the
        # sandbox user cannot write to "/" (no sudo).
        if files_to_copy:
            for _, f in files_to_copy.items():
                with open(f["src"], "r") as fp:  # type: ignore
                    content = fp.read()
                self.sb.files.write(f["dest"].name, content)  # type: ignore

    def exec_run_with_timeout(self, command: str) -> tuple[str, bool, float]:
        """Execute `command` in the sandbox.

        Returns ``(stderr, timed_out, runtime_seconds)``. The per-command
        timeout is disabled (``timeout=0``) so the sandbox-level timeout set
        in ``__init__`` is the effective limit: when it fires, E2B kills the
        whole sandbox, and that is what we check for afterwards.
        """
        # TODO: switch to a proper per-command timeout once supported.
        start_time = time.time()
        result = self.sb.commands.run(command, timeout=0)
        # NOTE(review): if the sandbox died mid-run, these reads will raise —
        # consider guarding the collection step; confirm desired behavior.
        if self.files_to_collect is not None:
            for fname in self.files_to_collect:
                with (self.log_dir / fname).open("w") as f:
                    f.write(self.sb.files.read(f"testbed/{fname}"))
        # Bug fix: a sandbox that is still alive means the command completed
        # normally; a dead sandbox means the sandbox-level timeout killed it.
        # The previous `timed_out = self.sb.is_running()` reported every
        # successful run as a timeout.
        timed_out = not self.sb.is_running()
        end_time = time.time()
        return result.stderr, timed_out, end_time - start_time

    def __exit__(
        self,
        exctype: Optional[Type[BaseException]],
        excinst: Optional[BaseException],
        exctb: Optional[TracebackType],
    ) -> None:
        """Tear down the sandbox and close the per-run logger."""
        self.sb.kill()
        close_logger(self.logger)
28 changes: 22 additions & 6 deletions commit0/harness/run_pytest_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
ExecutionBackend,
Docker,
Modal,
E2B,
)


Expand All @@ -52,6 +53,7 @@ def main(
dataset_name, split=dataset_split
) # type: ignore
dataset_name = dataset_name.lower()
absolute = backend != "e2b"
spec = None
example = None
repo_name = None
Expand All @@ -76,7 +78,7 @@ def main(
if repo_name in os.path.basename(repo_or_repo_dir) or repo_or_repo_dir.endswith(
repo_name
):
spec = make_spec(example, dataset_type)
spec = make_spec(example, dataset_type, absolute)
break
assert spec is not None, "No spec available"
assert example is not None, "No example available"
Expand Down Expand Up @@ -187,19 +189,28 @@ def main(

backend = backend.upper()
if ExecutionBackend(backend) == ExecutionBackend.MODAL:
logger.info("Runnning on Modal")
logger.info("Running on Modal")
execution_context = Modal
elif ExecutionBackend(backend) == ExecutionBackend.LOCAL:
logger.info("Runnning locally")
logger.info("Running locally")
execution_context = Docker
elif ExecutionBackend(backend) == ExecutionBackend.E2B:
logger.info("Running E2B")
execution_context = E2B
else:
raise ValueError(
f"Evaluation must be from {', '.join(EVAL_BACKENDS)}, but {backend} is provided."
)

files_to_copy = Files(
eval_script={"src": eval_file, "dest": Path("/eval.sh")},
patch={"src": patch_file, "dest": Path("/patch.diff")},
eval_script={
"src": eval_file,
"dest": Path("/eval.sh" if absolute else "eval.sh"),
},
patch={
"src": patch_file,
"dest": Path("/patch.diff" if absolute else "patch.diff"),
},
)
files_to_collect = [
"report.json",
Expand All @@ -209,6 +220,11 @@ def main(
if coverage:
files_to_collect.append("coverage.json")

eval_command = (
"/bin/bash /eval.sh"
if ExecutionBackend(backend) != ExecutionBackend.E2B
else "/bin/bash eval.sh"
)
try:
with execution_context(
spec,
Expand All @@ -221,7 +237,7 @@ def main(
rebuild_image,
) as context:
output, timed_out, total_runtime = context.exec_run_with_timeout(
"/bin/bash /eval.sh"
eval_command
)
logger.info(output)
if timed_out:
Expand Down
18 changes: 14 additions & 4 deletions commit0/harness/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from typing import Union, cast, Optional

from commit0.harness.constants import (
ABSOLUTE_REPO_DIR,
RELATIVE_REPO_DIR,
RepoInstance,
SimpleInstance,
)
Expand All @@ -17,6 +19,7 @@
class Spec(ABC):
"""A dataclass that represents a test specification for a single instance of SWE-bench."""

absolute: bool
repo: str
# repo dir on docker
repo_directory: str
Expand Down Expand Up @@ -164,11 +167,12 @@ def make_repo_script_list(self) -> list[str]:

def make_eval_script_list(self) -> list[str]:
"""Run the tests."""
diff_path = "/patch.diff" if self.absolute else "../patch.diff"
eval_script_list = [
f"cd {self.repo_directory}",
"source .venv/bin/activate",
f"git reset --hard {self.instance['base_commit']}",
"git apply --allow-empty -v /patch.diff",
f"git apply --allow-empty -v {diff_path}",
"git status",
f"{self.instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors{{coverage}} {{test_ids}} > test_output.txt 2>&1",
"echo $? > pytest_exit_code.txt",
Expand Down Expand Up @@ -306,39 +310,45 @@ def make_eval_script_list(self) -> list[str]:
def get_specs_from_dataset(
    dataset: Union[list[Union[RepoInstance, SimpleInstance]], list[Spec]],
    dataset_type: str,
    absolute: bool,
) -> list[Spec]:
    """Convert a list of instances into Spec objects (idempotent).

    If the list already holds Spec objects it is returned unchanged;
    otherwise each instance is converted via ``make_spec``. ``absolute``
    selects absolute repo paths (Docker/Modal) vs relative ones (E2B).
    """
    # Already converted — hand the list straight back.
    if isinstance(dataset[0], Spec):
        return cast(list[Spec], dataset)
    instances = cast(list["RepoInstance"], dataset)
    return [make_spec(instance, dataset_type, absolute) for instance in instances]


def make_spec(instance: Union[RepoInstance, SimpleInstance], dataset_type: str) -> Spec:
def make_spec(
instance: Union[RepoInstance, SimpleInstance], dataset_type: str, absolute: bool
) -> Spec:
repo_directory = ABSOLUTE_REPO_DIR if absolute else RELATIVE_REPO_DIR
if isinstance(instance, Spec):
return instance
repo_directory = "/testbed"
if dataset_type == "commit0":
return Commit0Spec(
repo=instance["instance_id"],
repo_directory=repo_directory,
instance=instance,
absolute=absolute,
)
elif dataset_type == "swebench":
return SWEBenchSpec(
repo=instance["instance_id"],
repo_directory=repo_directory,
instance=instance,
absolute=absolute,
)
elif dataset_type == "simple":
return SimpleSpec(
repo="simple", # all benchmarks with mere function writing will share the simple docker image
repo_directory=repo_directory,
instance=instance,
absolute=absolute,
)
else:
raise NotImplementedError(
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ dependencies = [
"datasets==3.0.1",
"modal>=0.66.26",
"strenum>=0.4.15",
"e2b-code-interpreter>=1.0.4",
"python-dotenv>=1.0.1",
]
classifiers = [
"License :: OSI Approved :: MIT License",
Expand Down
36 changes: 36 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading