Skip to content

Commit 4043a34

Browse files
authored
Merge pull request #106 from commit-0/justin/e2b
Add E2B execution backend
2 parents df0dc34 + 013f7a4 commit 4043a34

File tree

8 files changed

+150
-14
lines changed

8 files changed

+150
-14
lines changed

commit0/harness/build.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def main(
4545
repo_name = example["repo"].split("/")[-1]
4646
if split != "all" and repo_name not in SPLIT[split]:
4747
continue
48-
spec = make_spec(example, dataset_type)
48+
spec = make_spec(example, dataset_type, absolute=True)
4949
specs.append(spec)
5050

5151
client = docker.from_env()

commit0/harness/constants.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,11 @@ def items(self) -> ItemsView[str, object]:
6464
PASS_TO_FAIL = "PASS_TO_FAIL"
6565

6666
# Evaluation backends
67-
EVAL_BACKENDS = ["local", "modal"]
67+
EVAL_BACKENDS = ["local", "modal", "e2b"]
68+
# Use the absolute repo dir for Docker and Modal, which are backends with sudo access
69+
ABSOLUTE_REPO_DIR = "/testbed"
70+
# Use the relative repo dir for E2B, which has no sudo access
71+
RELATIVE_REPO_DIR = "testbed"
6872

6973
# available commands
7074
COMMANDS = [

commit0/harness/docker_build.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def build_base_images(
125125
126126
"""
127127
# Get the base images to build from the dataset
128-
test_specs = get_specs_from_dataset(dataset, dataset_type)
128+
test_specs = get_specs_from_dataset(dataset, dataset_type, absolute=True)
129129
base_images = {
130130
x.base_image_key: (x.base_dockerfile, x.platform) for x in test_specs
131131
}
@@ -166,7 +166,7 @@ def get_repo_configs_to_build(
166166
167167
"""
168168
image_scripts = dict()
169-
test_specs = get_specs_from_dataset(dataset, dataset_type)
169+
test_specs = get_specs_from_dataset(dataset, dataset_type, absolute=True)
170170

171171
for test_spec in test_specs:
172172
# Check if the base image exists

commit0/harness/execution_context.py

+68
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import modal
1111
import modal.io_streams
1212
from enum import auto
13+
from e2b_code_interpreter import Sandbox
1314
from strenum import StrEnum
1415
from pathlib import Path
1516
import time
@@ -33,6 +34,7 @@
3334
class ExecutionBackend(StrEnum):
3435
LOCAL = auto()
3536
MODAL = auto()
37+
E2B = auto()
3638

3739

3840
class ExecutionContext(ABC):
@@ -219,3 +221,69 @@ def __exit__(
219221
exctb: Optional[TracebackType],
220222
) -> None:
221223
close_logger(self.logger)
224+
225+
226+
class E2B(ExecutionContext):
    """Execution context that runs the evaluation inside an E2B sandbox.

    The sandbox is created in ``__init__``, provisioned with ``uv`` and the
    spec's setup script, and killed in ``__exit__``.
    """

    def __init__(
        self,
        spec: Spec,
        logger: logging.Logger,
        timeout: int,
        num_cpus: int,
        log_dir: Path,
        files_to_copy: Optional[Files] = None,
        files_to_collect: Optional[list[str]] = None,
        rebuild_image: bool = False,
    ):
        """Create the sandbox, run the setup script and upload eval files.

        Args:
        ----
            spec: Test specification; supplies ``setup_script`` and
                ``repo_directory``.
            logger: Logger that is closed when the context exits.
            timeout: Passed to ``Sandbox(timeout=...)``.
                NOTE(review): presumably the sandbox lifetime in seconds,
                confirm against the E2B SDK.
            num_cpus: Forwarded to the base class; not used by this backend.
            log_dir: Local directory that collected files are written to.
            files_to_copy: Local files to upload into the sandbox. Each file
                is written under the basename of its ``dest`` path.
            files_to_collect: File names, relative to the repo directory, to
                download after the command has run.
            rebuild_image: Accepted for signature parity with the other
                backends; unused here.

        """
        super().__init__(
            spec,
            logger,
            timeout,
            num_cpus,
            log_dir,
            files_to_copy=files_to_copy,
            files_to_collect=files_to_collect,
        )
        # Collected files live in the spec's repo directory; remember it so
        # exec_run_with_timeout does not have to hard-code "testbed".
        self._repo_directory = str(spec.repo_directory)

        self.sb = Sandbox(timeout=timeout)
        try:
            self.sb.commands.run("curl -LsSf https://astral.sh/uv/install.sh | sh")

            # setup sandbox env
            self.sb.files.write("setup.sh", spec.setup_script)
            self.sb.commands.run("bash setup.sh")

            # prepare for eval
            if files_to_copy:
                for f in files_to_copy.values():
                    with open(f["src"], "r") as fp:  # type: ignore
                        content = fp.read()
                    self.sb.files.write(f["dest"].name, content)  # type: ignore
        except BaseException:
            # __exit__ is never called when __init__ raises, so the remote
            # sandbox would otherwise be left running.
            self.sb.kill()
            raise

    def exec_run_with_timeout(self, command: str) -> tuple[str, bool, float]:
        """Execute command on E2B sandbox.

        For timeouts, we could maybe use the error code or check whether the
        sandbox is still alive.

        The exit code is given by: result.exit_code

        For now, we just check if the sandbox is still alive: a sandbox that
        is no longer running after the command is reported as timed out.

        Args:
        ----
            command: Shell command to run inside the sandbox.

        Returns:
        -------
            A tuple ``(stderr, timed_out, elapsed_seconds)``.

        """
        # TODO: setup timeout
        start_time = time.time()
        # timeout=0 is kept from the original call.
        # NOTE(review): presumably this disables the per-command connection
        # limit so only the sandbox lifetime applies, confirm in the SDK docs.
        result = self.sb.commands.run(command, timeout=0)

        # A sandbox that is still alive did NOT time out.
        timed_out = not self.sb.is_running()

        # Files can only be fetched from a live sandbox.
        if not timed_out and self.files_to_collect is not None:
            for fname in self.files_to_collect:
                # Read remotely first so a failed read does not leave an
                # empty, truncated file in the log directory.
                content = self.sb.files.read(f"{self._repo_directory}/{fname}")
                with (self.log_dir / fname).open("w") as f:
                    f.write(content)

        end_time = time.time()
        return result.stderr, timed_out, end_time - start_time

    def __exit__(
        self,
        exctype: Optional[Type[BaseException]],
        excinst: Optional[BaseException],
        exctb: Optional[TracebackType],
    ) -> None:
        """Kill the sandbox and close the logger."""
        self.sb.kill()
        close_logger(self.logger)

commit0/harness/run_pytest_ids.py

+22-6
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
ExecutionBackend,
2727
Docker,
2828
Modal,
29+
E2B,
2930
)
3031

3132

@@ -52,6 +53,7 @@ def main(
5253
dataset_name, split=dataset_split
5354
) # type: ignore
5455
dataset_name = dataset_name.lower()
56+
absolute = backend != "e2b"
5557
spec = None
5658
example = None
5759
repo_name = None
@@ -76,7 +78,7 @@ def main(
7678
if repo_name in os.path.basename(repo_or_repo_dir) or repo_or_repo_dir.endswith(
7779
repo_name
7880
):
79-
spec = make_spec(example, dataset_type)
81+
spec = make_spec(example, dataset_type, absolute)
8082
break
8183
assert spec is not None, "No spec available"
8284
assert example is not None, "No example available"
@@ -187,19 +189,28 @@ def main(
187189

188190
backend = backend.upper()
189191
if ExecutionBackend(backend) == ExecutionBackend.MODAL:
190-
logger.info("Runnning on Modal")
192+
logger.info("Running on Modal")
191193
execution_context = Modal
192194
elif ExecutionBackend(backend) == ExecutionBackend.LOCAL:
193-
logger.info("Runnning locally")
195+
logger.info("Running locally")
194196
execution_context = Docker
197+
elif ExecutionBackend(backend) == ExecutionBackend.E2B:
198+
logger.info("Running E2B")
199+
execution_context = E2B
195200
else:
196201
raise ValueError(
197202
f"Evaluation must be from {', '.join(EVAL_BACKENDS)}, but {backend} is provided."
198203
)
199204

200205
files_to_copy = Files(
201-
eval_script={"src": eval_file, "dest": Path("/eval.sh")},
202-
patch={"src": patch_file, "dest": Path("/patch.diff")},
206+
eval_script={
207+
"src": eval_file,
208+
"dest": Path("/eval.sh" if absolute else "eval.sh"),
209+
},
210+
patch={
211+
"src": patch_file,
212+
"dest": Path("/patch.diff" if absolute else "patch.diff"),
213+
},
203214
)
204215
files_to_collect = [
205216
"report.json",
@@ -209,6 +220,11 @@ def main(
209220
if coverage:
210221
files_to_collect.append("coverage.json")
211222

223+
eval_command = (
224+
"/bin/bash /eval.sh"
225+
if ExecutionBackend(backend) != ExecutionBackend.E2B
226+
else "/bin/bash eval.sh"
227+
)
212228
try:
213229
with execution_context(
214230
spec,
@@ -221,7 +237,7 @@ def main(
221237
rebuild_image,
222238
) as context:
223239
output, timed_out, total_runtime = context.exec_run_with_timeout(
224-
"/bin/bash /eval.sh"
240+
eval_command
225241
)
226242
logger.info(output)
227243
if timed_out:

commit0/harness/spec.py

+14-4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from typing import Union, cast, Optional
55

66
from commit0.harness.constants import (
7+
ABSOLUTE_REPO_DIR,
8+
RELATIVE_REPO_DIR,
79
RepoInstance,
810
SimpleInstance,
911
)
@@ -17,6 +19,7 @@
1719
class Spec(ABC):
1820
"""A dataclass that represents a test specification for a single instance of SWE-bench."""
1921

22+
absolute: bool
2023
repo: str
2124
# repo dir on docker
2225
repo_directory: str
@@ -164,11 +167,12 @@ def make_repo_script_list(self) -> list[str]:
164167

165168
def make_eval_script_list(self) -> list[str]:
166169
"""Run the tests."""
170+
diff_path = "/patch.diff" if self.absolute else "../patch.diff"
167171
eval_script_list = [
168172
f"cd {self.repo_directory}",
169173
"source .venv/bin/activate",
170174
f"git reset --hard {self.instance['base_commit']}",
171-
"git apply --allow-empty -v /patch.diff",
175+
f"git apply --allow-empty -v {diff_path}",
172176
"git status",
173177
f"{self.instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors{{coverage}} {{test_ids}} > test_output.txt 2>&1",
174178
"echo $? > pytest_exit_code.txt",
@@ -306,39 +310,45 @@ def make_eval_script_list(self) -> list[str]:
306310
def get_specs_from_dataset(
307311
dataset: Union[list[Union[RepoInstance, SimpleInstance]], list[Spec]],
308312
dataset_type: str,
313+
absolute: bool,
309314
) -> list[Spec]:
310315
"""Idempotent function that converts a list of RepoInstance objects to a list of Spec objects."""
311316
if isinstance(dataset[0], Spec):
312317
return cast(list[Spec], dataset)
313318
return list(
314319
map(
315-
lambda instance: make_spec(instance, dataset_type),
320+
lambda instance: make_spec(instance, dataset_type, absolute),
316321
cast(list["RepoInstance"], dataset),
317322
)
318323
)
319324

320325

321-
def make_spec(instance: Union[RepoInstance, SimpleInstance], dataset_type: str) -> Spec:
326+
def make_spec(
327+
instance: Union[RepoInstance, SimpleInstance], dataset_type: str, absolute: bool
328+
) -> Spec:
329+
repo_directory = ABSOLUTE_REPO_DIR if absolute else RELATIVE_REPO_DIR
322330
if isinstance(instance, Spec):
323331
return instance
324-
repo_directory = "/testbed"
325332
if dataset_type == "commit0":
326333
return Commit0Spec(
327334
repo=instance["instance_id"],
328335
repo_directory=repo_directory,
329336
instance=instance,
337+
absolute=absolute,
330338
)
331339
elif dataset_type == "swebench":
332340
return SWEBenchSpec(
333341
repo=instance["instance_id"],
334342
repo_directory=repo_directory,
335343
instance=instance,
344+
absolute=absolute,
336345
)
337346
elif dataset_type == "simple":
338347
return SimpleSpec(
339348
repo="simple", # all benchmarks with mere function writing will share the simple docker image
340349
repo_directory=repo_directory,
341350
instance=instance,
351+
absolute=absolute,
342352
)
343353
else:
344354
raise NotImplementedError(

pyproject.toml

+2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ dependencies = [
2020
"datasets==3.0.1",
2121
"modal>=0.66.26",
2222
"strenum>=0.4.15",
23+
"e2b-code-interpreter>=1.0.4",
24+
"python-dotenv>=1.0.1",
2325
]
2426
classifiers = [
2527
"License :: OSI Approved :: MIT License",

uv.lock

+36
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)