Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions debug_gym/agents/solution_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,14 @@ def execute_action(self, llm_response, **kwargs):
return info

def init(self, info: EnvInfo) -> None:
    """Verify the environment's starting state, then run the pdb sanity checks.

    If the environment exposes an ``eval`` tool, it is called once and the
    returned info replaces ``info``. That result must report the task as not
    resolved and a score strictly below the maximum, since the gold patch
    has not been applied yet. The (possibly refreshed) info is then handed to
    ``self._run_pdb_sanity_checks``.

    Args:
        info: Environment info to check; superseded by the result of the
            eval step when the ``eval`` tool is available.

    Raises:
        AssertionError: If the eval step reports the task as resolved, or a
            score that is not below the maximum score.
    """
    eval_available = self.env.has_tool("eval")
    if eval_available:
        eval_call = ToolCall(name="eval", id="eval", arguments={})
        info = self.env.step(eval_call, None, None)

        # Identity comparison on purpose: only a literal ``False`` passes.
        assert info.resolved is False, (
            "Eval tool should not resolve before applying the gold patch."
        )
        assert info.score < info.max_score, (
            "Score should be less than max score before applying the gold patch."
        )

    self._run_pdb_sanity_checks(info)
8 changes: 6 additions & 2 deletions debug_gym/gym/envs/swe_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,13 @@ def setup_terminal(self):
self.terminal.run(
"pip install httpbin[mainapp]==0.10.2 pytest-httpbin==2.1.0"
)
self.terminal.run("nohup gunicorn -b 127.0.0.1:80 -k gevent httpbin:app &")
self.terminal.run(
"nohup gunicorn -b 127.0.0.1:443 --certfile=/opt/miniconda3/envs/testbed/lib/python3.9/site-packages/pytest_httpbin/certs/server.pem --keyfile=/opt/miniconda3/envs/testbed/lib/python3.9/site-packages/pytest_httpbin/certs/server.key -k gevent httpbin:app &"
"nohup gunicorn -b 127.0.0.1:80 -k gevent httpbin:app &",
background=True,
)
self.terminal.run(
"nohup gunicorn -b 127.0.0.1:443 --certfile=/opt/miniconda3/envs/testbed/lib/python3.9/site-packages/pytest_httpbin/certs/server.pem --keyfile=/opt/miniconda3/envs/testbed/lib/python3.9/site-packages/pytest_httpbin/certs/server.key -k gevent httpbin:app &",
background=True,
)
self.terminal.run('echo "127.0.0.1 httpbin.org" >> /etc/hosts')
elif self.task_name == "pylint-dev__pylint-4661":
Expand Down
10 changes: 9 additions & 1 deletion debug_gym/gym/terminals/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def prepare_command(
entrypoint_str = " && ".join(entrypoint)

# Wrap with timeout command if specified
if timeout is not None:
if timeout:
# Use timeout command to kill the process if it exceeds the limit
# Exit code 124 indicates timeout was reached
entrypoint_str = (
Expand All @@ -165,6 +165,7 @@ def run(
timeout: int = None,
raises: bool = False,
strip_output: bool = True,
background: bool = False,
) -> tuple[bool, str]:
"""Run a command in the terminal. Return command status and output.

Expand All @@ -173,8 +174,10 @@ def run(
timeout: Timeout in seconds for this command. If the command exceeds this
time, it will be killed and the method returns (False, timeout_message).
If None, uses self.command_timeout.
If explicitly set to 0 it will disable the timeout.
raises: If True, raise ValueError on command failure.
strip_output: If True, strip trailing newlines from output.
background: If True, run the command in the background.

Returns:
Tuple of (success, output). Success is False if command failed or timed out.
Expand Down Expand Up @@ -208,6 +211,11 @@ def run(
if strip_output:
output = output.strip("\r\n").strip("\n")

if background:
# In background mode, we don't wait for command completion
self.logger.debug(f"[{self.container.name}] Command running in background.")
return True, f"Command running in background. Initial output: {output}"

# Check for timeout (exit code 124 from the timeout command)
if status == 124:
self.logger.warning(
Expand Down
16 changes: 14 additions & 2 deletions debug_gym/gym/terminals/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ def prepare_command(
command = f"{env_prefix}{command}"

# Wrap with timeout command if specified
if timeout is not None:
if timeout:
# Use timeout command to kill the process if it exceeds the limit
# Exit code 124 indicates timeout was reached
command = f"timeout {timeout} /bin/bash -c {shlex.quote(command)}"
Expand All @@ -516,6 +516,7 @@ def run(
timeout: int = None,
raises: bool = False,
strip_output: bool = True,
background: bool = False,
) -> tuple[bool, str]:
"""Run a command in the pod. Return command status and output.

Expand All @@ -524,9 +525,10 @@ def run(
timeout: Timeout in seconds for this command. If the command exceeds this
time, it will be killed and the method returns (False, timeout_message).
If None, uses self.command_timeout.
If explicitly set to 0 it will disable the timeout.
raises: If True, raise ValueError on command failure.
strip_output: If True, strip trailing newlines from output.
background: If True, run the command in the background.

Returns:
Tuple of (success, output). Success is False if command failed or timed out.
"""
Expand Down Expand Up @@ -564,6 +566,16 @@ def run(
if resp.peek_stderr():
output += resp.read_stderr()

if background:
# In background mode, we don't wait for command completion
self.logger.debug(
f"[{self.pod.name}] Command running in background."
)
return (
True,
f"Command running in background. Initial output: {output}",
)

# Get the exit code
error_channel = resp.read_channel(ERROR_CHANNEL) # Error channel
self.logger.debug(f"[{self.pod.name}] error channel: {error_channel}")
Expand Down
5 changes: 5 additions & 0 deletions debug_gym/gym/terminals/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def run(
timeout: int = None,
raises: bool = False,
strip_output: bool = True,
background: bool = False,
) -> tuple[bool, str]:
"""Run a list of commands in the terminal. Return command status and output.

Expand All @@ -82,6 +83,7 @@ def run(
If None, uses self.command_timeout.
raises: If True, raise ValueError on command failure.
strip_output: If True, strip trailing newlines from output.
background: If True, run the command in the background.

Returns:
Tuple of (success, output). Success is False if command failed or timed out.
Expand All @@ -101,6 +103,9 @@ def run(
text=True,
)
try:
assert (
background is False
), "TODO: Background execution is not yet supported for LocalTerminal."
stdout, stderr = process.communicate(timeout=effective_timeout)
success = process.returncode == 0
except subprocess.TimeoutExpired:
Expand Down
Loading