diff --git a/debug_gym/agents/solution_agent.py b/debug_gym/agents/solution_agent.py
index 87f97317..f0e6c6ad 100644
--- a/debug_gym/agents/solution_agent.py
+++ b/debug_gym/agents/solution_agent.py
@@ -63,4 +63,16 @@ def execute_action(self, llm_response, **kwargs):
         return info
 
     def init(self, info: EnvInfo) -> None:
+        # Run the eval tool once up front: before the gold patch is applied the
+        # task must still be unresolved and score below the maximum.
+        if self.env.has_tool("eval"):
+            tool_call = ToolCall(name="eval", id="eval", arguments={})
+            info = self.env.step(tool_call, None, None)
+            assert (
+                info.resolved is False
+            ), "Eval tool should not resolve before applying the gold patch."
+            assert (
+                info.score < info.max_score
+            ), "Score should be less than max score before applying the gold patch."
+
         self._run_pdb_sanity_checks(info)
diff --git a/debug_gym/gym/envs/swe_bench.py b/debug_gym/gym/envs/swe_bench.py
index f0341281..48bcfba0 100644
--- a/debug_gym/gym/envs/swe_bench.py
+++ b/debug_gym/gym/envs/swe_bench.py
@@ -114,9 +114,13 @@ def setup_terminal(self):
             self.terminal.run(
                 "pip install httpbin[mainapp]==0.10.2 pytest-httpbin==2.1.0"
             )
-            self.terminal.run("nohup gunicorn -b 127.0.0.1:80 -k gevent httpbin:app &")
             self.terminal.run(
-                "nohup gunicorn -b 127.0.0.1:443 --certfile=/opt/miniconda3/envs/testbed/lib/python3.9/site-packages/pytest_httpbin/certs/server.pem --keyfile=/opt/miniconda3/envs/testbed/lib/python3.9/site-packages/pytest_httpbin/certs/server.key -k gevent httpbin:app &"
+                "nohup gunicorn -b 127.0.0.1:80 -k gevent httpbin:app &",
+                background=True,
+            )
+            self.terminal.run(
+                "nohup gunicorn -b 127.0.0.1:443 --certfile=/opt/miniconda3/envs/testbed/lib/python3.9/site-packages/pytest_httpbin/certs/server.pem --keyfile=/opt/miniconda3/envs/testbed/lib/python3.9/site-packages/pytest_httpbin/certs/server.key -k gevent httpbin:app &",
+                background=True,
             )
             self.terminal.run('echo "127.0.0.1 httpbin.org" >> /etc/hosts')
         elif self.task_name == "pylint-dev__pylint-4661":
diff --git a/debug_gym/gym/terminals/docker.py b/debug_gym/gym/terminals/docker.py
index c3d0a6b8..386d9e8e 100644
--- a/debug_gym/gym/terminals/docker.py
+++ b/debug_gym/gym/terminals/docker.py
@@ -147,7 +147,7 @@ def prepare_command(
         entrypoint_str = " && ".join(entrypoint)
 
         # Wrap with timeout command if specified
-        if timeout is not None:
+        if timeout:
             # Use timeout command to kill the process if it exceeds the limit
             # Exit code 124 indicates timeout was reached
             entrypoint_str = (
@@ -165,6 +165,7 @@ def run(
         timeout: int = None,
         raises: bool = False,
         strip_output: bool = True,
+        background: bool = False,
     ) -> tuple[bool, str]:
         """Run a command in the terminal.
         Return command status and output.
@@ -173,8 +174,10 @@ def run(
             timeout: Timeout in seconds for this command. If the command exceeds this
                 time, it will be killed and the method returns (False, timeout_message).
                 If None, uses self.command_timeout.
+                If explicitly set to 0 it will disable the timeout.
             raises: If True, raise ValueError on command failure.
             strip_output: If True, strip trailing newlines from output.
+            background: If True, run the command in the background.
 
         Returns:
             Tuple of (success, output). Success is False if command failed or timed out.
@@ -208,6 +211,11 @@ def run(
         if strip_output:
             output = output.strip("\r\n").strip("\n")
 
+        if background:
+            # In background mode, we don't wait for command completion
+            self.logger.debug(f"[{self.container.name}] Command running in background.")
+            return True, f"Command running in background. Initial output: {output}"
+
         # Check for timeout (exit code 124 from the timeout command)
         if status == 124:
             self.logger.warning(
diff --git a/debug_gym/gym/terminals/kubernetes.py b/debug_gym/gym/terminals/kubernetes.py
index ddddf786..7f6f2ea0 100644
--- a/debug_gym/gym/terminals/kubernetes.py
+++ b/debug_gym/gym/terminals/kubernetes.py
@@ -503,7 +503,7 @@ def prepare_command(
         command = f"{env_prefix}{command}"
 
         # Wrap with timeout command if specified
-        if timeout is not None:
+        if timeout:
             # Use timeout command to kill the process if it exceeds the limit
             # Exit code 124 indicates timeout was reached
             command = f"timeout {timeout} /bin/bash -c {shlex.quote(command)}"
@@ -516,6 +516,7 @@ def run(
         timeout: int = None,
         raises: bool = False,
         strip_output: bool = True,
+        background: bool = False,
     ) -> tuple[bool, str]:
         """Run a command in the pod.
         Return command status and output.
@@ -524,9 +525,11 @@ def run(
             timeout: Timeout in seconds for this command. If the command exceeds this
                 time, it will be killed and the method returns (False, timeout_message).
                 If None, uses self.command_timeout.
+                If explicitly set to 0 it will disable the timeout.
             raises: If True, raise ValueError on command failure.
             strip_output: If True, strip trailing newlines from output.
+            background: If True, run the command in the background.
 
         Returns:
             Tuple of (success, output). Success is False if command failed or timed out.
         """
@@ -564,6 +567,16 @@ def run(
                     if resp.peek_stderr():
                         output += resp.read_stderr()
 
+                if background:
+                    # In background mode, we don't wait for command completion
+                    self.logger.debug(
+                        f"[{self.pod.name}] Command running in background."
+                    )
+                    return (
+                        True,
+                        f"Command running in background. Initial output: {output}",
+                    )
+
                 # Get the exit code
                 error_channel = resp.read_channel(ERROR_CHANNEL)  # Error channel
                 self.logger.debug(f"[{self.pod.name}] error channel: {error_channel}")
diff --git a/debug_gym/gym/terminals/local.py b/debug_gym/gym/terminals/local.py
index fca2635a..f05af633 100644
--- a/debug_gym/gym/terminals/local.py
+++ b/debug_gym/gym/terminals/local.py
@@ -72,6 +72,7 @@ def run(
         timeout: int = None,
         raises: bool = False,
         strip_output: bool = True,
+        background: bool = False,
     ) -> tuple[bool, str]:
         """Run a list of commands in the terminal.
         Return command status and output.
@@ -82,6 +83,7 @@ def run(
                 If None, uses self.command_timeout.
             raises: If True, raise ValueError on command failure.
             strip_output: If True, strip trailing newlines from output.
+            background: If True, run the command in the background.
 
         Returns:
             Tuple of (success, output). Success is False if command failed or timed out.
@@ -101,6 +103,17 @@ def run(
             text=True,
         )
         try:
+            if background:
+                # Reject explicitly rather than with `assert`, which is removed
+                # under `python -O` and would let the flag be silently ignored.
+                # NOTE(review): `process` was presumably spawned by the call that
+                # closes just above this hunk; stop and reap it so that refusing
+                # the request does not leave a stray child running. Confirm.
+                process.kill()
+                process.wait()
+                raise NotImplementedError(
+                    "TODO: Background execution is not yet supported for LocalTerminal."
+                )
             stdout, stderr = process.communicate(timeout=effective_timeout)
             success = process.returncode == 0
         except subprocess.TimeoutExpired: