microsoft · MarcCote · Jan 7, 2026 · Dec 19, 2025 · Dec 19, 2025 · Dec 19, 2025
diff --git a/README.md b/README.md
@@ -161,6 +161,42 @@ Terminal selection is configured through the `terminal_config` in your script co
 
 ---
 
+#### 2.5. Timeouts
+
+`debug-gym` provides several timeout mechanisms to ensure agents don't hang indefinitely:
+
+| Timeout Type | Description | Default | Configuration |
+| :-: | :----- | :-: | :----- |
+| **Command Timeout** | Maximum time for a single command (e.g., `bash`, `eval`) to execute. Prevents blocking commands like `serve_forever()` or infinite loops from hanging the agent. | 300s (5 min) | `terminal.command_timeout` |
+| **Run Timeout** | Maximum time for a single eval/run (e.g., pytest execution). | 300s (5 min) | `env.run_timeout` |
+| **Agent Step Timeout** | Maximum time for the LLM to generate a response. | varies | LLM provider settings |
+| **Session Lifetime** | Total time an agent can interact with the environment. | unlimited | Application-level |
+
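+**Command Timeout** is particularly important for exploration agents that might accidentally run blocking scripts. When a command times out, the terminal's `run()` call returns `(False, "Command timed out after X seconds")`; if the command produced any output before being killed, it is appended to that message under a `Partial output:` line.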
+
+Example terminal configuration with custom timeout:
+
+```yaml
+terminal:
+  type: docker
+  command_timeout: 300  # 5 minutes per command (default: 300)
+```
+
+For Kubernetes deployments:
+
+```yaml
+terminal:
+  type: kubernetes
+  command_timeout: 900  # 15 minutes for longer-running tests
+  namespace: debug-gym
+  base_image: your-image:tag
+```
+
+> [!TIP]
+> If your agent runs `eval` or `submit` tools that execute large test suites, consider increasing `command_timeout` to accommodate longer test runs.
+
+---
+
 ## 3. Running Baselines
 We use `.yaml` files to specify configurations. Example config files can be found in `configs/`. To run an agent:
 

diff --git a/analysis/json_log_viewer/README.md b/analysis/json_log_viewer/README.md
@@ -0,0 +1,40 @@
+# JSON Log Viewer
+
+A Flask-based web viewer for debug-gym trajectory JSON files. Visualize agent exploration sessions with step-by-step action breakdowns.
+
+## Installation
+
+```bash
+cd analysis/json_log_viewer
+pip install -r requirements.txt
+```
+
+## Usage
+
+Start the server:
+
+```bash
+python json_log_viewer.py -p 5050
+```
+
+Then open http://127.0.0.1:5050 in your browser.
+
+### Loading Trajectories
+
+You can load trajectory files in several ways:
+
+1. **Upload**: Click "Upload" and select a JSON file
+2. **Browse**: Click "Browse Files" to navigate your filesystem
+3. **API**: Load programmatically via `GET /load_file_from_path?path=/path/to/trajectory.json`
+
+### Integration with Gray Tree Frog
+
+The viewer supports CORS requests, allowing Gray Tree Frog's lineage visualization to open trajectories directly. When viewing the lineage graph, click "View trajectory" on any discovery to open its exploration session.
+
+## Features
+
+- Step-by-step trajectory visualization
+- Color-coded action types (bash, view, edit, etc.)
+- Detailed bash command classification
+- Statistics view showing action distribution
+- Keyboard navigation between steps
diff --git a/analysis/json_log_viewer/json_log_viewer.py b/analysis/json_log_viewer/json_log_viewer.py
@@ -5,6 +5,7 @@
 import shlex
 
 from flask import Flask, jsonify, redirect, render_template, request, url_for
+from flask_cors import cross_origin
 from werkzeug.utils import secure_filename
 
 app = Flask(__name__)
@@ -581,6 +582,7 @@ def browse_directory():
 
 
 @app.route("/load_file_from_path")
+@cross_origin()  # Allow cross-origin requests (for Gray Tree Frog visualization)
 def load_file_from_path():
     """Load a JSON file from a specific path"""
     global data, current_file

diff --git a/analysis/json_log_viewer/requirements.txt b/analysis/json_log_viewer/requirements.txt
@@ -0,0 +1,3 @@
+flask
+flask-cors
+werkzeug
diff --git a/configs/free_env.yaml b/configs/free_env.yaml
@@ -20,6 +20,7 @@ task_data:
 
 terminal:
   type: docker
+  # command_timeout: 300  # Max time (seconds) for a single command (default: 300 = 5 min)
 
 agent:
   type: froggy

diff --git a/configs/r2egym.yaml b/configs/r2egym.yaml
@@ -23,6 +23,7 @@ dataset:
 
 terminal:
   type: docker
+  # command_timeout: 300  # Max time (seconds) for a single command (default: 300 = 5 min)
 
 agent:
   type: froggy

diff --git a/configs/simple.yaml b/configs/simple.yaml
@@ -19,6 +19,7 @@ task_data:
 
 terminal:
   type: docker
+  # command_timeout: 300  # Max time (seconds) for a single command (default: 300 = 5 min)
 
 agent:
   type: simple_agent

diff --git a/configs/swebench.yaml b/configs/swebench.yaml
@@ -25,6 +25,7 @@ dataset:
 
 terminal:
   type: docker
+  # command_timeout: 300  # Max time (seconds) for a single command (default: 300 = 5 min)
 
 agent:
   type: froggy

diff --git a/configs/swesmith.yaml b/configs/swesmith.yaml
@@ -22,6 +22,7 @@ dataset:
 
 terminal:
   type: docker
+  # command_timeout: 300  # Max time (seconds) for a single command (default: 300 = 5 min)
 
 agent:
   type: froggy

diff --git a/debug_gym/gym/terminals/docker.py b/debug_gym/gym/terminals/docker.py
@@ -1,5 +1,6 @@
 import atexit
 import os
+import shlex
 import tarfile
 import uuid
 from io import BytesIO
@@ -29,16 +30,25 @@ def __init__(
         base_image: str | None = None,
         registry: str = "",
         setup_commands: list[str] | None = None,
+        command_timeout: int = 300,
         **kwargs,
     ):
         """
-        volumes (dict or list): A dictionary to configure volumes mounted
-                inside the container. The key is either the host path or a
-                volume name, and the value is a dictionary with the keys:
-
-                - ``bind`` The path to mount the volume inside the container
-                - ``mode`` Either ``rw`` to mount the volume read/write, or
-                  ``ro`` to mount it read-only.
+        Args:
+            working_dir: Working directory inside the container.
+            session_commands: Commands to run at the start of each session.
+            env_vars: Environment variables to set in the container.
+            logger: Logger instance.
+            base_image: Docker image to use.
+            registry: Docker registry URL.
+            setup_commands: Commands to run once when setting up the container.
+            command_timeout: Default timeout in seconds for individual command execution
+                (default: 300 = 5 minutes). This is NOT the terminal session lifetime.
+                Commands that exceed this timeout will be killed. Can be configured via YAML:
+                    terminal:
+                        type: docker
+                        command_timeout: 60
+            **kwargs: Additional arguments (ignored with debug log).
         """
         super().__init__(
             working_dir=working_dir,
@@ -50,9 +60,17 @@ def __init__(
         self.base_image = base_image
         self.registry = registry.rstrip("/") + "/" if registry else ""
         self.setup_commands = setup_commands or []
-        self.docker_client = docker.from_env(timeout=600)
+        self.command_timeout = command_timeout
+        self._docker_client = None  # Lazily initialized
         self._container = None
 
+    @property
+    def docker_client(self):
+        """Lazy initialization of Docker client."""
+        if self._docker_client is None:
+            self._docker_client = docker.from_env(timeout=600)
+        return self._docker_client
+
     def _ensure_container_running(self):
         """Verify that the container exists and is running."""
         container = self.container
@@ -111,15 +129,34 @@ def new_shell_session(self):
         self.sessions.append(session)
         return session
 
-    def prepare_command(self, entrypoint: str | list[str]) -> list[str]:
+    def prepare_command(
+        self, entrypoint: str | list[str], timeout: int | None = None
+    ) -> list[str]:
         """Prepares a shell command by combining session commands and entrypoint commands.
-        Then wraps the command in a shell call."""
+        Then wraps the command in a shell call with optional timeout.
+
+        Args:
+            entrypoint: Command(s) to run.
+            timeout: Optional timeout in seconds. If provided, the command is wrapped
+                with the Unix `timeout` command to ensure it doesn't block forever.
+        """
         if isinstance(entrypoint, str):
             entrypoint = [entrypoint]
         if self.session_commands:
             entrypoint = self.session_commands + entrypoint
-        entrypoint = " && ".join(entrypoint)
-        command = ["/bin/bash", "-c", entrypoint]
+        entrypoint_str = " && ".join(entrypoint)
+
+        # Wrap with timeout command if specified
+        if timeout is not None:
+            # Use timeout command to kill the process if it exceeds the limit
+            # Exit code 124 indicates timeout was reached
+            entrypoint_str = (
+                f"timeout {timeout} /bin/bash -c {shlex.quote(entrypoint_str)}"
+            )
+            command = ["/bin/bash", "-c", entrypoint_str]
+        else:
+            command = ["/bin/bash", "-c", entrypoint_str]
+
         return command
 
     def run(
@@ -129,14 +166,27 @@ def run(
         raises: bool = False,
         strip_output: bool = True,
     ) -> tuple[bool, str]:
-        """Run a command in the terminal. Return command status and output."""
-        command = self.prepare_command(entrypoint)
+        """Run a command in the terminal. Return command status and output.
+
+        Args:
+            entrypoint: Command(s) to run.
+            timeout: Timeout in seconds for this command. If the command exceeds this
+                time, it will be killed and the method returns (False, timeout_message).
+                If None, uses self.command_timeout.
+            raises: If True, raise ValueError on command failure.
+            strip_output: If True, strip trailing newlines from output.
+
+        Returns:
+            Tuple of (success, output). Success is False if command failed or timed out.
+        """
+        # Use command_timeout if not specified per-call
+        effective_timeout = timeout if timeout is not None else self.command_timeout
+        command = self.prepare_command(entrypoint, timeout=effective_timeout)
 
-        self.logger.debug(f"Exec run: {command}")
+        self.logger.debug(f"Exec run (timeout={effective_timeout}s): {command}")
 
         self._ensure_container_running()
 
-        # TODO: docker exec_run timeout?
         try:
             status, output = self.container.exec_run(
                 command,
@@ -153,12 +203,25 @@ def run(
             raise UnrecoverableTerminalError(
                 "Docker exec failed due to an unexpected container error."
             ) from exc
-        success = status == 0
 
         output = output.decode()
         if strip_output:
             output = output.strip("\r\n").strip("\n")
 
+        # Check for timeout (exit code 124 from the timeout command)
+        if status == 124:
+            self.logger.warning(
+                f"Command timed out after {effective_timeout}s: {entrypoint}"
+            )
+            timeout_msg = f"Command timed out after {effective_timeout} seconds"
+            if output:
+                output = f"{timeout_msg}\nPartial output:\n{output}"
+            else:
+                output = timeout_msg
+            return False, output
+
+        success = status == 0
+
         if raises and not success:
             # Command includes the entrypoint + session commands
             self.logger.debug(f"Failed to run command `{command}`:\n{output}")
@@ -244,6 +307,13 @@ def clean_up(self):
     def close(self):
         super().close()
         self.clean_up()
+        # Close the Docker client to release connection pool resources
+        if self._docker_client is not None:
+            try:
+                self._docker_client.close()
+            except Exception as exc:
+                self.logger.debug(f"Failed to close Docker client: {exc}")
+            self._docker_client = None
 
     def __str__(self):
         return f"DockerTerminal[{self.container}, {self.working_dir}]"