All-Hands-AI · tofarr · Feb 3, 2025 · Jan 31, 2025 · Jan 31, 2025 · Jan 31, 2025
diff --git a/evaluation/benchmarks/EDA/run_infer.py b/evaluation/benchmarks/EDA/run_infer.py
@@ -69,6 +69,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=False,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -53,6 +53,7 @@ def get_config(
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             keep_runtime_alive=False,
             remote_runtime_init_timeout=3600,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -61,6 +61,7 @@ def get_config(
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             keep_runtime_alive=False,
             remote_runtime_init_timeout=1800,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py
@@ -67,6 +67,7 @@ def get_config(
             base_container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
@@ -80,6 +80,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/browsing_delegation/run_infer.py b/evaluation/benchmarks/browsing_delegation/run_infer.py
@@ -45,6 +45,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=False,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         workspace_base=None,
         workspace_mount_path=None,

diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -135,6 +135,7 @@ def get_config(
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             keep_runtime_alive=False,
             remote_runtime_init_timeout=3600,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/discoverybench/run_infer.py b/evaluation/benchmarks/discoverybench/run_infer.py
@@ -71,6 +71,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py
@@ -56,6 +56,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/gorilla/run_infer.py b/evaluation/benchmarks/gorilla/run_infer.py
@@ -49,6 +49,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/gpqa/run_infer.py b/evaluation/benchmarks/gpqa/run_infer.py
@@ -70,6 +70,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -91,6 +91,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/logic_reasoning/run_infer.py b/evaluation/benchmarks/logic_reasoning/run_infer.py
@@ -55,6 +55,7 @@ def get_config(
             enable_auto_lint=True,
             use_host_network=False,
             runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke',
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/miniwob/run_infer.py b/evaluation/benchmarks/miniwob/run_infer.py
@@ -70,6 +70,7 @@ def get_config(
             remote_runtime_init_timeout=1800,
             keep_runtime_alive=False,
             timeout=120,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
@@ -113,6 +113,7 @@ def get_config(
             enable_auto_lint=True,
             use_host_network=False,
             runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py
@@ -73,6 +73,7 @@ def get_config(
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
             keep_runtime_alive=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -144,6 +144,7 @@ def get_config(
                 dataset_name=metadata.dataset,
                 instance_id=instance['instance_id'],
             ),
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py
@@ -50,6 +50,7 @@ def get_config(
             # large enough timeout, since some testcases take very long to run
             timeout=300,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_enable_retries=True,
         ),
         # we mount trajectories path so that trajectories, generated by OpenHands
         # controller, can be accessible to the evaluator file in the runtime container

diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py
@@ -50,6 +50,7 @@ def get_config(
             base_container_image='python:3.12-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/visualwebarena/run_infer.py b/evaluation/benchmarks/visualwebarena/run_infer.py
@@ -79,6 +79,7 @@ def get_config(
                 'VWA_HOMEPAGE': f'{base_url}:4399',
             },
             timeout=300,
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/benchmarks/webarena/run_infer.py b/evaluation/benchmarks/webarena/run_infer.py
@@ -71,6 +71,7 @@ def get_config(
                 'MAP': f'{base_url}:3000',
                 'HOMEPAGE': f'{base_url}:4399',
             },
+            remote_runtime_enable_retries=True,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/openhands/core/config/sandbox_config.py b/openhands/core/config/sandbox_config.py
@@ -51,6 +51,7 @@ class SandboxConfig(BaseModel):
     timeout: int = Field(default=120)
     remote_runtime_init_timeout: int = Field(default=180)
     remote_runtime_api_timeout: int = Field(default=10)
+    remote_runtime_enable_retries: bool = Field(default=False)
     enable_auto_lint: bool = Field(
         default=False  # once enabled, OpenHands would lint files after editing
     )

diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py
@@ -291,7 +291,7 @@ def _wait_until_alive(self):
             stop=tenacity.stop_after_delay(
                 self.config.sandbox.remote_runtime_init_timeout
             )
-            | stop_if_should_exit(),
+            | stop_if_should_exit() | self._stop_if_closed,
             reraise=True,
             retry=tenacity.retry_if_exception_type(AgentRuntimeNotReadyError),
             wait=tenacity.wait_fixed(2),
@@ -388,12 +388,18 @@ def _send_runtime_api_request(self, method, url, **kwargs):
             )
             raise
 
-    @tenacity.retry(
-        retry=tenacity.retry_if_exception_type(ConnectionError),
-        stop=tenacity.stop_after_attempt(3) | stop_if_should_exit(),
-        wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
-    )
     def _send_action_server_request(self, method, url, **kwargs):
+        if not self.config.sandbox.remote_runtime_enable_retries:
+            return self._send_action_server_request(method, url, **kwargs)
+
+        retry_decorator = tenacity.retry(
+            retry=tenacity.retry_if_exception_type(ConnectionError),
+            stop=tenacity.stop_after_attempt(3) | stop_if_should_exit() | self._stop_if_closed,
+            wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
+        )
+        return retry_decorator(self._send_action_server_request_impl)(method, url, **kwargs)
+
+    def _send_action_server_request_impl(self, method, url, **kwargs):
         try:
             return super()._send_action_server_request(method, url, **kwargs)
         except requests.Timeout:
@@ -424,3 +430,6 @@ def _send_action_server_request(self, method, url, **kwargs):
                     ) from e
             else:
                 raise e
+
+    def _stop_if_closed(self, retry_state: tenacity.RetryCallState) -> bool:
+        return self._runtime_closed
diff --git a/openhands/server/conversation_manager/standalone_conversation_manager.py b/openhands/server/conversation_manager/standalone_conversation_manager.py
@@ -150,7 +150,7 @@ async def _cleanup_stale(self):
                 )
                 return
             except Exception as e:
-                logger.warning(f'error_cleaning_stale: {str(e)}')
+                logger.error(f'error_cleaning_stale')
                 await asyncio.sleep(_CLEANUP_INTERVAL)
 
     async def get_running_agent_loops(