Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[eval] increase timeout for SWEBench eval init/complete #3829

Merged
merged 7 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,12 +157,14 @@ def initialize_runtime(
action = CmdRunAction(
command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc"""
)
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0

action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down Expand Up @@ -201,18 +203,21 @@ def initialize_runtime(
'/swe_util/',
)
action = CmdRunAction(command='cat ~/.bashrc')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0

action = CmdRunAction(command='source ~/.bashrc')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0

action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -234,6 +239,7 @@ def initialize_runtime(
assert obs.exit_code == 0

action = CmdRunAction(command='git reset --hard')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -242,6 +248,7 @@ def initialize_runtime(
action = CmdRunAction(
command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
)
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down Expand Up @@ -269,18 +276,21 @@ def complete_runtime(
workspace_dir_name = _get_swebench_workspace_dir_name(instance)

action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0

action = CmdRunAction(command='git config --global core.pager ""')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert obs.exit_code == 0

action = CmdRunAction(command='git add -A')
action.timeout = 600
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down
4 changes: 4 additions & 0 deletions openhands/events/action/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
class CmdRunAction(Action):
command: str
thought: str = ''
blocking: bool = False
# If False, the command is run in a non-blocking / interactive way:
# partial command output is returned in the output observation.
# If True, the command is run for at most .timeout seconds.
keep_prompt: bool = True
# if True, the command prompt will be kept in the command output observation
# Example of command output:
Expand Down
5 changes: 5 additions & 0 deletions openhands/events/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,8 @@ def timeout(self) -> int | None:
@timeout.setter
def timeout(self, value: int | None) -> None:
self._timeout = value

# Check if .blocking is an attribute of the event
if hasattr(self, 'blocking'):
# .blocking needs to be set to True if .timeout is set
self.blocking = True
6 changes: 4 additions & 2 deletions openhands/runtime/client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,9 +326,11 @@ async def run(self, action: CmdRunAction) -> CmdOutputObservation:
else:
output, exit_code = self._execute_bash(
command,
timeout=SOFT_TIMEOUT_SECONDS,
timeout=SOFT_TIMEOUT_SECONDS
if not action.blocking
else action.timeout,
keep_prompt=action.keep_prompt,
kill_on_timeout=False,
kill_on_timeout=False if not action.blocking else True,
)
if all_output:
# previous output already exists with prompt "user@hostname:working_dir #""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software architect. Your team has inherited an existing codebase, and
need to finish a project:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software architect. Your team has inherited an existing codebase, and
need to finish a project:
Expand Down Expand Up @@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
actions and observations--more may have happened before that.
They are time-ordered, with your most recent action at the bottom.

[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]

## Format
Your response MUST be in JSON format. It must be an object, and it must contain two fields:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software architect. Your team has inherited an existing codebase, and
need to finish a project:
Expand Down Expand Up @@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
actions and observations--more may have happened before that.
They are time-ordered, with your most recent action at the bottom.

[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}, {"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "/workspace/bad.txt"}}]
[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}, {"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "/workspace/bad.txt"}}]

## Format
Your response MUST be in JSON format. It must be an object, and it must contain two fields:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software engineer. You've inherited an existing codebase, which you
need to modify to complete this task:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software engineer. You've inherited an existing codebase, which you
need to modify to complete this task:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software engineer. You've inherited an existing codebase, which you
need to modify to complete this task:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a quality assurance engineer. Another engineer has made changes to the
codebase which are supposed to solve this task:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a quality assurance engineer. Another engineer has made changes to the
codebase which are supposed to solve this task:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software architect. Your team has inherited an existing codebase, and
need to finish a project:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software architect. Your team has inherited an existing codebase, and
need to finish a project:
Expand Down Expand Up @@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
actions and observations--more may have happened before that.
They are time-ordered, with your most recent action at the bottom.

[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]

## Format
Your response MUST be in JSON format. It must be an object, and it must contain two fields:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software engineer. You've inherited an existing codebase, which you
need to modify to complete this task:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software engineer. You've inherited an existing codebase, which you
need to modify to complete this task:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software engineer. You've inherited an existing codebase, which you
need to modify to complete this task:
Expand Down Expand Up @@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
actions and observations--more may have happened before that.
They are time-ordered, with your most recent action at the bottom.

[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}]
[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}]

## Format
Your response MUST be in JSON format. It must be an object, and it must contain two fields:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a software engineer. You've inherited an existing codebase, which you
need to modify to complete this task:
Expand Down Expand Up @@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
actions and observations--more may have happened before that.
They are time-ordered, with your most recent action at the bottom.

[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}, {"source": "agent", "action": "run", "args": {"command": "./hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "./hello.sh", "exit_code": 0}}]
[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}, {"source": "agent", "action": "run", "args": {"command": "./hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "./hello.sh", "exit_code": 0}}]

## Format
Your response MUST be in JSON format. It must be an object, and it must contain two fields:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a quality assurance engineer. Another engineer has made changes to the
codebase which are supposed to solve this task:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a quality assurance engineer. Another engineer has made changes to the
codebase which are supposed to solve this task:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a quality assurance engineer. Another engineer has made changes to the
codebase which are supposed to solve this task:
Expand Down Expand Up @@ -39,7 +35,7 @@ as well as observations you've made. This only includes the MOST RECENT
actions and observations--more may have happened before that.
They are time-ordered, with your most recent action at the bottom.

[{"source": "agent", "action": "read", "args": {"path": "hello.sh", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "#!/bin/bash\n\necho 'hello'\n", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "bash hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "bash hello.sh", "exit_code": 0}}]
[{"source": "agent", "action": "read", "args": {"path": "hello.sh", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "#!/bin/bash\n\necho 'hello'\n", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "bash hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "bash hello.sh", "exit_code": 0}}]

## Format
Your response MUST be in JSON format. It must be an object, and it must contain two fields:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are in charge of accomplishing the following task:
Write a git commit message for the current staging area. Do not ask me for confirmation at any point.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a responsible software engineer and always write good commit messages.

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@


----------

# Task
You are a responsible software engineer and always write good commit messages.

Expand All @@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
actions and observations--more may have happened before that.
They are time-ordered, with your most recent action at the bottom.

[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}]
[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}]

If the last item in the history is an error, you should try to fix it.

Expand Down
Loading
Loading