Merge OpenHands #1

Merged · 56 commits · Dec 6, 2024

Changes from all commits
7afdf06
Update e2b (#5321)
enyst Nov 29, 2024
4c432d3
Fix slack link in docs (#5329)
mamoodi Nov 29, 2024
b156b23
[Resolver] API Retry on guess success (#5187)
malhotra5 Nov 30, 2024
6ee9028
Fix issue #5337: [Bug]: lint-fix workflow is failing frontend and pyt…
openhands-agent Dec 1, 2024
eb5f4f5
docs: add hyperlinks to directories and improve navigation (#5178)
young010101 Dec 1, 2024
6a79f19
Clicking row should select file (#5312)
mamoodi Dec 1, 2024
afc94a2
Microagent to fix issues with npm (#5314)
neubig Dec 1, 2024
5672a31
fix: Update frontend workflows to catch TypeScript errors (#5347)
neubig Dec 1, 2024
59c57ac
Fix issue #5263: [Bug]: resolver example should use "max_iterations: …
openhands-agent Dec 1, 2024
64a7fef
[Resolver]: Support custom sandbox (#5348)
malhotra5 Dec 1, 2024
3e4220b
chore: Handle eslint warnings (#5253)
amanape Dec 1, 2024
3e49f0f
Fix issue #5277: [Bug]: AttributeError: 'EventStreamRuntime' object h…
openhands-agent Dec 1, 2024
b9b6cfd
refactor(frontend) Refactor and move components (#5290)
amanape Dec 2, 2024
4b63378
chore(deps-dev): bump typescript from 5.6.3 to 5.7.2 in /docs in the …
dependabot[bot] Dec 2, 2024
cd22817
Switch dependency to browsergym-core (#5242)
enyst Dec 2, 2024
809b58d
Fix issue #5086: [Bug]: resolver: Error finding issue with empty desc…
openhands-agent Dec 2, 2024
d96118a
Fix issue #5363: [Bug]: Slack Invite Link Expired (#5364)
openhands-agent Dec 2, 2024
96c429d
feat(frontend): Add default error handling to queries and mutations (…
amanape Dec 2, 2024
5069a87
feat(frontend): Integrate `axios` for client requests (#5255)
amanape Dec 2, 2024
a378ff0
chore(frontend): Migrate from Remix to React Router 7 (#5304)
amanape Dec 2, 2024
92b38dc
Change the default value of keep_runtime_alive from True to False (#5…
tofarr Dec 2, 2024
871c544
fix: asyncio issues with security analyzer + enable security analyzer…
mbalunovic Dec 2, 2024
2f11634
Add comprehensive analytics tracking (#5271)
xingyaoww Dec 2, 2024
990f277
misc: Support folder-level exp analysis for SWE-Bench `summarize_outp…
xingyaoww Dec 3, 2024
bf2688d
[Resolver][Bug]: Fix success list to str representation bug (#5351)
malhotra5 Dec 3, 2024
d0b5dd3
feat: display exact error for runtime requests exception handling (#5…
xingyaoww Dec 3, 2024
0dde160
Feat: Multi tab support! (#5370)
tofarr Dec 3, 2024
43e074c
fix(frontend): Reduce delta calculating message rate and fix modal im…
amanape Dec 3, 2024
05cc6d4
docs: align docstrings with Google style (#5328)
young010101 Dec 3, 2024
26a38fc
Fix 401 on exit project (#5388)
tofarr Dec 3, 2024
f07a4c6
chore(deps-dev): bump llama-index from 0.12.1 to 0.12.2 in the llama …
dependabot[bot] Dec 3, 2024
1b8104b
fix requests in error (#5389)
rbren Dec 3, 2024
438f19c
fix(frontend): auth logic (#5390)
amanape Dec 3, 2024
d617f6f
fix download zip (#5393)
rbren Dec 3, 2024
793e142
Show all actions in the message window (#5190)
rbren Dec 3, 2024
9908e1b
[Evaluation]: Log openhands version in eval output folder, instead of…
xingyaoww Dec 4, 2024
851d885
Release 0.15.0 (#5402)
mamoodi Dec 4, 2024
c5117bc
Upgrade `openhands-aci` to v0.1.2 (#5397)
ryanhoangt Dec 4, 2024
8f47547
docs: fix markdown linting and broken links (#5401)
young010101 Dec 4, 2024
3314b97
Fix e2b import (#5409)
enyst Dec 4, 2024
9aa89e8
Fix: Only send the last agent state changed event (#5411)
tofarr Dec 4, 2024
794408c
Fix issue #5383: [Bug]: LLM Cost is added to the `metrics` twice (#5396)
openhands-agent Dec 4, 2024
ceb60b9
Prioritize version from pyproject.toml (#5412)
tofarr Dec 4, 2024
786cde3
chore(deps): bump react-icons from 5.3.0 to 5.4.0 in /docs in the ver…
dependabot[bot] Dec 5, 2024
83b9478
docs: Update CodeAct agent documentation (#5418)
neubig Dec 5, 2024
7ec407d
chore(frontend): Update `msw` (#5367)
amanape Dec 5, 2024
ea96ffc
fix messages (#5421)
rbren Dec 5, 2024
910b2a9
chore(frontend): Remove initial analytics modal and update waitlist m…
amanape Dec 5, 2024
027c642
Fix duplicate events on reinit (#5424)
tofarr Dec 5, 2024
3d853f7
chore(deps-dev): bump chromadb from 0.5.20 to 0.5.23 in the chromadb …
dependabot[bot] Dec 5, 2024
c3ddb26
Feat named imports (#5413)
tofarr Dec 5, 2024
1146b62
Support multiline and default user messages (#5400)
enyst Dec 5, 2024
de81020
Feat: Introduce class for SessionInitData rather than using a dict (#…
tofarr Dec 5, 2024
e816231
Fix finish action (#5428)
enyst Dec 6, 2024
2df4267
issue/4599-Add cursor position information on the bottom of the edito…
STF-Zero Dec 6, 2024
f4ee3a4
e2b take two (#5433)
enyst Dec 6, 2024
3 changes: 3 additions & 0 deletions .github/workflows/fe-unit-tests.yml
@@ -35,6 +35,9 @@ jobs:
- name: Install dependencies
working-directory: ./frontend
run: npm ci
- name: Run TypeScript compilation
working-directory: ./frontend
run: npm run make-i18n && tsc
- name: Run tests and collect coverage
working-directory: ./frontend
run: npm run test:coverage
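
The added step type-checks the frontend before the unit tests run. To reproduce it locally, something like the following should work (a sketch: `make-i18n` is the script name the workflow invokes, and `npx` is used here so the project-local TypeScript binary is picked up):

```bash
# Sketch: run the same i18n generation + TypeScript check as CI
cd frontend
npm ci                        # install exact locked dependencies
npm run make-i18n && npx tsc  # generate i18n files, then type-check
```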
4 changes: 2 additions & 2 deletions .github/workflows/ghcr-build.yml
@@ -291,7 +291,7 @@ jobs:
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
RUN_AS_OPENHANDS=false \
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:
@@ -368,7 +368,7 @@ jobs:
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
RUN_AS_OPENHANDS=true \
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
env:
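
Both runtime-test jobs now exclude `tests/runtime/test_browsergym_envs.py`. If you want to exercise that suite on its own, here is a sketch that reuses the job's variables (`$image_name` stands in for the runtime image tag, as in the workflow):

```bash
# Sketch: run only the BrowserGym runtime tests that CI now skips
SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
TEST_IN_CI=true \
RUN_AS_OPENHANDS=false \
poetry run pytest -s ./tests/runtime/test_browsergym_envs.py
```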
39 changes: 34 additions & 5 deletions .github/workflows/lint-fix.yml
@@ -5,9 +5,10 @@ on:
types: [labeled]

jobs:
lint-fix:
# Frontend lint fixes
lint-fix-frontend:
if: github.event.label.name == 'lint-fix'
name: Fix linting issues
name: Fix frontend linting issues
runs-on: ubuntu-latest
permissions:
contents: write
@@ -20,7 +21,6 @@ jobs:
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}

# Frontend lint fixes
- name: Install Node.js 20
uses: actions/setup-node@v4
with:
@@ -34,7 +34,36 @@ jobs:
cd frontend
npm run lint:fix
# Python lint fixes
# Commit and push changes if any
- name: Check for changes
id: git-check
run: |
git diff --quiet || echo "changes=true" >> $GITHUB_OUTPUT
- name: Commit and push if there are changes
if: steps.git-check.outputs.changes == 'true'
run: |
git config --local user.email "openhands@all-hands.dev"
git config --local user.name "OpenHands Bot"
git add -A
git commit -m "🤖 Auto-fix frontend linting issues"
git push
# Python lint fixes
lint-fix-python:
if: github.event.label.name == 'lint-fix'
name: Fix Python linting issues
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
steps:
- uses: actions/checkout@v4
with:
ref: ${{ github.head_ref }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}

- name: Set up python
uses: actions/setup-python@v5
with:
@@ -58,5 +87,5 @@ jobs:
git config --local user.email "openhands@all-hands.dev"
git config --local user.name "OpenHands Bot"
git add -A
git commit -m "🤖 Auto-fix linting issues"
git commit -m "🤖 Auto-fix Python linting issues"
git push
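
Both jobs fire on the same `labeled` event, so adding the `lint-fix` label to a PR now kicks off the frontend and Python fixers in parallel. A sketch with the GitHub CLI (the PR number is an example):

```bash
# Sketch: trigger both lint-fix jobs on a pull request
gh pr edit 1234 --add-label "lint-fix"
```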
3 changes: 2 additions & 1 deletion .github/workflows/lint.yml
@@ -30,10 +30,11 @@ jobs:
run: |
cd frontend
npm install --frozen-lockfile
- name: Lint
- name: Lint and TypeScript compilation
run: |
cd frontend
npm run lint
npm run make-i18n && tsc
# Run lint on the python code
lint-python:
6 changes: 6 additions & 0 deletions .github/workflows/openhands-resolver.yml
@@ -16,6 +16,11 @@ on:
type: string
default: "main"
description: "Target branch to pull and create PR against"
base_container_image:
required: false
type: string
default: ""
description: "Custom sandbox env"
secrets:
LLM_MODEL:
required: true
@@ -139,6 +144,7 @@ jobs:
echo "MAX_ITERATIONS=${{ inputs.max_iterations || 50 }}" >> $GITHUB_ENV
echo "SANDBOX_ENV_GITHUB_TOKEN=${{ secrets.GITHUB_TOKEN }}" >> $GITHUB_ENV
echo "SANDBOX_ENV_BASE_CONTAINER_IMAGE=${{ inputs.base_container_image }}" >> $GITHUB_ENV
# Set branch variables
echo "TARGET_BRANCH=${{ inputs.target_branch }}" >> $GITHUB_ENV
12 changes: 6 additions & 6 deletions CONTRIBUTING.md
@@ -21,14 +21,14 @@ There are many ways that you can contribute:

1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.
3. **Improve the Codebase** by sending [PRs](#sending-pull-requests-to-openhands) (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.

## What can I build?
Here are a few ways you can help improve the codebase.

#### UI/UX
We're always looking to improve the look and feel of the application. If you've got a small fix
for something that's bugging you, feel free to open up a PR that changes the `./frontend` directory.
for something that's bugging you, feel free to open up a PR that changes the [`./frontend`](./frontend) directory.

If you're looking to make a bigger change, add a new UI element, or significantly alter the style
of the application, please open an issue first, or better, join the #frontend channel in our Slack
@@ -46,7 +46,7 @@ We use the [SWE-bench](https://www.swebench.com/) benchmark to test our agent. Y
channel in Slack to learn more.

#### Adding a new agent
You may want to experiment with building new types of agents. You can add an agent to `openhands/agenthub`
You may want to experiment with building new types of agents. You can add an agent to [`openhands/agenthub`](./openhands/agenthub)
to help expand the capabilities of OpenHands.

#### Adding a new runtime
@@ -57,8 +57,8 @@ If you work for a company that provides a cloud-based runtime, you could help us
by implementing the [interface specified here](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/base.py).

#### Testing
When you write code, it is also good to write tests. Please navigate to the `tests` folder to see existing test suites.
At the moment, we have two kinds of tests: `unit` and `integration`. Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.
When you write code, it is also good to write tests. Please navigate to the [`./tests`](./tests) folder to see existing test suites.
At the moment, we have two kinds of tests: [`unit`](./tests/unit) and [`integration`](./evaluation/integration_tests). Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.

## Sending Pull Requests to OpenHands

@@ -103,7 +103,7 @@ Further, if you see an issue you like, please leave a "thumbs-up" or a comment,

### Making Pull Requests

We're generally happy to consider all PRs, with the evaluation process varying based on the type of change:
We're generally happy to consider all [PRs](https://github.com/All-Hands-AI/OpenHands/pulls), with the evaluation process varying based on the type of change:

#### For Small Improvements

2 changes: 1 addition & 1 deletion Development.md
@@ -100,7 +100,7 @@ poetry run pytest ./tests/unit/test_*.py
To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.14-nikolaik`
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.15-nikolaik`
## Develop inside Docker container
10 changes: 5 additions & 5 deletions README.md
@@ -12,7 +12,7 @@
<a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue"></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License"></a>
<br/>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2tom0er4l-JeNUGHt_AxpEfIBstbLPiw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
<a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits"></a>
<br/>
@@ -38,16 +38,16 @@ See the [Installation](https://docs.all-hands.dev/modules/usage/installation) gu
system requirements and more information.

```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik

docker run -it --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.14
docker.all-hands.dev/all-hands-ai/openhands:0.15
```

You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
@@ -82,7 +82,7 @@ troubleshooting resources, and advanced configuration options.
OpenHands is a community-driven project, and we welcome contributions from everyone. We do most of our communication
through Slack, so this is the best place to start, but we are also happy to have you contact us on Discord or GitHub:

- [Join our Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2tom0er4l-JeNUGHt_AxpEfIBstbLPiw) - Here we talk about research, architecture, and future development.
- [Join our Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg) - Here we talk about research, architecture, and future development.
- [Join our Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback.
- [Read or post GitHub Issues](https://github.com/All-Hands-AI/OpenHands/issues) - Check out the issues we're working on, or add your own ideas.

2 changes: 1 addition & 1 deletion compose.yml
@@ -7,7 +7,7 @@ services:
image: openhands:latest
container_name: openhands-app-${DATE:-}
environment:
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.14-nikolaik}
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.15-nikolaik}
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
ports:
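
Because compose uses `${VAR:-default}` substitution, the new 0.15 default can still be overridden per run without editing the file. A minimal sketch (the tag is an example):

```bash
# Sketch: override the default runtime image for a single compose run
SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:my-custom-tag \
docker compose up
```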
10 changes: 5 additions & 5 deletions config.template.toml
@@ -95,10 +95,10 @@ workspace_base = "./workspace"
# AWS secret access key
#aws_secret_access_key = ""

# API key to use
# API key to use (For Headless / CLI only - In Web this is overridden by Session Init)
api_key = "your-api-key"

# API base URL
# API base URL (For Headless / CLI only - In Web this is overridden by Session Init)
#base_url = ""

# API version
@@ -131,7 +131,7 @@ embedding_model = "local"
# Maximum number of output tokens
#max_output_tokens = 0

# Model to use
# Model to use. (For Headless / CLI only - In Web this is overridden by Session Init)
model = "gpt-4o"

# Number of retries to attempt when an operation fails with the LLM.
@@ -237,10 +237,10 @@ llm_config = 'gpt3'
##############################################################################
[security]

# Enable confirmation mode
# Enable confirmation mode (For Headless / CLI only - In Web this is overridden by Session Init)
#confirmation_mode = false

# The security analyzer to use
# The security analyzer to use (For Headless / CLI only - In Web this is overridden by Session Init)
#security_analyzer = ""

#################################### Eval ####################################
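
The clarified comments all make the same point: these values are read only in headless/CLI runs, while the Web UI supplies its own via Session Init. A sketch of a headless run that picks them up from `config.toml` (the task string is the one used elsewhere in these docs):

```bash
# Sketch: headless run that reads model and api_key from config.toml
poetry run python -m openhands.core.main -t "write a bash script that prints hi"
```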
2 changes: 1 addition & 1 deletion containers/dev/compose.yml
@@ -11,7 +11,7 @@ services:
- BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
- SANDBOX_API_HOSTNAME=host.docker.internal
#
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.14-nikolaik}
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.15-nikolaik}
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
ports:
@@ -27,7 +27,7 @@ For more details, please see [this document](https://github.com/All-H

We have both a Slack workspace for collaborating on building OpenHands and a Discord server for discussing anything related to, for example, this project, LLMs, agents, etc.

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA)
- [Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg)
- [Discord server](https://discord.gg/ESHStjSjD4)

If you would like to contribute, feel free to join our community. Let's simplify software engineering together!
@@ -27,7 +27,7 @@ OpenHands is a community-driven project, and we welcome contributions from everyone.

We have a Slack workspace for collaborating on building OpenHands, as well as a Discord server for discussing anything related, such as this project, LLMs, and agents.

- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA)
- [Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg)
- [Discord server](https://discord.gg/ESHStjSjD4)

If you would like to contribute, feel free to join our community. Let's simplify software engineering together!
4 changes: 2 additions & 2 deletions docs/modules/usage/how-to/cli-mode.md
@@ -50,7 +50,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -59,7 +59,7 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.14 \
docker.all-hands.dev/all-hands-ai/openhands:0.15 \
python -m openhands.core.cli
```

17 changes: 13 additions & 4 deletions docs/modules/usage/how-to/github-action.md
@@ -37,12 +37,15 @@ the [README for the OpenHands Resolver](https://github.com/All-Hands-AI/OpenHand

You can provide custom directions for OpenHands by following the [README for the resolver](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/resolver/README.md#providing-custom-instructions).

### Configure custom macro
### Custom configurations

To customize the default macro (`@openhands-agent`):
The GitHub resolver will automatically check for valid [repository secrets](https://docs.github.com/en/actions/security-for-github-actions/security-guides/using-secrets-in-github-actions?tool=webui#creating-secrets-for-a-repository) or [repository variables](https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#creating-configuration-variables-for-a-repository) to customize its behavior. The customization options you can set are:

1. [Create a repository variable](https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#creating-configuration-variables-for-a-repository) named `OPENHANDS_MACRO`
2. Assign the variable a custom value
| **Attribute name** | **Type** | **Purpose** | **Example** |
| -------------------------------- | -------- | --------------------------------------------------------------------------------------------------- | ----------------------------------------------- |
| `OPENHANDS_MAX_ITER` | Variable | Set max limit for agent iterations | `OPENHANDS_MAX_ITER=10` |
| `OPENHANDS_MACRO` | Variable | Customize default macro for invoking the resolver | `OPENHANDS_MACRO=@resolveit` |
| `OPENHANDS_BASE_CONTAINER_IMAGE` | Variable | Custom Sandbox ([learn more](https://docs.all-hands.dev/modules/usage/how-to/custom-sandbox-guide)) | `OPENHANDS_BASE_CONTAINER_IMAGE="custom_image"` |
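
These are plain repository variables, so they can be set from the GitHub CLI as well as the web UI. A sketch using the example values from the table:

```bash
# Sketch: set the resolver customization variables via the GitHub CLI
gh variable set OPENHANDS_MAX_ITER --body "10"
gh variable set OPENHANDS_MACRO --body "@resolveit"
gh variable set OPENHANDS_BASE_CONTAINER_IMAGE --body "custom_image"
```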

## Writing Effective .openhands_instructions Files

@@ -55,6 +58,7 @@ The `.openhands_instructions` file is a file that you can put in the root direct
2. **Repository Structure**: Explain the key directories and their purposes, especially highlighting where different types of code (e.g., frontend, backend) are located.

3. **Development Workflows**: Document the essential commands for:

- Building and setting up the project
- Running tests
- Linting and code quality checks
@@ -69,24 +73,29 @@ The `.openhands_instructions` file is a file that you can put in the root direct

```markdown
# Repository Overview

[Brief description of the project]

## General Setup

- Main build command
- Development environment setup
- Pre-commit checks

## Backend

- Location and structure
- Testing instructions
- Environment requirements

## Frontend

- Setup prerequisites
- Build and test commands
- Environment variables

## Additional Guidelines

- Code style requirements
- Special considerations
- Common workflows
4 changes: 2 additions & 2 deletions docs/modules/usage/how-to/headless-mode.md
@@ -44,7 +44,7 @@ LLM_API_KEY="sk_test_12345"
```bash
docker run -it \
--pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
-e SANDBOX_USER_ID=$(id -u) \
-e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-e LLM_API_KEY=$LLM_API_KEY \
@@ -54,6 +54,6 @@ docker run -it \
-v /var/run/docker.sock:/var/run/docker.sock \
--add-host host.docker.internal:host-gateway \
--name openhands-app-$(date +%Y%m%d%H%M%S) \
docker.all-hands.dev/all-hands-ai/openhands:0.14 \
docker.all-hands.dev/all-hands-ai/openhands:0.15 \
python -m openhands.core.main -t "write a bash script that prints hi"
```
6 changes: 3 additions & 3 deletions docs/modules/usage/installation.mdx
@@ -11,16 +11,16 @@
The easiest way to run OpenHands is in Docker.

```bash
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik
docker pull docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik

docker run -it --rm --pull=always \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.14-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
-e LOG_ALL_EVENTS=true \
-v /var/run/docker.sock:/var/run/docker.sock \
-p 3000:3000 \
--add-host host.docker.internal:host-gateway \
--name openhands-app \
docker.all-hands.dev/all-hands-ai/openhands:0.14
docker.all-hands.dev/all-hands-ai/openhands:0.15
```

You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), or using the [OpenHands GitHub Action](https://docs.all-hands.dev/modules/usage/how-to/github-action).
2 changes: 1 addition & 1 deletion docs/modules/usage/runtimes.md
@@ -16,7 +16,7 @@ some flags being passed to `docker run` that make this possible:

```
docker run # ...
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.11-nikolaik \
-e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.15-nikolaik \
-v /var/run/docker.sock:/var/run/docker.sock \
# ...
```
18 changes: 10 additions & 8 deletions docs/package-lock.json
4 changes: 2 additions & 2 deletions docs/package.json
@@ -24,14 +24,14 @@
"prism-react-renderer": "^2.4.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-icons": "^5.3.0",
"react-icons": "^5.4.0",
"react-use": "^17.5.1"
},
"devDependencies": {
"@docusaurus/module-type-aliases": "^3.5.1",
"@docusaurus/tsconfig": "^3.6.3",
"@docusaurus/types": "^3.5.1",
"typescript": "~5.6.3"
"typescript": "~5.7.2"
},
"browserslist": {
"production": [
2 changes: 1 addition & 1 deletion docs/src/components/CustomFooter.tsx
@@ -8,7 +8,7 @@ function CustomFooter() {
<footer className="custom-footer">
<div className="footer-content">
<div className="footer-icons">
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA" target="_blank" rel="noopener noreferrer">
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg" target="_blank" rel="noopener noreferrer">
<FaSlack />
</a>
<a href="https://discord.gg/ESHStjSjD4" target="_blank" rel="noopener noreferrer">
2 changes: 1 addition & 1 deletion docs/src/components/HomepageHeader/HomepageHeader.tsx
@@ -23,7 +23,7 @@ export function HomepageHeader() {
<a href="https://codecov.io/github/All-Hands-AI/OpenHands?branch=main"><img alt="CodeCov" src="https://img.shields.io/codecov/c/github/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" /></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE"><img src="https://img.shields.io/github/license/All-Hands-AI/OpenHands?style=for-the-badge&color=blue" alt="MIT License" /></a>
<br/>
<a href="https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
<a href="https://join.slack.com/t/openhands-ai/shared_invite/zt-2vbfigwev-G03twSpXaErwzYVD4CFiBg"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community" /></a>
<a href="https://discord.gg/ESHStjSjD4"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community" /></a>
<a href="https://github.com/All-Hands-AI/OpenHands/blob/main/CREDITS.md"><img src="https://img.shields.io/badge/Project-Credits-blue?style=for-the-badge&color=FFE165&logo=github&logoColor=white" alt="Credits" /></a>
<br/>
10,172 changes: 0 additions & 10,172 deletions docs/yarn.lock

This file was deleted.
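
With `docs/yarn.lock` deleted and `docs/package-lock.json` updated alongside it, npm appears to be the single package manager for the docs site now. A sketch of the matching local workflow:

```bash
# Sketch: install docs dependencies from the npm lockfile
cd docs
npm ci
```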

7 changes: 3 additions & 4 deletions evaluation/benchmarks/EDA/README.md
@@ -4,12 +4,10 @@ This folder contains evaluation harness for evaluating agents on the Entity-dedu

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.

Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Start the evaluation


```bash
export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation)
./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit]
@@ -37,7 +35,8 @@ For example,
```

## Reference
```

```bibtex
@inproceedings{zhang2023entity,
title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games},
author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep},
9 changes: 3 additions & 6 deletions evaluation/benchmarks/EDA/scripts/run_infer.sh
@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

if [ -z "$DATASET" ]; then
echo "Dataset not specified, use default 'things'"
@@ -34,12 +34,9 @@ if [ -z "$OPENAI_API_KEY" ]; then
exit 1
fi

# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"

@@ -51,7 +48,7 @@ COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
--max-iterations 20 \
--OPENAI_API_KEY $OPENAI_API_KEY \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${DATASET}"
--eval-note ${OPENHANDS_VERSION}_${DATASET}"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
2 changes: 1 addition & 1 deletion evaluation/benchmarks/agent_bench/README.md
@@ -4,7 +4,7 @@ This folder contains evaluation harness for evaluating agents on the [AgentBench

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Start the evaluation

6 changes: 3 additions & 3 deletions evaluation/benchmarks/agent_bench/scripts/run_infer.sh
@@ -20,18 +20,18 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
2 changes: 1 addition & 1 deletion evaluation/benchmarks/aider_bench/README.md
@@ -10,7 +10,7 @@ Hugging Face dataset based on the

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local
Please follow the instructions [here](../../README.md#setup) to set up your local
development environment and LLM.

## Start the evaluation
6 changes: 3 additions & 3 deletions evaluation/benchmarks/aider_bench/scripts/run_infer.sh
@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE=$AGENT_VERSION
EVAL_NOTE=$OPENHANDS_VERSION

# Default to NOT use unit tests.
if [ -z "$USE_UNIT_TESTS" ]; then
7 changes: 4 additions & 3 deletions evaluation/benchmarks/biocoder/README.md
@@ -4,13 +4,14 @@ Implements evaluation of agents on BioCoder from the BioCoder benchmark introduc

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## BioCoder Docker Image

In the openhands branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenHands environment. In the Docker image are testing scripts (`/testing/start_test_openhands.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime.

**Before first execution, pull our Docker image with the following command**

```bash
docker pull public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0
```
@@ -19,7 +20,6 @@ To reproduce this image, please see the Dockerfile_Openopenhands in the `biocode

## Start the evaluation


```bash
./evaluation/benchmarks/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
```
@@ -47,7 +47,8 @@ with current OpenHands version, then your command would be:
```

## Reference
```

```bibtex
@misc{tang2024biocoder,
title={BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models},
author={Xiangru Tang and Bill Qian and Rick Gao and Jiakang Chen and Xinyun Chen and Mark Gerstein},
6 changes: 3 additions & 3 deletions evaluation/benchmarks/biocoder/scripts/run_infer.sh
@@ -21,10 +21,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"

@@ -33,7 +33,7 @@ COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${DATASET}"
--eval-note ${OPENHANDS_VERSION}_${DATASET}"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
5 changes: 2 additions & 3 deletions evaluation/benchmarks/bird/README.md
@@ -4,7 +4,7 @@ Implements evaluation of agents on BIRD introduced in [Can LLM Already Serve as

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Run Inference on Bird

@@ -22,8 +22,7 @@ like to evaluate. It could also be a release tag like `0.6.2`.

For each problem, OpenHands is given a set number of iterations to fix the failing code. The history field shows each iteration's attempt to correct code that fails any test case.


```
```json
{
"task_id": "0",
"instruction": "You are a SQL expert and need to complete the following text-to-SQL tasks.\n\nCREATE TABLE frpm\n(\n CDSCode TEXT not null\n primary key,\n `Academic Year` TEXT null,\n `County Code` TEXT null,\n `District Code` INTEGER null,\n `School Code` TEXT null,\n `County Name` TEXT null,\n `District Name` TEXT null,\n `School Name` TEXT null,\n `District Type` TEXT null,\n `School Type` TEXT null,\n `Educational Option Type` TEXT null,\n `NSLP Provision Status` TEXT null,\n `Charter School (Y/N)` INTEGER null,\n `Charter School Number` TEXT null,\n `Charter Funding Type` TEXT null,\n IRC INTEGER null,\n `Low Grade` TEXT null,\n `High Grade` TEXT null,\n `Enrollment (K-12)` REAL null,\n `Free Meal Count (K-12)` REAL null,\n `Percent (%) Eligible Free (K-12)` REAL null,\n `FRPM Count (K-12)` REAL null,\n `Percent (%) Eligible FRPM (K-12)` REAL null,\n `Enrollment (Ages 5-17)` REAL null,\n `Free Meal Count (Ages 5-17)` REAL null,\n `Percent (%) Eligible Free (Ages 5-17)` REAL null,\n `FRPM Count (Ages 5-17)` REAL null,\n `Percent (%) Eligible FRPM (Ages 5-17)` REAL null,\n `2013-14 CALPADS Fall 1 Certification Status` INTEGER null,\n foreign key (CDSCode) references schools (CDSCode)\n);\n\nCREATE TABLE satscores\n(\n cds TEXT not null\n primary key,\n rtype TEXT not null,\n sname TEXT null,\n dname TEXT null,\n cname TEXT null,\n enroll12 INTEGER not null,\n NumTstTakr INTEGER not null,\n AvgScrRead INTEGER null,\n AvgScrMath INTEGER null,\n AvgScrWrite INTEGER null,\n NumGE1500 INTEGER null,\n-- PctGE1500 double null,\n foreign key (cds) references schools (CDSCode)\n);\n\nCREATE TABLE schools\n(\n CDSCode TEXT not null\n primary key,\n NCESDist TEXT null,\n NCESSchool TEXT null,\n StatusType TEXT not null,\n County TEXT not null,\n District TEXT not null,\n School TEXT null,\n Street TEXT null,\n StreetAbr TEXT null,\n City TEXT null,\n Zip TEXT null,\n State TEXT null,\n MailStreet TEXT null,\n MailStrAbr TEXT null,\n MailCity TEXT null,\n MailZip TEXT null,\n MailState TEXT null,\n Phone TEXT null,\n Ext TEXT null,\n Website TEXT null,\n OpenDate DATE null,\n ClosedDate DATE null,\n Charter INTEGER null,\n CharterNum TEXT null,\n FundingType TEXT null,\n DOC TEXT not null,\n DOCType TEXT not null,\n SOC TEXT null,\n SOCType TEXT null,\n EdOpsCode TEXT null,\n EdOpsName TEXT null,\n EILCode TEXT null,\n EILName TEXT null,\n GSoffered TEXT null,\n GSserved TEXT null,\n Virtual TEXT null,\n Magnet INTEGER null,\n Latitude REAL null,\n Longitude REAL null,\n AdmFName1 TEXT null,\n AdmLName1 TEXT null,\n AdmEmail1 TEXT null,\n AdmFName2 TEXT null,\n AdmLName2 TEXT null,\n AdmEmail2 TEXT null,\n AdmFName3 TEXT null,\n AdmLName3 TEXT null,\n AdmEmail3 TEXT null,\n LastUpdate DATE not null\n);\n\n-- External Knowledge: Eligible free rate for K-12 = `Free Meal Count (K-12)` / `Enrollment (K-12)`\n\n-- Using valid SQLite and understanding External Knowledge, answer the following questions for the tables provided above.\n\n-- Using valid SQLite, answer the following questions for the tables provided above.\nQuestion: What is the highest eligible free rate for K-12 students in the schools in Alameda County?\n\n\nPlease write the SQL in one line without line breaks.And write a new python file named 0.py to call the SQL you wrote.You need to follow the code template below:\n\n\n import sqlite3\n def execute_sql(db_path, sql):\n with sqlite3.connect(db_path) as conn:\n cursor = conn.cursor()\n cursor.execute(sql)\n result = cursor.fetchall()\n return result\n\n if __name__ == '__main__':\n sql = 
\"\" # filling your SQL here\n db_path = \"california_schools/california_schools.sqlite\"\n print(db_path)\n result = execute_sql(db_path, sql)\n print(result)\n \n\nEnvironment has been set up for you to start working.You may assume all necessary tools are installed.\n\nIMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\nYou SHOULD INCLUDE PROPER INDENTATION in your edit commands.\nWhen you think you have fixed the issue through code changes, please finish the interaction using the "finish" tool.\n",
6 changes: 3 additions & 3 deletions evaluation/benchmarks/bird/scripts/run_infer.sh
@@ -20,18 +20,18 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 5 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION" \
--eval-note $OPENHANDS_VERSION" \

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
2 changes: 1 addition & 1 deletion evaluation/benchmarks/browsing_delegation/README.md
@@ -7,7 +7,7 @@ If so, the browsing performance upper-bound of CodeActAgent will be the performa

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Run Inference

@@ -20,13 +20,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"

COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
--agent-cls $AGENT \
5 changes: 2 additions & 3 deletions evaluation/benchmarks/commit0_bench/README.md
@@ -4,19 +4,18 @@ This folder contains the evaluation harness that we built on top of the original

The evaluation consists of three steps:

1. Environment setup: [install python environment](../README.md#development-environment), [configure LLM config](../README.md#configure-openhands-and-your-llm).
1. Environment setup: [install python environment](../../README.md#development-environment), [configure LLM config](../../README.md#configure-openhands-and-your-llm).
2. [Run Evaluation](#run-inference-on-commit0-instances): Generate an edit patch for each Commit0 Repo, and get the evaluation results

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## OpenHands Commit0 Instance-level Docker Support

OpenHands supports using the Commit0 Docker for **[inference](#run-inference-on-commit0-instances)**.
This is now the default behavior.


## Run Inference on Commit0 Instances

Make sure your Docker daemon is running, and that you have ample disk space (at least 200-500 GB, depending on the Commit0 set you are running on) for the [instance-level docker image](#openhands-commit0-instance-level-docker-support).
6 changes: 3 additions & 3 deletions evaluation/benchmarks/commit0_bench/scripts/run_infer.sh
@@ -61,10 +61,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "HF SPLIT: $SPLIT"
@@ -75,7 +75,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
# if not using Hint, add -no-hint to the eval note
if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint"
6 changes: 3 additions & 3 deletions evaluation/benchmarks/discoverybench/scripts/run_infer.sh
@@ -23,10 +23,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
@@ -35,7 +35,7 @@ COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
4 changes: 3 additions & 1 deletion evaluation/benchmarks/gaia/README.md
@@ -4,9 +4,10 @@ This folder contains evaluation harness for evaluating agents on the [GAIA bench

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Run the evaluation

We are using the GAIA dataset hosted on [Hugging Face](https://huggingface.co/datasets/gaia-benchmark/GAIA).
Please accept the terms and make sure to have logged in on your computer by `huggingface-cli login` before running the evaluation.

@@ -41,6 +42,7 @@ For example,
## Get score

Then you can get stats by running the following command:

```bash
python ./evaluation/benchmarks/gaia/get_score.py \
--file <path_to/output.json>
8 changes: 4 additions & 4 deletions evaluation/benchmarks/gaia/scripts/run_infer.sh
@@ -21,17 +21,17 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

if [ -z "$LEVELS" ]; then
LEVELS="2023_level1"
echo "Levels not specified, use default $LEVELS"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "LEVELS: $LEVELS"

@@ -42,7 +42,7 @@ COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
--level $LEVELS \
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
2 changes: 1 addition & 1 deletion evaluation/benchmarks/gorilla/README.md
@@ -4,7 +4,7 @@ This folder contains evaluation harness we built on top of the original [Gorilla

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Run Inference on APIBench Instances

6 changes: 3 additions & 3 deletions evaluation/benchmarks/gorilla/scripts/run_infer.sh
@@ -21,15 +21,15 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

if [ -z "$HUBS" ]; then
HUBS="hf,torch,tf"
echo "Hubs not specified, use default $HUBS"
fi

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "HUBS: $HUBS"

@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
--hubs $HUBS \
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
13 changes: 9 additions & 4 deletions evaluation/benchmarks/gpqa/README.md
@@ -3,6 +3,7 @@
Implements the evaluation of agents on the GPQA benchmark introduced in [GPQA: A Graduate-Level Google-Proof Q&A Benchmark](https://arxiv.org/abs/2311.12022).

This code implements the evaluation of agents on the GPQA Benchmark with the Open Book setting.

- The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web.
- Even experts in the corresponding domains achieve only 65% accuracy.
- State-of-the-art AI systems achieve only 39% accuracy on this challenging dataset.
@@ -11,20 +12,24 @@ This code implements the evaluation of agents on the GPQA Benchmark with Open Bo
Accurately solving such graduate-level questions requires both tool use (e.g., Python for calculations) and web search for finding related facts, since the information required for the questions might not be part of the LLM's knowledge or training data.

Further references:
- https://arxiv.org/pdf/2311.12022
- https://paperswithcode.com/dataset/gpqa
- https://github.com/idavidrein/gpqa

- <https://arxiv.org/pdf/2311.12022>
- <https://paperswithcode.com/dataset/gpqa>
- <https://github.com/idavidrein/gpqa>

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Run Inference on GPQA Benchmark

Data split options: 'gpqa_main', 'gpqa_diamond', 'gpqa_experts', 'gpqa_extended'.
From the root of the OpenHands repo, run the following command:

```bash
./evaluation/benchmarks/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass]
```

You can replace `model_config_name` with any model you set up in `config.toml`.

- `model_config_name`: The model configuration name from `config.toml` that you want to evaluate.
6 changes: 3 additions & 3 deletions evaluation/benchmarks/gpqa/scripts/run_infer.sh
@@ -27,10 +27,10 @@ if [ -z "$DATA_SPLIT" ]; then
DATA_SPLIT="gpqa_diamond"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
@@ -39,7 +39,7 @@ COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--data-split $DATA_SPLIT \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
6 changes: 2 additions & 4 deletions evaluation/benchmarks/humanevalfix/README.md
@@ -4,7 +4,7 @@ Implements evaluation of agents on HumanEvalFix from the HumanEvalPack benchmark

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Run Inference on HumanEvalFix

@@ -14,13 +14,11 @@ Please follow instruction [here](../README.md#setup) to setup your local develop

You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`.


## Examples

For each problem, OpenHands is given a set number of iterations to fix the failing code. The history field shows each iteration's attempt to correct code that fails any test case.


```
```json
{
"task_id": "Python/2",
"instruction": "Please fix the function in Python__2.py such that all test cases pass.\nEnvironment has been set up for you to start working. You may assume all necessary tools are installed.\n\n# Problem Statement\ndef truncate_number(number: float) -> float:\n return number % 1.0 + 1.0\n\n\n\n\n\n\ndef check(truncate_number):\n assert truncate_number(3.5) == 0.5\n assert abs(truncate_number(1.33) - 0.33) < 1e-6\n assert abs(truncate_number(123.456) - 0.456) < 1e-6\n\ncheck(truncate_number)\n\nIMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\nYou should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\nYou SHOULD INCLUDE PROPER INDENTATION in your edit commands.\nWhen you think you have fixed the issue through code changes, please finish the interaction using the "finish" tool.\n",
6 changes: 3 additions & 3 deletions evaluation/benchmarks/humanevalfix/scripts/run_infer.sh
@@ -58,18 +58,18 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
3 changes: 2 additions & 1 deletion evaluation/benchmarks/logic_reasoning/README.md
@@ -4,9 +4,10 @@ This folder contains evaluation harness for evaluating agents on the logic reaso

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Run Inference on logic_reasoning

The following code will run inference on the first example of the ProofWriter dataset,

```bash
6 changes: 3 additions & 3 deletions evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh
@@ -28,10 +28,10 @@ if [ -z "$DATASET" ]; then
DATASET="ProofWriter"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
--dataset $DATASET \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
3 changes: 1 addition & 2 deletions evaluation/benchmarks/miniwob/README.md
@@ -4,7 +4,7 @@ This folder contains evaluation for [MiniWoB++](https://miniwob.farama.org/) ben

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Test if your environment works

@@ -42,7 +42,6 @@ poetry run python evaluation/benchmarks/miniwob/get_success_rate.py evaluation/e

You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).


## BrowsingAgent V1.0 result

Tested on BrowsingAgent V1.0
6 changes: 3 additions & 3 deletions evaluation/benchmarks/miniwob/scripts/run_infer.sh
Original file line number Diff line number Diff line change
@@ -25,13 +25,13 @@ if [ -z "$AGENT" ]; then
AGENT="BrowsingAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}"

COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
--agent-cls $AGENT \
4 changes: 2 additions & 2 deletions evaluation/benchmarks/mint/scripts/run_infer.sh
Original file line number Diff line number Diff line change
@@ -18,10 +18,10 @@ checkout_eval_branch
# Only 'CodeActAgent' is supported for MINT now
AGENT="CodeActAgent"

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"

export PYTHONPATH=$(pwd)

2 changes: 1 addition & 1 deletion evaluation/benchmarks/ml_bench/README.md
Original file line number Diff line number Diff line change
@@ -12,7 +12,7 @@ For more details on the ML-Bench task and dataset, please refer to the paper: [M

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Run Inference on ML-Bench

6 changes: 3 additions & 3 deletions evaluation/benchmarks/ml_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
@@ -26,18 +26,18 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
5 changes: 3 additions & 2 deletions evaluation/benchmarks/scienceagentbench/README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# ScienceAgentBench Evaluation with OpenHands

This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: https://arxiv.org/abs/2410.05080).
This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: <https://arxiv.org/abs/2410.05080>).

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Setup ScienceAgentBench

@@ -45,6 +45,7 @@ After the inference is completed, you may use the following command to extract n
```bash
python post_proc.py [log_fname]
```

- `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent.

Output will be written to, e.g., `evaluation/.../output.converted.jsonl`.
6 changes: 3 additions & 3 deletions evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
@@ -26,10 +26,10 @@ if [ -z "$USE_KNOWLEDGE" ]; then
USE_KNOWLEDGE=false
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
@@ -38,7 +38,7 @@ COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py
--use_knowledge $USE_KNOWLEDGE \
--max-iterations 30 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION" \
--eval-note $OPENHANDS_VERSION" \

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
12 changes: 7 additions & 5 deletions evaluation/benchmarks/swe_bench/README.md
Original file line number Diff line number Diff line change
@@ -6,20 +6,19 @@ This folder contains the evaluation harness that we built on top of the original

The evaluation consists of three steps:

1. Environment setup: [install the Python environment](../README.md#development-environment), [configure the LLM](../README.md#configure-openhands-and-your-llm), and [pull the Docker image](#openhands-swe-bench-instance-level-docker-support).
1. Environment setup: [install the Python environment](../../README.md#development-environment), [configure the LLM](../../README.md#configure-openhands-and-your-llm), and [pull the Docker image](#openhands-swe-bench-instance-level-docker-support).
2. [Run inference](#run-inference-on-swe-bench-instances): Generate an edit patch for each GitHub issue
3. [Evaluate patches using SWE-Bench docker](#evaluate-generated-patches)

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## OpenHands SWE-Bench Instance-level Docker Support

OpenHands now supports using the [official evaluation docker](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md) for both **[inference](#run-inference-on-swe-bench-instances) and [evaluation](#evaluate-generated-patches)**.
This is now the default behavior.


## Run Inference on SWE-Bench Instances

Make sure your Docker daemon is running, and that you have ample disk space (at least 200-500GB, depending on the SWE-Bench set you are running) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).
@@ -52,7 +51,8 @@ default, it is set to 1.
- `dataset_split`, the split of the huggingface dataset, e.g. `test` or `dev`. Defaults to `test`.

There are also two optional environment variables you can set.
```

```bash
export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Defaults to false. Ignore this if you are not sure.
export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images. Defaults to true
```
@@ -127,6 +127,7 @@ With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patc
**This evaluation is performed using the official dockerized evaluation announced [here](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md).**

> If you want to evaluate existing results, you should first run this to clone existing outputs
>
>```bash
>git clone https://huggingface.co/spaces/OpenHands/evaluation evaluation/evaluation_outputs
>```
@@ -143,6 +144,7 @@ Then you can run the following:
```
The script now accepts optional arguments:
- `instance_id`: Specify a single instance to evaluate (optional)
- `dataset_name`: The name of the dataset to use (default: `"princeton-nlp/SWE-bench_Lite"`)
- `split`: The split of the dataset to use (default: `"test"`)
@@ -179,7 +181,6 @@ To clean-up all existing runtimes that you've already started, run:
ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/benchmarks/swe_bench/scripts/cleanup_remote_runtime.sh
```
## Visualize Results
First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own run results from OpenHands into the `outputs` directory of the cloned repo.
@@ -189,6 +190,7 @@ git clone https://huggingface.co/spaces/OpenHands/evaluation
```
**(optional) set up the streamlit environment with conda**:
```bash
cd evaluation
conda create -n streamlit python=3.10
217 changes: 186 additions & 31 deletions evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,46 @@
#!/usr/bin/env python3
import argparse
import glob
import json
import os
from collections import Counter

import pandas as pd

from openhands.events.serialization import event_from_dict
from openhands.events.utils import get_pairs_from_events

ERROR_KEYWORDS = [
'Agent encountered an error while processing the last action',
'APIError',
'Action execution failed',
'litellm.Timeout: APITimeoutError',
]

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('output_file', type=str, help='The file to summarize')
args = parser.parse_args()

with open(args.output_file, 'r') as file:
def process_file(file_path):
with open(file_path, 'r') as file:
lines = file.readlines()

num_lines = len(lines)
num_error_lines = 0
num_agent_stuck_in_loop = 0

num_resolved = 0
num_empty_patch = 0

num_unfinished_runs = 0
error_counter = Counter()

main_agent_cost = []
editor_cost = []
num_turns = []

for line in lines:
_d = json.loads(line)

if 'metrics' not in _d or _d['metrics'] is None:
# this is a failed run
num_unfinished_runs += 1
continue

# Cost
costs = _d['metrics'].get('costs', [])
_cur_main_agent_cost = 0
@@ -89,30 +94,180 @@
num_error_lines += 1
break

# print the error counter (with percentage)
print(
f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
)
print(
f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
)
print(
f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
return {
'file_path': file_path,
'total_instances': num_lines,
'resolved': {
'count': num_resolved,
'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
},
'empty_patches': {
'count': num_empty_patch,
'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
},
'unfinished_runs': {
'count': num_unfinished_runs,
'percentage': (num_unfinished_runs / num_lines * 100)
if num_lines > 0
else 0,
},
'errors': {
'total': num_error_lines,
'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
'stuck_in_loop': {
'count': num_agent_stuck_in_loop,
'percentage': (num_agent_stuck_in_loop / num_lines * 100)
if num_lines > 0
else 0,
},
'breakdown': {
str(error): {
'count': count,
'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
}
for error, count in error_counter.items()
},
},
'statistics': {
'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
'costs': {
'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
if num_lines > 0
else 0,
},
},
}


def aggregate_directory(input_path) -> pd.DataFrame:
# Process all output.jsonl files in subdirectories
pattern = os.path.join(input_path, '**/output.jsonl')
files = glob.glob(pattern, recursive=True)
print(f'Processing {len(files)} files from directory {input_path}')

# Process each file silently and collect results
results = []
for file_path in files:
try:
result = process_file(file_path)
results.append(result)
except Exception as e:
print(f'Error processing {file_path}: {str(e)}')
import traceback

traceback.print_exc()
continue

# Convert results to pandas DataFrame and sort by resolve rate
df = pd.DataFrame(results)

# Extract directory name from file path
df['directory'] = df['file_path'].apply(
lambda x: os.path.basename(os.path.dirname(x))
)
print(
f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'

df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])

df = df.sort_values('resolve_rate', ascending=False)

return df


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'input_path', type=str, help='The file or directory to summarize'
)
assert len(num_turns) == num_lines
assert len(main_agent_cost) == num_lines
assert len(editor_cost) == num_lines
print('## Statistics')
print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
print(
f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
parser.add_argument(
'--output',
type=str,
help='Output JSONL file for results',
default='summary_results.jsonl',
)
args = parser.parse_args()

if os.path.isdir(args.input_path):
df = aggregate_directory(args.input_path)
# Create the summary string
columns = [
'directory',
'resolve_rate',
'empty_patch_rate',
'unfinished_rate',
'error_rate',
'avg_turns',
'avg_cost',
'total_instances',
]
summary_str = df[columns].to_string(
float_format=lambda x: '{:.2f}'.format(x),
formatters={
'directory': lambda x: x[:90]
}, # Truncate directory names to 90 chars
index=False,
)

# Print to console
print('\nResults summary (sorted by resolve rate):')
print(summary_str)

# Save to text file
txt_output = args.output.rsplit('.', 1)[0] + '.txt'
with open(txt_output, 'w') as f:
f.write('Results summary (sorted by resolve rate):\n')
f.write(summary_str)

# Save
df.to_json(args.output, lines=True, orient='records')
df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
else:
# Process single file with detailed output
results = []
try:
result = process_file(args.input_path)
results.append(result)

# Print detailed results for single file
print(f'\nResults for {args.input_path}:')
print(
f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)"
)
print(
f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
)
print(
f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
)
print(
f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
)
print(
f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
)
print('## Statistics')
print(
f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
)
print(
f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
)
print(
f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
)
print(
f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
)

print('## Detailed error breakdown:')
for error, data in result['errors']['breakdown'].items():
print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")

print('## Detailed error breakdown:')
for error, count in error_counter.items():
print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
except Exception as e:
print(f'Error processing {args.input_path}: {str(e)}')
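For reference, a usage sketch of the updated summarizer, derived from the argparse interface above (the run directory is an illustrative placeholder):

```bash
# Detailed summary of a single run
poetry run python evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py \
  evaluation/evaluation_outputs/<run-dir>/output.jsonl

# Aggregate every output.jsonl found under a directory; by default this writes
# summary_results.jsonl plus derived .csv and .txt summaries
poetry run python evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py \
  evaluation/evaluation_outputs/ --output summary_results.jsonl
```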
6 changes: 3 additions & 3 deletions evaluation/benchmarks/swe_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
@@ -55,10 +55,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "SPLIT: $SPLIT"
@@ -68,7 +68,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
# if not using Hint, add -no-hint to the eval note
if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint"
2 changes: 1 addition & 1 deletion evaluation/benchmarks/toolqa/README.md
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@ This folder contains an evaluation harness we built on top of the original [Tool

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Run Inference on ToolQA Instances

6 changes: 3 additions & 3 deletions evaluation/benchmarks/toolqa/scripts/run_infer.sh
Original file line number Diff line number Diff line change
@@ -38,10 +38,10 @@ if [ -z "$WOLFRAM_APPID" ]; then
echo "WOLFRAM_APPID not specified"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "HARDNESS: $HARDNESS"
@@ -56,7 +56,7 @@ COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
--wolfram_alpha_appid $WOLFRAM_APPID\
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
2 changes: 1 addition & 1 deletion evaluation/benchmarks/webarena/README.md
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@ This folder contains evaluation for [WebArena](https://github.com/web-arena-x/we

## Setup Environment and LLM Configuration

Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Setup WebArena Environment

6 changes: 3 additions & 3 deletions evaluation/benchmarks/webarena/scripts/run_infer.sh
Original file line number Diff line number Diff line change
@@ -27,13 +27,13 @@ if [ -z "$AGENT" ]; then
AGENT="BrowsingAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"

COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
--agent-cls $AGENT \
6 changes: 3 additions & 3 deletions evaluation/integration_tests/scripts/run_infer.sh
Original file line number Diff line number Diff line change
@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi

get_agent_version
get_openhands_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE=$AGENT_VERSION
EVAL_NOTE=$OPENHANDS_VERSION

# Default to NOT use unit tests.
if [ -z "$USE_UNIT_TESTS" ]; then
4 changes: 2 additions & 2 deletions evaluation/utils/version_control.sh
Original file line number Diff line number Diff line change
@@ -39,8 +39,8 @@ checkout_original_branch() {
git checkout $current_branch
}

get_agent_version() {
get_openhands_version() {
# IMPORTANT: Because prompts change fairly often in the rapidly evolving codebase of OpenHands
# We need to track the version of OpenHands in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
OPENHANDS_VERSION=v$(poetry run python -c "from openhands import get_version; print(get_version())")
}
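A minimal sketch of how the renamed helper is consumed by the run scripts above, assuming they source `evaluation/utils/version_control.sh` before calling it:

```bash
source evaluation/utils/version_control.sh

get_openhands_version                  # sets OPENHANDS_VERSION, e.g. "v0.15.0"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"         # used to tag evaluation output folders
```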
1 change: 1 addition & 0 deletions frontend/.gitignore
Original file line number Diff line number Diff line change
@@ -7,3 +7,4 @@ node_modules/
/playwright-report/
/blob-report/
/playwright/.cache/
.react-router/
15 changes: 14 additions & 1 deletion frontend/README.md
Original file line number Diff line number Diff line change
@@ -9,6 +9,7 @@ This is the frontend of the OpenHands project. It is a React application that pr
- Remix SPA Mode (React + Vite + React Router)
- TypeScript
- Redux
- TanStack Query
- Tailwind CSS
- i18next
- React Testing Library
@@ -85,7 +86,7 @@ frontend
├── src
│ ├── api # API calls
│ ├── assets
│ ├── components # Reusable components
│ ├── components
│ ├── context # Local state management
│ ├── hooks # Custom hooks
│ ├── i18n # Internationalization
@@ -99,6 +100,18 @@ frontend
└── .env.sample # Sample environment variables
```

#### Components

Components are organized into folders based on their **domain**, **feature**, or **shared functionality**.

```sh
components
├── features # Domain-specific components
├── layout
├── modals
└── ui # Shared UI components
```

### Features

- Real-time updates with WebSockets
3 changes: 2 additions & 1 deletion frontend/__tests__/components/browser.test.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import { screen } from "@testing-library/react";
import { describe, it, expect } from "vitest";
import { renderWithProviders } from "../../test-utils";
import BrowserPanel from "#/components/browser";
import { BrowserPanel } from "#/components/features/browser/browser";


describe("Browser", () => {
it("renders a message if no screenshotSrc is provided", () => {
2 changes: 1 addition & 1 deletion frontend/__tests__/components/chat-message.test.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { describe, it, expect, test } from "vitest";
import { ChatMessage } from "#/components/chat-message";
import { ChatMessage } from "#/components/features/chat/chat-message";

describe("ChatMessage", () => {
it("should render a user message", () => {
2 changes: 1 addition & 1 deletion frontend/__tests__/components/chat/chat-input.test.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import userEvent from "@testing-library/user-event";
import { fireEvent, render, screen } from "@testing-library/react";
import { describe, afterEach, vi, it, expect } from "vitest";
import { ChatInput } from "#/components/chat-input";
import { ChatInput } from "#/components/features/chat/chat-input";

describe("ChatInput", () => {
const onSubmitMock = vi.fn();
40 changes: 28 additions & 12 deletions frontend/__tests__/components/chat/chat-interface.test.tsx
Original file line number Diff line number Diff line change
@@ -6,10 +6,10 @@ import { addUserMessage } from "#/state/chat-slice";
import { SUGGESTIONS } from "#/utils/suggestions";
import * as ChatSlice from "#/state/chat-slice";
import { WsClientProviderStatus } from "#/context/ws-client-provider";
import { ChatInterface } from "#/routes/_oh.app/chat-interface";
import { ChatInterface } from "#/components/features/chat/chat-interface";

// eslint-disable-next-line @typescript-eslint/no-unused-vars
const renderChatInterface = (messages: (Message | ErrorMessage)[]) =>
const renderChatInterface = (messages: Message[]) =>
renderWithProviders(<ChatInterface />);

describe("Empty state", () => {
@@ -26,8 +26,8 @@ describe("Empty state", () => {
}));

beforeAll(() => {
vi.mock("@remix-run/react", async (importActual) => ({
...(await importActual<typeof import("@remix-run/react")>()),
vi.mock("react-router", async (importActual) => ({
...(await importActual<typeof import("react-router")>()),
useRouteLoaderData: vi.fn(() => ({})),
}));

@@ -56,6 +56,7 @@ describe("Empty state", () => {
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
}),
);
});
@@ -172,12 +173,14 @@ describe.skip("ChatInterface", () => {
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
{
sender: "assistant",
content: "Hi",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
renderChatInterface(messages);
@@ -211,6 +214,7 @@ describe.skip("ChatInterface", () => {
content: "Here are some images",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
const { rerender } = renderChatInterface(messages);
@@ -223,6 +227,7 @@ describe.skip("ChatInterface", () => {
content: "Here are some images",
imageUrls: ["image1", "image2"],
timestamp: new Date().toISOString(),
pending: true,
},
];

@@ -244,12 +249,14 @@ describe.skip("ChatInterface", () => {
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
{
sender: "user",
content: "Hi",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
const { rerender } = renderChatInterface(messages);
@@ -262,6 +269,7 @@ describe.skip("ChatInterface", () => {
content: "How can I help you?",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
});

rerender(<ChatInterface />);
@@ -270,17 +278,19 @@ describe.skip("ChatInterface", () => {
});

it("should render inline errors", () => {
const messages: (Message | ErrorMessage)[] = [
const messages: Message[] = [
{
sender: "assistant",
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
{
error: true,
id: "",
message: "Something went wrong",
type: "error",
content: "Something went wrong",
sender: "assistant",
timestamp: new Date().toISOString(),
},
];
renderChatInterface(messages);
@@ -290,8 +300,8 @@ describe.skip("ChatInterface", () => {
});

it("should render both GitHub buttons initially when ghToken is available", () => {
vi.mock("@remix-run/react", async (importActual) => ({
...(await importActual<typeof import("@remix-run/react")>()),
vi.mock("react-router", async (importActual) => ({
...(await importActual<typeof import("react-router")>()),
useRouteLoaderData: vi.fn(() => ({ ghToken: "test-token" })),
}));

@@ -301,6 +311,7 @@ describe.skip("ChatInterface", () => {
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
renderChatInterface(messages);
@@ -315,8 +326,8 @@ describe.skip("ChatInterface", () => {
});

it("should render only 'Push changes to PR' button after PR is created", async () => {
vi.mock("@remix-run/react", async (importActual) => ({
...(await importActual<typeof import("@remix-run/react")>()),
vi.mock("react-router", async (importActual) => ({
...(await importActual<typeof import("react-router")>()),
useRouteLoaderData: vi.fn(() => ({ ghToken: "test-token" })),
}));

@@ -326,6 +337,7 @@ describe.skip("ChatInterface", () => {
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
const { rerender } = renderChatInterface(messages);
@@ -358,18 +370,21 @@ describe.skip("ChatInterface", () => {
content: "Hello",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
{
sender: "user",
content: "Hi",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
{
sender: "assistant",
content: "How can I help you?",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
},
];
const { rerender } = renderChatInterface(messages);
@@ -380,6 +395,7 @@ describe.skip("ChatInterface", () => {
content: "I need help",
imageUrls: [],
timestamp: new Date().toISOString(),
pending: true,
});

rerender(<ChatInterface />);
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, test, vi } from "vitest";
import { AccountSettingsContextMenu } from "#/components/context-menu/account-settings-context-menu";
import { AccountSettingsContextMenu } from "#/components/features/context-menu/account-settings-context-menu";

describe("AccountSettingsContextMenu", () => {
const user = userEvent.setup();
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { describe, it, expect, vi } from "vitest";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { ContextMenuListItem } from "#/components/context-menu/context-menu-list-item";
import { ContextMenuListItem } from "#/components/features/context-menu/context-menu-list-item";

describe("ContextMenuListItem", () => {
it("should render the component with the children", () => {
45 changes: 45 additions & 0 deletions frontend/__tests__/components/features/waitlist-modal.test.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import { render, screen } from "@testing-library/react";
import { it, describe, expect, vi } from "vitest";
import userEvent from "@testing-library/user-event";
import { WaitlistModal } from "#/components/features/waitlist/waitlist-modal";
import * as CaptureConsent from "#/utils/handle-capture-consent";

describe("WaitlistModal", () => {
it("should render a tos checkbox that is unchecked by default", () => {
render(<WaitlistModal ghToken={null} githubAuthUrl={null} />);
const checkbox = screen.getByRole("checkbox");

expect(checkbox).not.toBeChecked();
});

it("should only enable the GitHub button if the tos checkbox is checked", async () => {
const user = userEvent.setup();
render(<WaitlistModal ghToken={null} githubAuthUrl={null} />);
const checkbox = screen.getByRole("checkbox");
const button = screen.getByRole("button", { name: "Connect to GitHub" });

expect(button).toBeDisabled();

await user.click(checkbox);

expect(button).not.toBeDisabled();
});

it("should set user analytics consent to true when the user checks the tos checkbox", async () => {
const handleCaptureConsentSpy = vi.spyOn(
CaptureConsent,
"handleCaptureConsent",
);

const user = userEvent.setup();
render(<WaitlistModal ghToken={null} githubAuthUrl="mock-url" />);

const checkbox = screen.getByRole("checkbox");
await user.click(checkbox);

const button = screen.getByRole("button", { name: "Connect to GitHub" });
await user.click(button);

expect(handleCaptureConsentSpy).toHaveBeenCalledWith(true);
});
});
2 changes: 1 addition & 1 deletion frontend/__tests__/components/feedback-actions.test.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { render, screen, within } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { FeedbackActions } from "#/components/feedback-actions";
import { FeedbackActions } from "#/components/features/feedback/feedback-actions";

describe("FeedbackActions", () => {
const user = userEvent.setup();
4 changes: 2 additions & 2 deletions frontend/__tests__/components/feedback-form.test.tsx
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { render, screen } from "@testing-library/react";
import { screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { renderWithProviders } from "test-utils";
import { FeedbackForm } from "#/components/feedback-form";
import { FeedbackForm } from "#/components/features/feedback/feedback-form";

describe("FeedbackForm", () => {
const user = userEvent.setup();
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { screen } from "@testing-library/react";
import { renderWithProviders } from "test-utils";
import { describe, afterEach, vi, it, expect } from "vitest";
import ExplorerTree from "#/components/file-explorer/explorer-tree";
import { ExplorerTree } from "#/components/features/file-explorer/explorer-tree";

const FILES = ["file-1-1.ts", "folder-1-2"];

Original file line number Diff line number Diff line change
@@ -4,8 +4,8 @@ import { renderWithProviders } from "test-utils";
import { describe, it, expect, vi, Mock, afterEach } from "vitest";
import toast from "#/utils/toast";
import AgentState from "#/types/agent-state";
import { FileExplorer } from "#/routes/_oh.app._index/file-explorer/file-explorer";
import OpenHands from "#/api/open-hands";
import { FileExplorer } from "#/components/features/file-explorer/file-explorer";

const toastSpy = vi.spyOn(toast, "error");
const uploadFilesSpy = vi.spyOn(OpenHands, "uploadFiles");
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@ import { screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { renderWithProviders } from "test-utils";
import { vi, describe, afterEach, it, expect } from "vitest";
import TreeNode from "#/components/file-explorer/tree-node";
import TreeNode from "#/components/features/file-explorer/tree-node";
import OpenHands from "#/api/open-hands";

const getFileSpy = vi.spyOn(OpenHands, "getFile");
2 changes: 1 addition & 1 deletion frontend/__tests__/components/image-preview.test.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { ImagePreview } from "#/components/features/images/image-preview";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { describe, expect, it, vi } from "vitest";
import { ImagePreview } from "#/components/image-preview";

describe("ImagePreview", () => {
it("should render an image", () => {
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { render, screen, within } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
import { InteractiveChatBox } from "#/components/interactive-chat-box";
import { InteractiveChatBox } from "#/components/features/chat/interactive-chat-box";

describe("InteractiveChatBox", () => {
const onSubmitMock = vi.fn();
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { render, screen, act } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { describe, it, vi, expect } from "vitest";
import BaseModal from "#/components/modals/base-modal/base-modal";
import { BaseModal } from "#/components/shared/modals/base-modal/base-modal";

describe("BaseModal", () => {
it("should render if the modal is open", () => {
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { describe, it, expect } from "vitest";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { ModelSelector } from "#/components/modals/settings/model-selector";
import { ModelSelector } from "#/components/shared/modals/settings/model-selector";

describe("ModelSelector", () => {
const models = {
2 changes: 1 addition & 1 deletion frontend/__tests__/components/suggestion-item.test.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { SuggestionItem } from "#/components/suggestion-item";
import { SuggestionItem } from "#/components/features/suggestions/suggestion-item";

describe("SuggestionItem", () => {
const suggestionItem = { label: "suggestion1", value: "a long text value" };
2 changes: 1 addition & 1 deletion frontend/__tests__/components/suggestions.test.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { Suggestions } from "#/components/suggestions";
import { Suggestions } from "#/components/features/suggestions/suggestions";

describe("Suggestions", () => {
const firstSuggestion = {
2 changes: 1 addition & 1 deletion frontend/__tests__/components/terminal/terminal.test.tsx
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@ import { act, screen } from "@testing-library/react";
import { renderWithProviders } from "test-utils";
import { vi, describe, afterEach, it, expect } from "vitest";
import { Command, appendInput, appendOutput } from "#/state/command-slice";
import Terminal from "#/components/terminal/terminal";
import Terminal from "#/components/features/terminal/terminal";

global.ResizeObserver = vi.fn().mockImplementation(() => ({
observe: vi.fn(),
2 changes: 1 addition & 1 deletion frontend/__tests__/components/upload-image-input.test.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { UploadImageInput } from "#/components/upload-image-input";
import { UploadImageInput } from "#/components/features/images/upload-image-input";

describe("UploadImageInput", () => {
const user = userEvent.setup();
2 changes: 1 addition & 1 deletion frontend/__tests__/components/user-actions.test.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import { describe, expect, it, test, vi, afterEach } from "vitest";
import userEvent from "@testing-library/user-event";
import { UserActions } from "#/components/user-actions";
import { UserActions } from "#/components/features/sidebar/user-actions";

describe("UserActions", () => {
const user = userEvent.setup();
2 changes: 1 addition & 1 deletion frontend/__tests__/components/user-avatar.test.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { afterEach, describe, expect, it, vi } from "vitest";
import { UserAvatar } from "#/components/user-avatar";
import { UserAvatar } from "#/components/features/sidebar/user-avatar";

describe("UserAvatar", () => {
const onClickMock = vi.fn();
2 changes: 1 addition & 1 deletion frontend/__tests__/hooks/use-rate.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { act, renderHook } from "@testing-library/react";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { useRate } from "#/utils/use-rate";
import { useRate } from "#/hooks/use-rate";

describe("useRate", () => {
beforeEach(() => {
60 changes: 43 additions & 17 deletions frontend/__tests__/routes/_oh.test.tsx
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
import { createRemixStub } from "@remix-run/testing";
import { createRoutesStub } from "react-router";
import { screen, waitFor, within } from "@testing-library/react";
import { renderWithProviders } from "test-utils";
import userEvent from "@testing-library/user-event";
import MainApp from "#/routes/_oh/route";
import * as CaptureConsent from "#/utils/handle-capture-consent";
import i18n from "#/i18n";
import * as CaptureConsent from "#/utils/handle-capture-consent";
import OpenHands from "#/api/open-hands";

describe("frontend/routes/_oh", () => {
const RemixStub = createRemixStub([{ Component: MainApp, path: "/" }]);
const RouteStub = createRoutesStub([{ Component: MainApp, path: "/" }]);

const { userIsAuthenticatedMock, settingsAreUpToDateMock } = vi.hoisted(
() => ({
@@ -34,40 +35,47 @@ describe("frontend/routes/_oh", () => {
});

it("should render", async () => {
renderWithProviders(<RemixStub />);
renderWithProviders(<RouteStub />);
await screen.findByTestId("root-layout");
});

it("should render the AI config modal if the user is authed", async () => {
// Our mock return value is true by default
renderWithProviders(<RemixStub />);
renderWithProviders(<RouteStub />);
await screen.findByTestId("ai-config-modal");
});

it("should render the AI config modal if settings are not up-to-date", async () => {
settingsAreUpToDateMock.mockReturnValue(false);
renderWithProviders(<RemixStub />);
renderWithProviders(<RouteStub />);

await screen.findByTestId("ai-config-modal");
});

it("should not render the AI config modal if the settings are up-to-date", async () => {
settingsAreUpToDateMock.mockReturnValue(true);
renderWithProviders(<RemixStub />);
renderWithProviders(<RouteStub />);

await waitFor(() => {
expect(screen.queryByTestId("ai-config-modal")).not.toBeInTheDocument();
});
});

it("should capture the user's consent", async () => {
it("should render and capture the user's consent if oss mode", async () => {
const user = userEvent.setup();
const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
const handleCaptureConsentSpy = vi.spyOn(
CaptureConsent,
"handleCaptureConsent",
);

renderWithProviders(<RemixStub />);
getConfigSpy.mockResolvedValue({
APP_MODE: "oss",
GITHUB_CLIENT_ID: "test-id",
POSTHOG_CLIENT_KEY: "test-key",
});

renderWithProviders(<RouteStub />);

// The user has not consented to tracking
const consentForm = await screen.findByTestId("user-capture-consent-form");
@@ -87,9 +95,26 @@ describe("frontend/routes/_oh", () => {
).not.toBeInTheDocument();
});

it("should not render the user consent form if saas mode", async () => {
const getConfigSpy = vi.spyOn(OpenHands, "getConfig");
getConfigSpy.mockResolvedValue({
APP_MODE: "saas",
GITHUB_CLIENT_ID: "test-id",
POSTHOG_CLIENT_KEY: "test-key",
});

renderWithProviders(<RouteStub />);

await waitFor(() => {
expect(
screen.queryByTestId("user-capture-consent-form"),
).not.toBeInTheDocument();
});
});

it("should not render the user consent form if the user has already made a decision", async () => {
localStorage.setItem("analytics-consent", "true");
renderWithProviders(<RemixStub />);
renderWithProviders(<RouteStub />);

await waitFor(() => {
expect(
@@ -98,14 +123,15 @@ describe("frontend/routes/_oh", () => {
});
});

it("should render a new project button if a token is set", async () => {
// TODO: Likely failing due to how tokens are now handled in context. Move to e2e tests
it.skip("should render a new project button if a token is set", async () => {
localStorage.setItem("token", "test-token");
const { rerender } = renderWithProviders(<RemixStub />);
const { rerender } = renderWithProviders(<RouteStub />);

await screen.findByTestId("new-project-button");

localStorage.removeItem("token");
rerender(<RemixStub />);
rerender(<RouteStub />);

await waitFor(() => {
expect(
@@ -117,17 +143,17 @@ describe("frontend/routes/_oh", () => {
// TODO: Move to e2e tests
it.skip("should update the i18n language when the language settings change", async () => {
const changeLanguageSpy = vi.spyOn(i18n, "changeLanguage");
const { rerender } = renderWithProviders(<RemixStub />);
const { rerender } = renderWithProviders(<RouteStub />);

// The default language is English
expect(changeLanguageSpy).toHaveBeenCalledWith("en");

localStorage.setItem("LANGUAGE", "es");

rerender(<RemixStub />);
rerender(<RouteStub />);
expect(changeLanguageSpy).toHaveBeenCalledWith("es");

rerender(<RemixStub />);
rerender(<RouteStub />);
// The language has not changed, so the spy should not have been called again
expect(changeLanguageSpy).toHaveBeenCalledTimes(2);
});
@@ -138,7 +164,7 @@ describe("frontend/routes/_oh", () => {
localStorage.setItem("ghToken", "test-token");

// const logoutCleanupSpy = vi.spyOn(LogoutCleanup, "logoutCleanup");
renderWithProviders(<RemixStub />);
renderWithProviders(<RouteStub />);

const userActions = await screen.findByTestId("user-actions");
const userAvatar = within(userActions).getByTestId("user-avatar");
19,249 changes: 5,194 additions & 14,055 deletions frontend/package-lock.json


24 changes: 12 additions & 12 deletions frontend/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "openhands-frontend",
"version": "0.14.3",
"version": "0.15.0",
"private": true,
"type": "module",
"engines": {
@@ -9,15 +9,15 @@
"dependencies": {
"@monaco-editor/react": "^4.6.0",
"@nextui-org/react": "^2.4.8",
"@react-router/node": "^7.0.1",
"@react-router/serve": "^7.0.1",
"@react-types/shared": "^3.25.0",
"@reduxjs/toolkit": "^2.3.0",
"@remix-run/node": "^2.11.2",
"@remix-run/react": "^2.11.2",
"@remix-run/serve": "^2.11.2",
"@tanstack/react-query": "^5.60.5",
"@vitejs/plugin-react": "^4.3.2",
"@xterm/addon-fit": "^0.10.0",
"@xterm/xterm": "^5.4.0",
"axios": "^1.7.7",
"clsx": "^2.1.1",
"eslint-config-airbnb-typescript": "^18.0.0",
"i18next": "^23.15.2",
@@ -35,7 +35,7 @@
"react-icons": "^5.3.0",
"react-markdown": "^9.0.1",
"react-redux": "^9.1.2",
"react-router-dom": "^6.26.1",
"react-router": "^7.0.1",
"react-syntax-highlighter": "^15.6.1",
"react-textarea-autosize": "^8.5.4",
"remark-gfm": "^4.0.0",
@@ -47,9 +47,9 @@
"ws": "^8.18.0"
},
"scripts": {
"dev": "npm run make-i18n && cross-env VITE_MOCK_API=false remix vite:dev",
"dev:mock": "npm run make-i18n && cross-env VITE_MOCK_API=true remix vite:dev",
"build": "npm run make-i18n && tsc && remix vite:build",
"dev": "npm run make-i18n && cross-env VITE_MOCK_API=false react-router dev",
"dev:mock": "npm run make-i18n && cross-env VITE_MOCK_API=true react-router dev",
"build": "npm run make-i18n && tsc && react-router build",
"start": "npx sirv-cli build/ --single",
"test": "vitest run",
"test:e2e": "playwright test",
@@ -60,7 +60,8 @@
"prelint": "npm run make-i18n",
"lint": "eslint src --ext .ts,.tsx,.js && prettier --check src/**/*.{ts,tsx}",
"lint:fix": "eslint src --ext .ts,.tsx,.js --fix && prettier --write src/**/*.{ts,tsx}",
"prepare": "cd .. && husky frontend/.husky"
"prepare": "cd .. && husky frontend/.husky",
"typecheck": "react-router typegen && tsc"
},
"husky": {
"hooks": {
@@ -75,8 +76,7 @@
},
"devDependencies": {
"@playwright/test": "^1.48.2",
"@remix-run/dev": "^2.11.2",
"@remix-run/testing": "^2.11.2",
"@react-router/dev": "^7.0.1",
"@tailwindcss/typography": "^0.5.15",
"@tanstack/eslint-plugin-query": "^5.60.1",
"@testing-library/jest-dom": "^6.6.1",
@@ -105,7 +105,7 @@
"husky": "^9.1.6",
"jsdom": "^25.0.1",
"lint-staged": "^15.2.10",
"msw": "^2.3.0-ws.rc-12",
"msw": "^2.6.6",
"postcss": "^8.4.47",
"prettier": "^3.3.3",
"tailwindcss": "^3.4.14",
35 changes: 35 additions & 0 deletions frontend/react-router.config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import type { Config } from "@react-router/dev/config";

/**
* Unpacks the client directory from the frontend build directory.
* React Router's SPA mode builds the client assets into a `client` subdirectory of the
* build directory. This function moves the contents of the client directory up into the
* build directory and then removes the now-empty client directory.
*
* It runs in the `buildEnd` hook of the React Router config below.
*/
const unpackClientDirectory = async () => {
const fs = await import("fs");
const path = await import("path");

const buildDir = path.resolve(__dirname, "build");
const clientDir = path.resolve(buildDir, "client");

const files = await fs.promises.readdir(clientDir);
await Promise.all(
files.map((file) =>
fs.promises.rename(
path.resolve(clientDir, file),
path.resolve(buildDir, file),
),
),
);

await fs.promises.rmdir(clientDir);
};

export default {
appDirectory: "src",
buildEnd: unpackClientDirectory,
ssr: false,
} satisfies Config;
21 changes: 21 additions & 0 deletions frontend/src/api/github-axios-instance.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import axios from "axios";

const github = axios.create({
baseURL: "https://api.github.com",
headers: {
Accept: "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
},
});

const setAuthTokenHeader = (token: string) => {
github.defaults.headers.common.Authorization = `Bearer ${token}`;
};

const removeAuthTokenHeader = () => {
if (github.defaults.headers.common.Authorization) {
delete github.defaults.headers.common.Authorization;
}
};

export { github, setAuthTokenHeader, removeAuthTokenHeader };
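A usage sketch for the new instance; the token value is a placeholder and the call site is assumed:

```typescript
import {
  github,
  setAuthTokenHeader,
  removeAuthTokenHeader,
} from "#/api/github-axios-instance";

// After the user authenticates, attach the token to every GitHub request
setAuthTokenHeader("<gh-token>");
const { data } = await github.get("/user");

// On logout, stop sending the Authorization header
removeAuthTokenHeader();
```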
125 changes: 64 additions & 61 deletions frontend/src/api/github.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,5 @@
/**
* Generates the headers for the GitHub API
* @param token The GitHub token
* @returns The headers for the GitHub API
*/
const generateGitHubAPIHeaders = (token: string) =>
({
Accept: "application/vnd.github+json",
Authorization: `Bearer ${token}`,
"X-GitHub-Api-Version": "2022-11-28",
}) as const;
import { extractNextPageFromLink } from "#/utils/extract-next-page-from-link";
import { github } from "./github-axios-instance";

/**
* Checks if the data is a GitHub error response
@@ -26,74 +17,86 @@ export const isGitHubErrorReponse = <T extends object | Array<unknown>>(
* @returns A list of repositories or an error response
*/
export const retrieveGitHubUserRepositories = async (
token: string,
page = 1,
per_page = 30,
): Promise<Response> => {
const url = new URL("https://api.github.com/user/repos");
url.searchParams.append("sort", "pushed"); // sort by most recently pushed
url.searchParams.append("page", page.toString());
url.searchParams.append("per_page", per_page.toString());

return fetch(url.toString(), {
headers: generateGitHubAPIHeaders(token),
) => {
const response = await github.get<GitHubRepository[]>("/user/repos", {
params: {
sort: "pushed",
page,
per_page,
},
transformResponse: (data) => {
const parsedData: GitHubRepository[] | GitHubErrorReponse =
JSON.parse(data);

if (isGitHubErrorReponse(parsedData)) {
throw new Error(parsedData.message);
}

return parsedData;
},
});

const link = response.headers.link ?? "";
const nextPage = extractNextPageFromLink(link);

return { data: response.data, nextPage };
};

/**
* Given a GitHub token, retrieves the authenticated user
* @param token The GitHub token
* @returns The authenticated user or an error response
*/
export const retrieveGitHubUser = async (
token: string,
): Promise<GitHubUser | GitHubErrorReponse> => {
const response = await fetch("https://api.github.com/user", {
headers: generateGitHubAPIHeaders(token),
export const retrieveGitHubUser = async () => {
const response = await github.get<GitHubUser>("/user", {
transformResponse: (data) => {
const parsedData: GitHubUser | GitHubErrorReponse = JSON.parse(data);

if (isGitHubErrorReponse(parsedData)) {
throw new Error(parsedData.message);
}

return parsedData;
},
});

if (!response.ok) {
throw new Error("Failed to retrieve user data");
}

const data = await response.json();

if (!isGitHubErrorReponse(data)) {
// Only return the necessary user data
const user: GitHubUser = {
id: data.id,
login: data.login,
avatar_url: data.avatar_url,
company: data.company,
name: data.name,
email: data.email,
};

return user;
}

const error: GitHubErrorReponse = {
message: data.message,
documentation_url: data.documentation_url,
status: response.status,
const { data } = response;

const user: GitHubUser = {
id: data.id,
login: data.login,
avatar_url: data.avatar_url,
company: data.company,
name: data.name,
email: data.email,
};

return error;
return user;
};

export const retrieveLatestGitHubCommit = async (
token: string,
repository: string,
): Promise<GitHubCommit[] | GitHubErrorReponse> => {
const url = new URL(`https://api.github.com/repos/${repository}/commits`);
url.searchParams.append("per_page", "1");
const response = await fetch(url.toString(), {
headers: generateGitHubAPIHeaders(token),
});
): Promise<GitHubCommit> => {
const response = await github.get<GitHubCommit>(
`/repos/${repository}/commits`,
{
params: {
per_page: 1,
},
transformResponse: (data) => {
const parsedData: GitHubCommit[] | GitHubErrorReponse =
JSON.parse(data);

if (isGitHubErrorReponse(parsedData)) {
throw new Error(parsedData.message);
}

if (!response.ok) {
throw new Error("Failed to retrieve latest commit");
}
return parsedData[0];
},
},
);

return response.json();
return response.data;
};
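The pagination above relies on `extractNextPageFromLink`, which is imported but not shown in this diff. A minimal sketch of what such a helper could look like (an assumption, not the actual implementation in `#/utils/extract-next-page-from-link`):

```typescript
// Sketch only: parse GitHub's Link header, e.g.
// <https://api.github.com/user/repos?page=2&per_page=30>; rel="next", <...>; rel="last"
export const extractNextPageFromLink = (link: string): number | null => {
  const match = link.match(/<([^>]+)>;\s*rel="next"/);
  if (!match) return null;

  const page = new URL(match[1]).searchParams.get("page");
  return page ? Number(page) : null;
};
```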
30 changes: 30 additions & 0 deletions frontend/src/api/invariant-service.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { openHands } from "./open-hands-axios";

class InvariantService {
static async getPolicy() {
const { data } = await openHands.get("/api/security/policy");
return data.policy;
}

static async getRiskSeverity() {
const { data } = await openHands.get("/api/security/settings");
return data.RISK_SEVERITY;
}

static async getTraces() {
const { data } = await openHands.get("/api/security/export-trace");
return data;
}

static async updatePolicy(policy: string) {
await openHands.post("/api/security/policy", { policy });
}

static async updateRiskSeverity(riskSeverity: number) {
await openHands.post("/api/security/settings", {
RISK_SEVERITY: riskSeverity,
});
}
}

export default InvariantService;
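A brief usage sketch for the new service; the endpoints are exactly those wrapped above, but the surrounding flow is illustrative:

```typescript
import InvariantService from "#/api/invariant-service";

// Read the current policy and risk-severity setting
const policy = await InvariantService.getPolicy();
const severity = await InvariantService.getRiskSeverity();

// Push updates back to the security analyzer
await InvariantService.updatePolicy(policy);
await InvariantService.updateRiskSeverity(severity);
```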
23 changes: 23 additions & 0 deletions frontend/src/api/open-hands-axios.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import axios from "axios";

export const openHands = axios.create();

export const setAuthTokenHeader = (token: string) => {
openHands.defaults.headers.common.Authorization = `Bearer ${token}`;
};

export const setGitHubTokenHeader = (token: string) => {
openHands.defaults.headers.common["X-GitHub-Token"] = token;
};

export const removeAuthTokenHeader = () => {
if (openHands.defaults.headers.common.Authorization) {
delete openHands.defaults.headers.common.Authorization;
}
};

export const removeGitHubTokenHeader = () => {
if (openHands.defaults.headers.common["X-GitHub-Token"]) {
delete openHands.defaults.headers.common["X-GitHub-Token"];
}
};
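A companion sketch for the OpenHands instance, pairing the setters with their cleanup on logout (token values are placeholders):

```typescript
import {
  setAuthTokenHeader,
  setGitHubTokenHeader,
  removeAuthTokenHeader,
  removeGitHubTokenHeader,
} from "#/api/open-hands-axios";

// On login: send both tokens with every backend request
setAuthTokenHeader("<session-token>");
setGitHubTokenHeader("<gh-token>");

// On logout: strip them again
removeAuthTokenHeader();
removeGitHubTokenHeader();
```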
198 changes: 55 additions & 143 deletions frontend/src/api/open-hands.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
-import { request } from "#/services/api";
 import {
   SaveFileSuccessResponse,
   FileUploadSuccessResponse,
@@ -8,103 +7,67 @@ import {
   ErrorResponse,
   GetConfigResponse,
   GetVSCodeUrlResponse,
+  AuthenticateResponse,
 } from "./open-hands.types";
+import { openHands } from "./open-hands-axios";
 
 class OpenHands {
   /**
    * Retrieve the list of models available
    * @returns List of models available
    */
   static async getModels(): Promise<string[]> {
-    const response = await fetch("/api/options/models");
-
-    if (!response.ok) {
-      throw new Error("Failed to fetch models");
-    }
-
-    return response.json();
+    const { data } = await openHands.get<string[]>("/api/options/models");
+    return data;
   }
 
   /**
    * Retrieve the list of agents available
    * @returns List of agents available
    */
   static async getAgents(): Promise<string[]> {
-    const response = await fetch("/api/options/agents");
-
-    if (!response.ok) {
-      throw new Error("Failed to fetch agents");
-    }
-
-    return response.json();
+    const { data } = await openHands.get<string[]>("/api/options/agents");
+    return data;
   }
 
   /**
    * Retrieve the list of security analyzers available
    * @returns List of security analyzers available
    */
   static async getSecurityAnalyzers(): Promise<string[]> {
-    const response = await fetch("/api/options/security-analyzers");
-
-    if (!response.ok) {
-      throw new Error("Failed to fetch security analyzers");
-    }
-
-    return response.json();
+    const { data } = await openHands.get<string[]>(
+      "/api/options/security-analyzers",
+    );
+    return data;
   }
 
   static async getConfig(): Promise<GetConfigResponse> {
-    const response = await fetch("/config.json");
-
-    if (!response.ok) {
-      throw new Error("Failed to fetch config");
-    }
-
-    return response.json();
+    const { data } = await openHands.get<GetConfigResponse>("/config.json");
+    return data;
   }
 
   /**
    * Retrieve the list of files available in the workspace
    * @param path Path to list files from
    * @returns List of files available in the given path. If path is not provided, it lists all the files in the workspace
    */
-  static async getFiles(token: string, path?: string): Promise<string[]> {
-    const url = new URL("/api/list-files", window.location.origin);
-    if (path) url.searchParams.append("path", path);
-
-    const response = await fetch(url.toString(), {
-      headers: {
-        Authorization: `Bearer ${token}`,
-      },
+  static async getFiles(path?: string): Promise<string[]> {
+    const { data } = await openHands.get<string[]>("/api/list-files", {
+      params: { path },
     });
 
-    if (!response.ok) {
-      throw new Error("Failed to fetch files");
-    }
-
-    return response.json();
+    return data;
   }
 
   /**
    * Retrieve the content of a file
    * @param path Full path of the file to retrieve
    * @returns Content of the file
    */
-  static async getFile(token: string, path: string): Promise<string> {
-    const url = new URL("/api/select-file", window.location.origin);
-    url.searchParams.append("file", path);
-
-    const response = await fetch(url.toString(), {
-      headers: {
-        Authorization: `Bearer ${token}`,
-      },
+  static async getFile(path: string): Promise<string> {
+    const { data } = await openHands.get<{ code: string }>("/api/select-file", {
+      params: { file: path },
     });
 
-    if (!response.ok) {
-      throw new Error("Failed to fetch file");
-    }
-
-    const data = await response.json();
     return data.code;
   }
 
@@ -115,31 +78,17 @@ class OpenHands {
    * @returns Success message or error message
    */
   static async saveFile(
-    token: string,
     path: string,
     content: string,
   ): Promise<SaveFileSuccessResponse> {
-    const response = await fetch("/api/save-file", {
-      method: "POST",
-      body: JSON.stringify({ filePath: path, content }),
-      headers: {
-        "Content-Type": "application/json",
-        Authorization: `Bearer ${token}`,
-      },
+    const { data } = await openHands.post<
+      SaveFileSuccessResponse | ErrorResponse
+    >("/api/save-file", {
+      filePath: path,
+      content,
     });
 
-    if (!response.ok) {
-      throw new Error("Failed to save file");
-    }
-
-    const data = (await response.json()) as
-      | SaveFileSuccessResponse
-      | ErrorResponse;
-
-    if ("error" in data) {
-      throw new Error(data.error);
-    }
-
+    if ("error" in data) throw new Error(data.error);
     return data;
   }
 
@@ -148,33 +97,15 @@ class OpenHands {
    * @param file File to upload
    * @returns Success message or error message
    */
-  static async uploadFiles(
-    token: string,
-    files: File[],
-  ): Promise<FileUploadSuccessResponse> {
+  static async uploadFiles(files: File[]): Promise<FileUploadSuccessResponse> {
     const formData = new FormData();
     files.forEach((file) => formData.append("files", file));
 
-    const response = await fetch("/api/upload-files", {
-      method: "POST",
-      body: formData,
-      headers: {
-        Authorization: `Bearer ${token}`,
-      },
-    });
-
-    if (!response.ok) {
-      throw new Error("Failed to upload files");
-    }
-
-    const data = (await response.json()) as
-      | FileUploadSuccessResponse
-      | ErrorResponse;
-
-    if ("error" in data) {
-      throw new Error(data.error);
-    }
+    const { data } = await openHands.post<
+      FileUploadSuccessResponse | ErrorResponse
+    >("/api/upload-files", formData);
 
+    if ("error" in data) throw new Error(data.error);
     return data;
  }
 
@@ -183,53 +114,37 @@ class OpenHands {
    * @param data Feedback data
    * @returns The stored feedback data
    */
-  static async submitFeedback(
-    token: string,
-    feedback: Feedback,
-  ): Promise<FeedbackResponse> {
-    const response = await fetch("/api/submit-feedback", {
-      method: "POST",
-      body: JSON.stringify(feedback),
-      headers: {
-        "Content-Type": "application/json",
-        Authorization: `Bearer ${token}`,
-      },
-    });
-
-    if (!response.ok) {
-      throw new Error("Failed to submit feedback");
-    }
-
-    return response.json();
+  static async submitFeedback(feedback: Feedback): Promise<FeedbackResponse> {
+    const { data } = await openHands.post<FeedbackResponse>(
+      "/api/submit-feedback",
+      feedback,
+    );
+    return data;
   }
 
   /**
    * Authenticate with GitHub token
    * @returns Response with authentication status and user info if successful
    */
   static async authenticate(
-    gitHubToken: string,
     appMode: GetConfigResponse["APP_MODE"],
   ): Promise<boolean> {
     if (appMode === "oss") return true;
 
-    const response = await fetch("/api/authenticate", {
-      method: "POST",
-      headers: {
-        "X-GitHub-Token": gitHubToken,
-      },
-    });
-
-    return response.ok;
+    const response =
+      await openHands.post<AuthenticateResponse>("/api/authenticate");
+    return response.status === 200;
   }
 
   /**
    * Get the blob of the workspace zip
    * @returns Blob of the workspace zip
    */
   static async getWorkspaceZip(): Promise<Blob> {
-    const response = await request(`/api/zip-directory`, {}, false, true);
-    return response.blob();
+    const response = await openHands.get("/api/zip-directory", {
+      responseType: "blob",
+    });
+    return response.data;
   }
 
   /**
@@ -239,32 +154,29 @@ class OpenHands {
   static async getGitHubAccessToken(
     code: string,
   ): Promise<GitHubAccessTokenResponse> {
-    const response = await fetch("/api/github/callback", {
-      method: "POST",
-      body: JSON.stringify({ code }),
-      headers: {
-        "Content-Type": "application/json",
+    const { data } = await openHands.post<GitHubAccessTokenResponse>(
+      "/api/github/callback",
+      {
+        code,
       },
-    });
-
-    if (!response.ok) {
-      throw new Error("Failed to get GitHub access token");
-    }
-
-    return response.json();
+    );
+    return data;
   }
 
   /**
    * Get the VSCode URL
    * @returns VSCode URL
    */
   static async getVSCodeUrl(): Promise<GetVSCodeUrlResponse> {
-    return request(`/api/vscode-url`, {}, false, false, 1);
+    const { data } =
+      await openHands.get<GetVSCodeUrlResponse>("/api/vscode-url");
+    return data;
  }
 
   static async getRuntimeId(): Promise<{ runtime_id: string }> {
-    const data = await request("/api/conversation");
-
+    const { data } = await openHands.get<{ runtime_id: string }>(
+      "/api/conversation",
+    );
     return data;
   }
 }
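
A note on why the per-call "if (!response.ok)" checks could be dropped above: unlike fetch, axios rejects the returned promise on any non-2xx status by default, so failed requests still surface as thrown errors without hand-written guards. If uniform error messages were wanted, that could be centralized on the shared instance with a response interceptor; a minimal sketch, assuming the backend returns an "error" field (this interceptor is not part of the diff):

import { openHands } from "./open-hands-axios";

openHands.interceptors.response.use(
  (response) => response,
  (error) =>
    // Surface a server-provided message when present; fall back to axios's own.
    Promise.reject(new Error(error.response?.data?.error ?? error.message)),
);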
5 changes: 5 additions & 0 deletions frontend/src/api/open-hands.types.ts
@@ -51,3 +51,8 @@ export interface GetVSCodeUrlResponse {
   vscode_url: string | null;
   error?: string;
 }
+
+export interface AuthenticateResponse {
+  message?: string;
+  error?: string;
+}
110 changes: 0 additions & 110 deletions frontend/src/components/agent-control-bar.tsx

This file was deleted.
