feat: adding swe-bench docker to improve evaluation (#246)
- adds the swe-bench-docker repo code to improve evaluation and run it in docker

tasks:
- [ ] add evaluation function as part of run_eval script
- [ ] build docker images and push them to a public docker repo
- [ ] use the same docker image to run composio-swe

---------

Co-authored-by: Viraj <35092918+angrybayblade@users.noreply.github.com>
Co-authored-by: Karan Vaidya <kaavee315@gmail.com>
3 people authored Jul 4, 2024
1 parent 3fbeedd commit 14e1c85
Showing 29 changed files with 3,382 additions and 44 deletions.
2 changes: 1 addition & 1 deletion python/composio/tools/env/base.py
@@ -81,7 +81,7 @@ def new(self) -> Shell:

def get(self, id: t.Optional[str] = None) -> Shell:
"""Get shell instance."""
if id is None:
if id is None or id == "":
return self.recent
if id not in self._shells:
raise ComposioSDKError(
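With this change, an empty-string shell id is treated the same as None and resolves to the most recent shell instead of raising. A minimal sketch of the resulting behavior, assuming a factory instance exposing the new()/get() methods shown above:

# hypothetical factory instance; names per the diff above
factory.new()               # spawns a shell; becomes the most recent
factory.get()               # id=None -> most recent shell
factory.get("")             # empty id now also -> most recent shell (previously raised)
factory.get("unknown-id")   # still raises ComposioSDKError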
4 changes: 4 additions & 0 deletions python/composio/tools/env/docker/workspace.py
@@ -4,6 +4,7 @@

import os
import typing as t
from composio.utils.logging import get as get_logger

from docker import DockerClient, from_env
from docker.errors import DockerException
@@ -27,7 +28,10 @@ class DockerWorkspace(Workspace):
def __init__(self, image: t.Optional[str] = None) -> None:
"""Create a docker workspace."""
self.id = generate_id()
logger = get_logger(name="docker_workspace")
logger.info(f"Creating docker workspace with image: {image}")
self._image = image or os.environ.get("COMPOSIO_SWE_AGENT", DEFAULT_IMAGE)
logger.info(f"Using image: {self._image}")
self._container = self.client.containers.run(
image=self._image,
command="/bin/bash -l -m",
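The added logging makes the image resolution order visible: the explicit image argument wins, then the COMPOSIO_SWE_AGENT environment variable, then DEFAULT_IMAGE. A standalone sketch of that precedence (the default image value below is a placeholder, not the SDK's real constant):

import os

DEFAULT_IMAGE = "composio/default-image"  # placeholder value for illustration

def resolve_image(image=None):
    # explicit argument > COMPOSIO_SWE_AGENT env var > baked-in default
    return image or os.environ.get("COMPOSIO_SWE_AGENT", DEFAULT_IMAGE)

assert resolve_image("custom:latest") == "custom:latest"
os.environ["COMPOSIO_SWE_AGENT"] = "team/image:v1"
assert resolve_image() == "team/image:v1"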
2 changes: 2 additions & 0 deletions python/composio/tools/env/factory.py
@@ -61,6 +61,8 @@ def new(cls, env: ExecEnv, **kwargs: t.Any) -> Workspace:
if env == ExecEnv.HOST:
workspace = HostWorkspace(**kwargs)
elif env == ExecEnv.DOCKER:
logger = get_logger(name="workspace_factory_new")
logger.info(f"Creating docker workspace with kwargs: {kwargs}")
workspace = DockerWorkspace(**kwargs)
else:
raise ComposioSDKError(
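Since kwargs are forwarded to the workspace class, callers can pass an image straight through the factory. A hedged usage sketch based on the signature above (requires a running Docker daemon; the image name is one of the prebuilt images listed later in this diff):

from composio.tools.env.factory import ExecEnv, WorkspaceFactory

# `image` is optional; it falls through to DockerWorkspace.__init__
workspace = WorkspaceFactory.new(ExecEnv.DOCKER, image="techcomposio/swe-bench-django_django")
print(workspace.id)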
@@ -33,4 +33,7 @@ def execute(
authorisation_data=authorisation_data,
shell_id=request_data.shell_id,
)
return ShellExecResponse(stdout=output["stdout"], stderr=output["stderr"])
return ShellExecResponse(
stdout="Check git_repo_tree.txt for the git-repo-tree results. Use Open File function to check the file.",
stderr=output["stderr"],
)
@@ -110,7 +110,7 @@ def test_git_workflow(self):
{},
)
self.assertIsNotNone(get_patch_result)
self.assertIsInstance(get_patch_result, tuple)
self.assertIsInstance(get_patch_result, BaseResponse)
self.assertIsInstance(tuple(get_patch_result)[0], tuple)
patch_content = (
tuple(tuple(get_patch_result)[0])[1]
4 changes: 2 additions & 2 deletions python/swe/benchmark/get_score_card.py
@@ -81,7 +81,7 @@ def save_summaries_to_file(predictions_dir, predictions_path, log_dir, scorecard
logging.info("- Wrote summary of run to: %s", results_path)


def main(predictions_dir, log_dir, swe_bench_path, model):
def generate_scorecard(predictions_dir, log_dir, swe_bench_path, model):
logging.info("Starting main function")
eval_refs, _ = get_cur_eval_refs(predictions_dir, swe_bench_path)
predictions_path = predictions_dir / Path(PATH_PATCHES_JSON)
@@ -201,7 +201,7 @@ def main(predictions_dir, log_dir, swe_bench_path, model):
testbed_dir = prediction_path_dir / Path(PATH_TESTBED)
if not os.path.exists(testbed_dir):
os.makedirs(testbed_dir)
main(
generate_scorecard(
predictions_dir=prediction_path_dir,
log_dir=str(args.log_dir),
swe_bench_path=args.swe_bench_path,
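Renaming main to generate_scorecard lets run_evaluation.py import the scorecard step directly. A hedged call sketch (paths are illustrative; MODEL_GPT4 comes from benchmark.constants):

from pathlib import Path

from benchmark.constants import MODEL_GPT4
from benchmark.get_score_card import generate_scorecard

generate_scorecard(
    predictions_dir=Path("./predictions"),  # illustrative path
    log_dir="./logs",
    swe_bench_path="./logs/dataset",
    model=MODEL_GPT4,
)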
125 changes: 89 additions & 36 deletions python/swe/benchmark/run_evaluation.py
@@ -1,22 +1,36 @@
# pylint: disable=logging-fstring-interpolation

import argparse
import asyncio
import datetime
import logging
import os
from pathlib import Path
from benchmark.constants import MODEL_GPT4
from benchmark.get_score_card import generate_scorecard
from benchmark.setup_test_bed import create_patches_file

from composio_crewai import ComposioToolSet
from composio_swe.config.constants import KEY_API_KEY
from composio_swe.config.context import Context, set_context
from composio_swe.config.constants import (
KEY_API_KEY,
LOCAL_CACHE_DIRECTORY_NAME,
LOGS_DIR,
)
from composio_swe.config.context import Context, get_context, set_context
from composio_swe.config.store import IssueConfig
from datasets import load_dataset
from rich.logging import RichHandler

from composio import Action, Composio
from composio.tools.env.factory import ExecEnv, WorkspaceFactory
from examples.crewai_agent import CrewaiAgent, SWEArgs
from swe.examples.crewai_agent import CrewaiAgent, SWEArgs
from swe.swe_bench_docker.evaulate_on_docker import EvaluateOnDockerArgs, evaluate


# get logger
LOGGER_NAME = "local_workspace"
DATASET_NAME = "princeton-nlp/SWE-bench_Lite"
PATH_TESTBED = "testbed/"

handler = RichHandler(show_time=False, show_path=False)
handler.setLevel(logging.DEBUG)
@@ -26,36 +40,41 @@
logger.propagate = False


# princeton swe bench lite dataset has these fields
# instance_id: (str) - A formatted instance identifier, usually as repo_owner__repo_name-PR-number.
# patch: (str) - The gold patch, the patch generated by the PR (minus test-related code), that resolved the issue.
# repo: (str) - The repository owner/name identifier from GitHub.
# base_commit: (str) - The commit hash of the repository representing the HEAD of the repository before the solution PR is applied.
# hints_text: (str) - Comments made on the issue prior to the creation of the solution PR's first commit creation date.
# created_at: (str) - The creation date of the pull request.
# test_patch: (str) - A test-file patch that was contributed by the solution PR.
# problem_statement: (str) - The issue title and body.
# version: (str) - Installation version to use for running evaluation.
# environment_setup_commit: (str) - commit hash to use for environment setup and installation.
# FAIL_TO_PASS: (str) - A json list of strings that represent the set of tests resolved by the PR and tied to the issue resolution.
# PASS_TO_PASS: (str) - A json list of strings that represent tests that should pass before and after the PR application.


def filter_from_repo_name(curr_dataset, repo_name):
filtered_dataset = curr_dataset.filter(
lambda x: x["repo"] == repo_name.strip().lower()
)
return filtered_dataset


def get_issues_dataset(test_split):
test_dataset = load_dataset(
"princeton-nlp/SWE-bench_Lite",
DATASET_NAME,
split=f"test[{test_split}]",
)
return test_dataset


def get_score(logs_dir=None):
ctx = get_context()
if logs_dir is None:
logs_dir = ctx.agent_logs_dir
prediction_patches_path = create_patches_file(logs_dir, DATASET_NAME)
print("logs dir: ", logs_dir)
print("prediction_patches_path: ", prediction_patches_path)
evaluate_args = EvaluateOnDockerArgs(
predictions_path=str(prediction_patches_path),
# docker_dir="./docker",
swe_bench_tasks=DATASET_NAME,
namespace="aorwall",
log_dir=str(logs_dir),
)
asyncio.run(evaluate(**evaluate_args.model_dump()))
prediction_path_dir = Path(prediction_patches_path).parent
testbed_dir = prediction_path_dir / Path(PATH_TESTBED)
if not os.path.exists(testbed_dir):
os.makedirs(testbed_dir)
generate_scorecard(
predictions_dir=prediction_path_dir,
log_dir=str(logs_dir),
swe_bench_path=f"{logs_dir}/dataset",
model=MODEL_GPT4,
)


def build_issue_description(hints, problem_statement, include_hints):
if not problem_statement or not problem_statement.strip():
raise ValueError("problem statement is empty")
@@ -98,20 +117,30 @@ def create_workspace_from_image(repo, repo_to_image_id_map, base_commit):
workspace_id = workspace.id
workspace_creation_time = datetime.datetime.now() - start_time
composio_toolset = ComposioToolSet(workspace_id=workspace_id)
cd_resp = composio_toolset.execute_action(
action=Action.SHELL_EXECUTE_COMMAND,
params={
"cmd": f"cd /{repo.split('/')[-1]}",
},
)
if isinstance(cd_resp, dict) and cd_resp.get("status") == "failure":
raise Exception(f"Error changing directory: {cd_resp['details']}")
logger.info(
"workspace is created, workspace-id is: %s, creation time: %s",
workspace_id,
workspace_creation_time,
)
logger.info("Resetting repository to base commit")
composio_toolset.execute_action(
reset_resp = composio_toolset.execute_action(
action=Action.GITCMDTOOL_GITHUB_CLONE_CMD,
params={
"repo_name": repo,
"just_reset": True,
"commit_id": base_commit,
},
)
if isinstance(reset_resp, dict) and reset_resp.get("status") == "failure":
raise Exception(f"Error resetting repository: {reset_resp['details']}")
return workspace_id


@@ -167,22 +196,23 @@ def setup_workspace(repo, repo_to_workspace_map, repo_to_image_id_map, base_comm
)


def run(test_split, print_only=False, include_hints=True):
def run(test_split, print_only=False, include_hints=True, logs_dir=None):
"""
Main function to load and display entries from the SWE-bench lite dataset.
"""

issues = get_issues_dataset(test_split)

repo_to_workspace_map = {}
repo_to_image_id_map = {""}
repo_to_image_id_map = {
"django/django": "techcomposio/swe-bench-django_django",
"astropy/astropy": "kaavee315/astropy_astropy",
}
for count, issue in enumerate(issues, 1):
try:
repo = issue["repo"]
print(f"Processing {count}th issue with repoMap: {repo_to_workspace_map}")
print(f"Repo: {repo}")
print(f"Issue id: {issue['instance_id']}")
print(f"Issue description: {issue['problem_statement']}")

if print_only:
if include_hints:
@@ -197,7 +227,10 @@ def run(test_split, print_only=False, include_hints=True):
issue_description = build_issue_description(
issue["hints_text"], issue["problem_statement"], include_hints
)
print(f"Issue description: {issue_description}")
print("Issue description (first 10 lines):")
for line in issue_description.split("\n")[:10]:
print(line)
print("...")
patch = issue["patch"]
install_commit_id = issue["environment_setup_commit"]
logger.info(
@@ -227,7 +260,7 @@ def run(test_split, print_only=False, include_hints=True):
ctx.model_env = model_env_config
set_context(ctx)

args = SWEArgs(agent_logs_dir=ctx.agent_logs_dir)
args = SWEArgs(agent_logs_dir=logs_dir or ctx.agent_logs_dir)
coder = CrewaiAgent(args=args, workspace_id=workspace_id)
coder.setup_and_solve(
issue_config=ctx.issue_config, workspace_id=workspace_id
@@ -245,7 +278,7 @@ def run(test_split, print_only=False, include_hints=True):
parser.add_argument(
"--test_split",
type=str,
default="1:10",
default="20:40",
help="Test split range (e.g., 1:10)",
)
parser.add_argument(
@@ -258,7 +291,27 @@ def run(test_split, print_only=False, include_hints=True):
action="store_true",
help="Include hints in the issue description",
)
parser.add_argument(
"--gen_report",
action="store_true",
default=False,
help="Generate a report after running evaluations",
)
parser.add_argument(
"--logs_dir",
type=str,
default=f"{Path.home()}/{LOCAL_CACHE_DIRECTORY_NAME}/{LOGS_DIR}/{int(datetime.datetime.now().timestamp())}",
help="Logs directory",
)

args = parser.parse_args()

print("Starting evaluation")
run(args.test_split, args.print_only, args.include_hints)
# Make the log directory if it doesn't exist
logs_dir = Path(args.logs_dir)
if not logs_dir.exists():
logs_dir.mkdir(parents=True)

print("Starting evaluation with gen_report: ", args.gen_report)
run(args.test_split, args.print_only, args.include_hints, args.logs_dir)
if args.gen_report:
get_score(args.logs_dir)
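End to end, the new --gen_report path chains the pieces added in this commit: run() solves issues inside the prebuilt SWE-bench Docker images, then get_score() collects patches with create_patches_file, replays them via evaluate, and summarizes with generate_scorecard. A hedged sketch of driving the same flow programmatically (module path assumed from the file's location under python/swe/benchmark/):

import os

from benchmark.run_evaluation import get_score, run

logs_dir = "/tmp/swe_logs"  # illustrative
os.makedirs(logs_dir, exist_ok=True)  # the CLI entry point creates this directory too
run(test_split="20:40", print_only=False, include_hints=True, logs_dir=logs_dir)
get_score(logs_dir)  # Docker-based evaluation + scorecard from the same logs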
5 changes: 3 additions & 2 deletions python/swe/benchmark/setup_test_bed.py
@@ -62,7 +62,7 @@ def log_file(f_name):
return False


def main(predictions_dir, dataset_path_or_name):
def create_patches_file(predictions_dir, dataset_path_or_name):
all_patches = []
pred_total, pred_will_eval = 0, 0
download_and_store_dataset(
@@ -109,6 +109,7 @@ def main(predictions_dir, dataset_path_or_name):
print(
f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total-pred_will_eval} are empty)"
)
return pred_path_orig


if __name__ == "__main__":
@@ -132,7 +133,7 @@ def main(predictions_dir, dataset_path_or_name):
script_path = Path(__file__)
script_dir = script_path.parent
prediction_path_dir = Path(args.prediction_path_dir)
main(
create_patches_file(
predictions_dir=prediction_path_dir,
dataset_path_or_name=args.dataset_path_or_name,
)
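The rename to create_patches_file, plus the new return value, lets callers chain straight into evaluation: the function now hands back the path of the patches file it wrote. A hedged usage sketch:

from pathlib import Path

from benchmark.setup_test_bed import create_patches_file

# collects per-instance patches from the agent logs into a patches JSON file
patches_path = create_patches_file(
    predictions_dir=Path("./predictions"),  # illustrative path
    dataset_path_or_name="princeton-nlp/SWE-bench_Lite",
)
print("patches collected at:", patches_path)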
1 change: 1 addition & 0 deletions python/swe/composio_swe/agents/base.py
@@ -64,6 +64,7 @@ def save(self, instance_id: str) -> None:
"""Save current history state."""
self.agent_logs[instance_id] = self.current_logs
with open(self.task_output_logs, "w", encoding="utf-8") as f:
self.logger.info(f"Saving logs to {self.task_output_logs}")
f.write(json.dumps(self.agent_logs))

def setup_and_solve(
2 changes: 1 addition & 1 deletion python/swe/examples/crewai_agent.py
@@ -24,7 +24,7 @@ def __init__(self, args: SWEArgs, workspace_id: str) -> None:
apps=[
App.SEARCHTOOL,
App.GITCMDTOOL,
App.FILETOOL,
App.FILEEDITTOOL,
App.HISTORYFETCHERTOOL,
]
)
23 changes: 23 additions & 0 deletions python/swe/swe_bench_docker/THIRD-PARTY-LICENSE
@@ -0,0 +1,23 @@
For the third-party code(python/swe/swe_bench_docker) used in this project, we have included the following licenses:

MIT License

Copyright (c) 2024 Albert Örwall

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
