feat: adding swe-bench docker to improve evaluation (#246)
- adds the swe-bench-docker repo code to improve evaluation and run it in docker

tasks:
- [ ] add evaluation function as part of run_eval script
- [ ] build docker images and push them to a public docker repo
- [ ] use the same docker image to run composio-swe

---------

Co-authored-by: Viraj <35092918+angrybayblade@users.noreply.github.com>
Co-authored-by: Karan Vaidya <kaavee315@gmail.com>
3 people authored Jul 4, 2024
1 parent 3fbeedd commit 14e1c85
Showing 29 changed files with 3,382 additions and 44 deletions.
2 changes: 1 addition & 1 deletion python/composio/tools/env/base.py
@@ -81,7 +81,7 @@ def new(self) -> Shell:

def get(self, id: t.Optional[str] = None) -> Shell:
"""Get shell instance."""
if id is None:
if id is None or id == "":
return self.recent
if id not in self._shells:
raise ComposioSDKError(
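With this change, an empty-string shell id is treated the same as None and resolves to the most recent shell instead of raising. A minimal sketch of the resulting behavior, assuming a factory instance exposing the new()/get() methods shown above:

# hypothetical factory instance; names per the diff above
factory.new()               # spawns a shell; becomes the most recent
factory.get()               # id=None -> most recent shell
factory.get("")             # empty id now also -> most recent shell (previously raised)
factory.get("unknown-id")   # still raises ComposioSDKError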
4 changes: 4 additions & 0 deletions python/composio/tools/env/docker/workspace.py
@@ -4,6 +4,7 @@

import os
import typing as t
from composio.utils.logging import get as get_logger

from docker import DockerClient, from_env
from docker.errors import DockerException
@@ -27,7 +28,10 @@ class DockerWorkspace(Workspace):
def __init__(self, image: t.Optional[str] = None) -> None:
"""Create a docker workspace."""
self.id = generate_id()
logger = get_logger(name="docker_workspace")
logger.info(f"Creating docker workspace with image: {image}")
self._image = image or os.environ.get("COMPOSIO_SWE_AGENT", DEFAULT_IMAGE)
logger.info(f"Using image: {self._image}")
self._container = self.client.containers.run(
image=self._image,
command="/bin/bash -l -m",
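The added logging makes the image resolution order visible: the explicit image argument wins, then the COMPOSIO_SWE_AGENT environment variable, then DEFAULT_IMAGE. A standalone sketch of that precedence (the default image value below is a placeholder, not the SDK's real constant):

import os

DEFAULT_IMAGE = "composio/default-image"  # placeholder value for illustration

def resolve_image(image=None):
    # explicit argument > COMPOSIO_SWE_AGENT env var > baked-in default
    return image or os.environ.get("COMPOSIO_SWE_AGENT", DEFAULT_IMAGE)

assert resolve_image("custom:latest") == "custom:latest"
os.environ["COMPOSIO_SWE_AGENT"] = "team/image:v1"
assert resolve_image() == "team/image:v1"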
2 changes: 2 additions & 0 deletions python/composio/tools/env/factory.py
@@ -61,6 +61,8 @@ def new(cls, env: ExecEnv, **kwargs: t.Any) -> Workspace:
if env == ExecEnv.HOST:
workspace = HostWorkspace(**kwargs)
elif env == ExecEnv.DOCKER:
logger = get_logger(name="workspace_factory_new")
logger.info(f"Creating docker workspace with kwargs: {kwargs}")
workspace = DockerWorkspace(**kwargs)
else:
raise ComposioSDKError(
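Since kwargs are forwarded to the workspace class, callers can pass an image straight through the factory. A hedged usage sketch based on the signature above (requires a running Docker daemon; the image name is one of the prebuilt images listed later in this diff):

from composio.tools.env.factory import ExecEnv, WorkspaceFactory

# `image` is optional; it falls through to DockerWorkspace.__init__
workspace = WorkspaceFactory.new(ExecEnv.DOCKER, image="techcomposio/swe-bench-django_django")
print(workspace.id)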
@@ -33,4 +33,7 @@ def execute(
authorisation_data=authorisation_data,
shell_id=request_data.shell_id,
)
return ShellExecResponse(stdout=output["stdout"], stderr=output["stderr"])
return ShellExecResponse(
stdout="Check git_repo_tree.txt for the git-repo-tree results. Use Open File function to check the file.",
stderr=output["stderr"],
)
@@ -110,7 +110,7 @@ def test_git_workflow(self):
{},
)
self.assertIsNotNone(get_patch_result)
self.assertIsInstance(get_patch_result, tuple)
self.assertIsInstance(get_patch_result, BaseResponse)
self.assertIsInstance(tuple(get_patch_result)[0], tuple)
patch_content = (
tuple(tuple(get_patch_result)[0])[1]
4 changes: 2 additions & 2 deletions python/swe/benchmark/get_score_card.py
@@ -81,7 +81,7 @@ def save_summaries_to_file(predictions_dir, predictions_path, log_dir, scorecard
logging.info("- Wrote summary of run to: %s", results_path)


def main(predictions_dir, log_dir, swe_bench_path, model):
def generate_scorecard(predictions_dir, log_dir, swe_bench_path, model):
logging.info("Starting main function")
eval_refs, _ = get_cur_eval_refs(predictions_dir, swe_bench_path)
predictions_path = predictions_dir / Path(PATH_PATCHES_JSON)
@@ -201,7 +201,7 @@ def main(predictions_dir, log_dir, swe_bench_path, model):
testbed_dir = prediction_path_dir / Path(PATH_TESTBED)
if not os.path.exists(testbed_dir):
os.makedirs(testbed_dir)
main(
generate_scorecard(
predictions_dir=prediction_path_dir,
log_dir=str(args.log_dir),
swe_bench_path=args.swe_bench_path,
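Renaming main to generate_scorecard lets run_evaluation.py import the scorecard step directly. A hedged call sketch (paths are illustrative; MODEL_GPT4 comes from benchmark.constants):

from pathlib import Path

from benchmark.constants import MODEL_GPT4
from benchmark.get_score_card import generate_scorecard

generate_scorecard(
    predictions_dir=Path("./predictions"),  # illustrative path
    log_dir="./logs",
    swe_bench_path="./logs/dataset",
    model=MODEL_GPT4,
)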
125 changes: 89 additions & 36 deletions python/swe/benchmark/run_evaluation.py
@@ -1,22 +1,36 @@
# pylint: disable=logging-fstring-interpolation

import argparse
import asyncio
import datetime
import logging
import os
from pathlib import Path
from benchmark.constants import MODEL_GPT4
from benchmark.get_score_card import generate_scorecard
from benchmark.setup_test_bed import create_patches_file

from composio_crewai import ComposioToolSet
from composio_swe.config.constants import KEY_API_KEY
from composio_swe.config.context import Context, set_context
from composio_swe.config.constants import (
KEY_API_KEY,
LOCAL_CACHE_DIRECTORY_NAME,
LOGS_DIR,
)
from composio_swe.config.context import Context, get_context, set_context
from composio_swe.config.store import IssueConfig
from datasets import load_dataset
from rich.logging import RichHandler

from composio import Action, Composio
from composio.tools.env.factory import ExecEnv, WorkspaceFactory
from examples.crewai_agent import CrewaiAgent, SWEArgs
from swe.examples.crewai_agent import CrewaiAgent, SWEArgs
from swe.swe_bench_docker.evaulate_on_docker import EvaluateOnDockerArgs, evaluate


# get logger
LOGGER_NAME = "local_workspace"
DATASET_NAME = "princeton-nlp/SWE-bench_Lite"
PATH_TESTBED = "testbed/"

handler = RichHandler(show_time=False, show_path=False)
handler.setLevel(logging.DEBUG)
@@ -26,36 +40,41 @@
logger.propagate = False


# princeton swe bench lite dataset has these fields
# instance_id: (str) - A formatted instance identifier, usually as repo_owner__repo_name-PR-number.
# patch: (str) - The gold patch, the patch generated by the PR (minus test-related code), that resolved the issue.
# repo: (str) - The repository owner/name identifier from GitHub.
# base_commit: (str) - The commit hash of the repository representing the HEAD of the repository before the solution PR is applied.
# hints_text: (str) - Comments made on the issue prior to the creation of the solution PR's first commit creation date.
# created_at: (str) - The creation date of the pull request.
# test_patch: (str) - A test-file patch that was contributed by the solution PR.
# problem_statement: (str) - The issue title and body.
# version: (str) - Installation version to use for running evaluation.
# environment_setup_commit: (str) - commit hash to use for environment setup and installation.
# FAIL_TO_PASS: (str) - A json list of strings that represent the set of tests resolved by the PR and tied to the issue resolution.
# PASS_TO_PASS: (str) - A json list of strings that represent tests that should pass before and after the PR application.


def filter_from_repo_name(curr_dataset, repo_name):
filtered_dataset = curr_dataset.filter(
lambda x: x["repo"] == repo_name.strip().lower()
)
return filtered_dataset


def get_issues_dataset(test_split):
test_dataset = load_dataset(
"princeton-nlp/SWE-bench_Lite",
DATASET_NAME,
split=f"test[{test_split}]",
)
return test_dataset


def get_score(logs_dir=None):
ctx = get_context()
if logs_dir is None:
logs_dir = ctx.agent_logs_dir
prediction_patches_path = create_patches_file(logs_dir, DATASET_NAME)
print("logs dir: ", logs_dir)
print("prediction_patches_path: ", prediction_patches_path)
evaluate_args = EvaluateOnDockerArgs(
predictions_path=str(prediction_patches_path),
# docker_dir="./docker",
swe_bench_tasks=DATASET_NAME,
namespace="aorwall",
log_dir=str(logs_dir),
)
asyncio.run(evaluate(**evaluate_args.model_dump()))
prediction_path_dir = Path(prediction_patches_path).parent
testbed_dir = prediction_path_dir / Path(PATH_TESTBED)
if not os.path.exists(testbed_dir):
os.makedirs(testbed_dir)
generate_scorecard(
predictions_dir=prediction_path_dir,
log_dir=str(logs_dir),
swe_bench_path=f"{logs_dir}/dataset",
model=MODEL_GPT4,
)


def build_issue_description(hints, problem_statement, include_hints):
if not problem_statement or not problem_statement.strip():
raise ValueError("problem statement is empty")
@@ -98,20 +117,30 @@ def create_workspace_from_image(repo, repo_to_image_id_map, base_commit):
workspace_id = workspace.id
workspace_creation_time = datetime.datetime.now() - start_time
composio_toolset = ComposioToolSet(workspace_id=workspace_id)
cd_resp = composio_toolset.execute_action(
action=Action.SHELL_EXECUTE_COMMAND,
params={
"cmd": f"cd /{repo.split('/')[-1]}",
},
)
if isinstance(cd_resp, dict) and cd_resp.get("status") == "failure":
raise Exception(f"Error changing directory: {cd_resp['details']}")
logger.info(
"workspace is created, workspace-id is: %s, creation time: %s",
workspace_id,
workspace_creation_time,
)
logger.info("Resetting repository to base commit")
composio_toolset.execute_action(
reset_resp = composio_toolset.execute_action(
action=Action.GITCMDTOOL_GITHUB_CLONE_CMD,
params={
"repo_name": repo,
"just_reset": True,
"commit_id": base_commit,
},
)
if isinstance(reset_resp, dict) and reset_resp.get("status") == "failure":
raise Exception(f"Error resetting repository: {reset_resp['details']}")
return workspace_id


@@ -167,22 +196,23 @@ def setup_workspace(repo, repo_to_workspace_map, repo_to_image_id_map, base_comm
)


def run(test_split, print_only=False, include_hints=True):
def run(test_split, print_only=False, include_hints=True, logs_dir=None):
"""
Main function to load and display entries from the SWE-bench lite dataset.
"""

issues = get_issues_dataset(test_split)

repo_to_workspace_map = {}
repo_to_image_id_map = {""}
repo_to_image_id_map = {
"django/django": "techcomposio/swe-bench-django_django",
"astropy/astropy": "kaavee315/astropy_astropy",
}
for count, issue in enumerate(issues, 1):
try:
repo = issue["repo"]
print(f"Processing {count}th issue with repoMap: {repo_to_workspace_map}")
print(f"Repo: {repo}")
print(f"Issue id: {issue['instance_id']}")
print(f"Issue description: {issue['problem_statement']}")

if print_only:
if include_hints:
@@ -197,7 +227,10 @@ def run(test_split, print_only=False, include_hints=True):
issue_description = build_issue_description(
issue["hints_text"], issue["problem_statement"], include_hints
)
print(f"Issue description: {issue_description}")
print("Issue description (first 10 lines):")
for line in issue_description.split("\n")[:10]:
print(line)
print("...")
patch = issue["patch"]
install_commit_id = issue["environment_setup_commit"]
logger.info(
@@ -227,7 +260,7 @@ def run(test_split, print_only=False, include_hints=True):
ctx.model_env = model_env_config
set_context(ctx)

args = SWEArgs(agent_logs_dir=ctx.agent_logs_dir)
args = SWEArgs(agent_logs_dir=logs_dir or ctx.agent_logs_dir)
coder = CrewaiAgent(args=args, workspace_id=workspace_id)
coder.setup_and_solve(
issue_config=ctx.issue_config, workspace_id=workspace_id
@@ -245,7 +278,7 @@ def run(test_split, print_only=False, include_hints=True):
parser.add_argument(
"--test_split",
type=str,
default="1:10",
default="20:40",
help="Test split range (e.g., 1:10)",
)
parser.add_argument(
@@ -258,7 +291,27 @@ def run(test_split, print_only=False, include_hints=True):
action="store_true",
help="Include hints in the issue description",
)
parser.add_argument(
"--gen_report",
action="store_true",
default=False,
help="Generate a report after running evaluations",
)
parser.add_argument(
"--logs_dir",
type=str,
default=f"{Path.home()}/{LOCAL_CACHE_DIRECTORY_NAME}/{LOGS_DIR}/{int(datetime.datetime.now().timestamp())}",
help="Logs directory",
)

args = parser.parse_args()

print("Starting evaluation")
run(args.test_split, args.print_only, args.include_hints)
# Make the log directory if it doesn't exist
logs_dir = Path(args.logs_dir)
if not logs_dir.exists():
logs_dir.mkdir(parents=True)

print("Starting evaluation with gen_report: ", args.gen_report)
run(args.test_split, args.print_only, args.include_hints, args.logs_dir)
if args.gen_report:
get_score(args.logs_dir)
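End to end, the new --gen_report path chains the pieces added in this commit: run() solves issues inside the prebuilt SWE-bench Docker images, then get_score() collects patches with create_patches_file, replays them via evaluate, and summarizes with generate_scorecard. A hedged sketch of driving the same flow programmatically (module path assumed from the file's location under python/swe/benchmark/):

import os

from benchmark.run_evaluation import get_score, run

logs_dir = "/tmp/swe_logs"  # illustrative
os.makedirs(logs_dir, exist_ok=True)  # the CLI entry point creates this directory too
run(test_split="20:40", print_only=False, include_hints=True, logs_dir=logs_dir)
get_score(logs_dir)  # Docker-based evaluation + scorecard from the same logs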
5 changes: 3 additions & 2 deletions python/swe/benchmark/setup_test_bed.py
@@ -62,7 +62,7 @@ def log_file(f_name):
return False


def main(predictions_dir, dataset_path_or_name):
def create_patches_file(predictions_dir, dataset_path_or_name):
all_patches = []
pred_total, pred_will_eval = 0, 0
download_and_store_dataset(
@@ -109,6 +109,7 @@ def main(predictions_dir, dataset_path_or_name):
print(
f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total-pred_will_eval} are empty)"
)
return pred_path_orig


if __name__ == "__main__":
@@ -132,7 +133,7 @@ def main(predictions_dir, dataset_path_or_name):
script_path = Path(__file__)
script_dir = script_path.parent
prediction_path_dir = Path(args.prediction_path_dir)
main(
create_patches_file(
predictions_dir=prediction_path_dir,
dataset_path_or_name=args.dataset_path_or_name,
)
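The rename to create_patches_file, plus the new return value, lets callers chain straight into evaluation: the function now hands back the path of the patches file it wrote. A hedged usage sketch:

from pathlib import Path

from benchmark.setup_test_bed import create_patches_file

# collects per-instance patches from the agent logs into a patches JSON file
patches_path = create_patches_file(
    predictions_dir=Path("./predictions"),  # illustrative path
    dataset_path_or_name="princeton-nlp/SWE-bench_Lite",
)
print("patches collected at:", patches_path)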
1 change: 1 addition & 0 deletions python/swe/composio_swe/agents/base.py
@@ -64,6 +64,7 @@ def save(self, instance_id: str) -> None:
"""Save current history state."""
self.agent_logs[instance_id] = self.current_logs
with open(self.task_output_logs, "w", encoding="utf-8") as f:
self.logger.info(f"Saving logs to {self.task_output_logs}")
f.write(json.dumps(self.agent_logs))

def setup_and_solve(
2 changes: 1 addition & 1 deletion python/swe/examples/crewai_agent.py
@@ -24,7 +24,7 @@ def __init__(self, args: SWEArgs, workspace_id: str) -> None:
apps=[
App.SEARCHTOOL,
App.GITCMDTOOL,
App.FILETOOL,
App.FILEEDITTOOL,
App.HISTORYFETCHERTOOL,
]
)
23 changes: 23 additions & 0 deletions python/swe/swe_bench_docker/THIRD-PARTY-LICENSE
@@ -0,0 +1,23 @@
For the third-party code(python/swe/swe_bench_docker) used in this project, we have included the following licenses:

MIT License

Copyright (c) 2024 Albert Örwall

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
