Commit c1372a9

add generate report card

shubhras01 committed Jul 2, 2024
1 parent 54854cd
Showing 4 changed files with 50 additions and 29 deletions.
4 changes: 2 additions & 2 deletions python/swe/benchmark/get_score_card.py
@@ -81,7 +81,7 @@ def save_summaries_to_file(predictions_dir, predictions_path, log_dir, scorecard
logging.info("- Wrote summary of run to: %s", results_path)


def main(predictions_dir, log_dir, swe_bench_path, model):
def generate_scorecard(predictions_dir, log_dir, swe_bench_path, model):
logging.info("Starting main function")
eval_refs, _ = get_cur_eval_refs(predictions_dir, swe_bench_path)
predictions_path = predictions_dir / Path(PATH_PATCHES_JSON)
@@ -201,7 +201,7 @@ def main(predictions_dir, log_dir, swe_bench_path, model):
testbed_dir = prediction_path_dir / Path(PATH_TESTBED)
if not os.path.exists(testbed_dir):
os.makedirs(testbed_dir)
main(
generate_scorecard(
predictions_dir=prediction_path_dir,
log_dir=str(args.log_dir),
swe_bench_path=args.swe_bench_path,
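The rename from main to generate_scorecard makes the scoring step importable, and run_evaluation.py below now calls it directly. A minimal sketch of that programmatic use; the directories and the swe_bench_path value are placeholder assumptions, not taken from this commit:

from pathlib import Path

from swe.benchmark.get_score_card import MODEL_GPT4, generate_scorecard

# Placeholder paths; point these at the artifacts of your own benchmark run.
predictions_dir = Path("~/.composio_coder/logs").expanduser()
generate_scorecard(
    predictions_dir=predictions_dir,
    log_dir=str(predictions_dir / "logs"),
    swe_bench_path="princeton-nlp/SWE-bench_Lite",  # assumed dataset name/path
    model=MODEL_GPT4,
)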
66 changes: 43 additions & 23 deletions python/swe/benchmark/run_evaluation.py
@@ -3,21 +3,30 @@
import argparse
import datetime
import logging
import asyncio
from pathlib import Path
import os

from composio_swe.config.constants import KEY_API_KEY
from composio_swe.config.context import Context, set_context
from composio_swe.config.store import IssueConfig
from datasets import load_dataset
from composio_swe.config.context import get_context
from rich.logging import RichHandler

from composio import Action, Composio
from swe.swe_bench_docker.evaulate_on_docker import evaluate, EvaluateOnDockerArgs
from composio.workspace.docker_workspace import LocalDockerArgumentsModel
from composio.workspace.workspace_factory import WorkspaceFactory, WorkspaceType
from swe.examples.crewai_agent import CrewaiAgent, SWEArgs
from swe.benchmark.setup_test_bed import create_patches_file
from swe.benchmark.get_score_card import generate_scorecard, MODEL_GPT4


# get logger
LOGGER_NAME = "local_workspace"
DATASET_NAME = "princeton-nlp/SWE-bench_Lite"
PATH_TESTBED = "testbed/"

handler = RichHandler(show_time=False, show_path=False)
handler.setLevel(logging.DEBUG)
@@ -27,36 +36,37 @@
logger.propagate = False


# The princeton-nlp/SWE-bench_Lite dataset has these fields:
# instance_id: (str) - A formatted instance identifier, usually as repo_owner__repo_name-PR-number.
# patch: (str) - The gold patch, the patch generated by the PR (minus test-related code), that resolved the issue.
# repo: (str) - The repository owner/name identifier from GitHub.
# base_commit: (str) - The commit hash of the repository representing the HEAD of the repository before the solution PR is applied.
# hints_text: (str) - Comments made on the issue prior to the creation date of the solution PR's first commit.
# created_at: (str) - The creation date of the pull request.
# test_patch: (str) - A test-file patch that was contributed by the solution PR.
# problem_statement: (str) - The issue title and body.
# version: (str) - Installation version to use for running evaluation.
# environment_setup_commit: (str) - commit hash to use for environment setup and installation.
# FAIL_TO_PASS: (str) - A json list of strings that represent the set of tests resolved by the PR and tied to the issue resolution.
# PASS_TO_PASS: (str) - A json list of strings that represent tests that should pass before and after the PR application.


def filter_from_repo_name(curr_dataset, repo_name):
filtered_dataset = curr_dataset.filter(
lambda x: x["repo"] == repo_name.strip().lower()
)
return filtered_dataset


def get_issues_dataset(test_split):
test_dataset = load_dataset(
"princeton-nlp/SWE-bench_Lite",
DATASET_NAME,
split=f"test[{test_split}]",
)
return test_dataset


def get_score():
ctx = get_context()
prediction_patches_path = create_patches_file(ctx.agent_logs_dir, DATASET_NAME)
evaluate_args = EvaluateOnDockerArgs(
predictions_path=prediction_patches_path,
docker_dir="./docker",
swe_bench_tasks=DATASET_NAME,
namespace="aorwall",
log_dir=ctx.agent_logs_dir+"/logs"
)
asyncio.run(evaluate(**evaluate_args.dict()))
prediction_path_dir = Path(args.prediction_path_dir)
testbed_dir = prediction_path_dir / Path(PATH_TESTBED)
if not os.path.exists(testbed_dir):
os.makedirs(testbed_dir)
generate_scorecard(
predictions_dir=prediction_path_dir,
log_dir=str(args.log_dir),
swe_bench_path=args.swe_bench_path,
model=MODEL_GPT4,
)


def build_issue_description(hints, problem_statement, include_hints):
if not problem_statement or not problem_statement.strip():
raise ValueError("problem statement is empty")
@@ -256,7 +266,17 @@ def run(test_split, print_only=False, include_hints=True):
action="store_true",
help="Include hints in the issue description",
)
parser.add_argument(
"--gen_report",
action="store_true",
default=False,
help="Generate a report after running evaluations",
)

args = parser.parse_args()

print("Starting evaluation")
run(args.test_split, args.print_only, args.include_hints)
if args.gen_report:
get_score()
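With this change, passing --gen_report to run_evaluation.py runs get_score() after the agent run: create_patches_file() gathers the predictions, evaluate() replays them in Docker, and generate_scorecard() writes the report. The dataset field comments shown above describe the princeton-nlp/SWE-bench_Lite records the runner iterates over; a minimal sketch of inspecting those fields, assuming the datasets library is available:

import json

from datasets import load_dataset

# Load a small slice of the test split, mirroring get_issues_dataset() above.
issues = load_dataset("princeton-nlp/SWE-bench_Lite", split="test[:5]")
for issue in issues:
    # FAIL_TO_PASS and PASS_TO_PASS are stored as JSON-encoded lists of test names.
    fail_to_pass = json.loads(issue["FAIL_TO_PASS"])
    print(issue["instance_id"], issue["repo"], issue["base_commit"])
    print(issue["problem_statement"][:80], "| tests to fix:", len(fail_to_pass))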

5 changes: 3 additions & 2 deletions python/swe/benchmark/setup_test_bed.py
@@ -62,7 +62,7 @@ def log_file(f_name):
return False


def main(predictions_dir, dataset_path_or_name):
def create_patches_file(predictions_dir, dataset_path_or_name):
all_patches = []
pred_total, pred_will_eval = 0, 0
download_and_store_dataset(
@@ -109,6 +109,7 @@ def main(predictions_dir, dataset_path_or_name):
print(
f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total-pred_will_eval} are empty)"
)
return pred_path_orig


if __name__ == "__main__":
@@ -132,7 +133,7 @@ def main(predictions_dir, dataset_path_or_name):
script_path = Path(__file__)
script_dir = script_path.parent
prediction_path_dir = Path(args.prediction_path_dir)
main(
create_patches_file(
predictions_dir=prediction_path_dir,
dataset_path_or_name=args.dataset_path_or_name,
)
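create_patches_file() (formerly main()) now returns the path of the patches file it writes, which is what lets get_score() in run_evaluation.py feed it straight into the Docker evaluation. A minimal sketch of that use, with a placeholder predictions directory:

from pathlib import Path

from swe.benchmark.setup_test_bed import create_patches_file

# Placeholder directory holding the agent's prediction logs.
predictions_dir = Path("~/.composio_coder/logs").expanduser()
patches_path = create_patches_file(
    predictions_dir=predictions_dir,
    dataset_path_or_name="princeton-nlp/SWE-bench_Lite",
)
print("Patches file written to:", patches_path)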
4 changes: 2 additions & 2 deletions python/swe/swe_bench_docker/evaulate_on_docker.py
@@ -56,7 +56,7 @@ async def run_docker_throttled(task_instance, namespace, log_dir, timeout, log_suffix):
return await run_docker_evaluation(task_instance, namespace, log_dir, timeout, log_suffix)


async def main(
async def evaluate(
predictions_path: str,
swe_bench_tasks: str,
namespace: str,
@@ -181,4 +181,4 @@ class EvaluateOnDockerArgs(BaseModel):
namespace="aorwall",
log_dir="~/.composio_coder/logs/logs/"
)
asyncio.run(main(**args.dict()))
asyncio.run(evaluate(**args.dict()))
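Renaming the coroutine from main to evaluate matches the other files and lets run_evaluation.py import it alongside EvaluateOnDockerArgs. The __main__ block above already shows the call pattern; a trimmed sketch of the same invocation with placeholder paths:

import asyncio

from swe.swe_bench_docker.evaulate_on_docker import EvaluateOnDockerArgs, evaluate

# Placeholder locations; substitute your own patches file and log directory.
eval_args = EvaluateOnDockerArgs(
    predictions_path="patches.json",
    docker_dir="./docker",
    swe_bench_tasks="princeton-nlp/SWE-bench_Lite",
    namespace="aorwall",
    log_dir="./logs",
)
asyncio.run(evaluate(**eval_args.dict()))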
