Commit c1372a9

add generate report card

shubhras01 committed Jul 2, 2024
1 parent 54854cd
Showing 4 changed files with 50 additions and 29 deletions.
4 changes: 2 additions & 2 deletions python/swe/benchmark/get_score_card.py
@@ -81,7 +81,7 @@ def save_summaries_to_file(predictions_dir, predictions_path, log_dir, scorecard
logging.info("- Wrote summary of run to: %s", results_path)


def main(predictions_dir, log_dir, swe_bench_path, model):
def generate_scorecard(predictions_dir, log_dir, swe_bench_path, model):
logging.info("Starting main function")
eval_refs, _ = get_cur_eval_refs(predictions_dir, swe_bench_path)
predictions_path = predictions_dir / Path(PATH_PATCHES_JSON)
@@ -201,7 +201,7 @@ def main(predictions_dir, log_dir, swe_bench_path, model):
testbed_dir = prediction_path_dir / Path(PATH_TESTBED)
if not os.path.exists(testbed_dir):
os.makedirs(testbed_dir)
main(
generate_scorecard(
predictions_dir=prediction_path_dir,
log_dir=str(args.log_dir),
swe_bench_path=args.swe_bench_path,
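The rename from main to generate_scorecard makes the scoring step importable, and run_evaluation.py below now calls it directly. A minimal sketch of that programmatic use; the directories and the swe_bench_path value are placeholder assumptions, not taken from this commit:

from pathlib import Path

from swe.benchmark.get_score_card import MODEL_GPT4, generate_scorecard

# Placeholder paths; point these at the artifacts of your own benchmark run.
predictions_dir = Path("~/.composio_coder/logs").expanduser()
generate_scorecard(
    predictions_dir=predictions_dir,
    log_dir=str(predictions_dir / "logs"),
    swe_bench_path="princeton-nlp/SWE-bench_Lite",  # assumed dataset name/path
    model=MODEL_GPT4,
)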
66 changes: 43 additions & 23 deletions python/swe/benchmark/run_evaluation.py
@@ -3,21 +3,30 @@
import argparse
import datetime
import logging
import asyncio
from pathlib import Path
import os

from composio_swe.config.constants import KEY_API_KEY
from composio_swe.config.context import Context, set_context
from composio_swe.config.store import IssueConfig
from datasets import load_dataset
from composio_swe.config.context import get_context
from rich.logging import RichHandler

from composio import Action, Composio
from swe.swe_bench_docker.evaulate_on_docker import evaluate, EvaluateOnDockerArgs
from composio.workspace.docker_workspace import LocalDockerArgumentsModel
from composio.workspace.workspace_factory import WorkspaceFactory, WorkspaceType
from swe.examples.crewai_agent import CrewaiAgent, SWEArgs
from swe.benchmark.setup_test_bed import create_patches_file
from swe.benchmark.get_score_card import generate_scorecard, MODEL_GPT4


# get logger
LOGGER_NAME = "local_workspace"
DATASET_NAME = "princeton-nlp/SWE-bench_Lite"
PATH_TESTBED = "testbed/"

handler = RichHandler(show_time=False, show_path=False)
handler.setLevel(logging.DEBUG)
@@ -27,36 +36,37 @@
logger.propagate = False


# The princeton-nlp/SWE-bench_Lite dataset has these fields:
# instance_id: (str) - A formatted instance identifier, usually as repo_owner__repo_name-PR-number.
# patch: (str) - The gold patch, the patch generated by the PR (minus test-related code), that resolved the issue.
# repo: (str) - The repository owner/name identifier from GitHub.
# base_commit: (str) - The commit hash of the repository representing the HEAD of the repository before the solution PR is applied.
# hints_text: (str) - Comments made on the issue prior to the creation date of the solution PR's first commit.
# created_at: (str) - The creation date of the pull request.
# test_patch: (str) - A test-file patch that was contributed by the solution PR.
# problem_statement: (str) - The issue title and body.
# version: (str) - Installation version to use for running evaluation.
# environment_setup_commit: (str) - commit hash to use for environment setup and installation.
# FAIL_TO_PASS: (str) - A json list of strings that represent the set of tests resolved by the PR and tied to the issue resolution.
# PASS_TO_PASS: (str) - A json list of strings that represent tests that should pass before and after the PR application.


def filter_from_repo_name(curr_dataset, repo_name):
filtered_dataset = curr_dataset.filter(
lambda x: x["repo"] == repo_name.strip().lower()
)
return filtered_dataset


def get_issues_dataset(test_split):
test_dataset = load_dataset(
"princeton-nlp/SWE-bench_Lite",
DATASET_NAME,
split=f"test[{test_split}]",
)
return test_dataset


def get_score():
ctx = get_context()
prediction_patches_path = create_patches_file(ctx.agent_logs_dir, DATASET_NAME)
evaluate_args = EvaluateOnDockerArgs(
predictions_path=prediction_patches_path,
docker_dir="./docker",
swe_bench_tasks=DATASET_NAME,
namespace="aorwall",
log_dir=ctx.agent_logs_dir+"/logs"
)
asyncio.run(evaluate(**evaluate_args.dict()))
prediction_path_dir = Path(args.prediction_path_dir)
testbed_dir = prediction_path_dir / Path(PATH_TESTBED)
if not os.path.exists(testbed_dir):
os.makedirs(testbed_dir)
generate_scorecard(
predictions_dir=prediction_path_dir,
log_dir=str(args.log_dir),
swe_bench_path=args.swe_bench_path,
model=MODEL_GPT4,
)


def build_issue_description(hints, problem_statement, include_hints):
if not problem_statement or not problem_statement.strip():
raise ValueError("problem statement is empty")
@@ -256,7 +266,17 @@ def run(test_split, print_only=False, include_hints=True):
action="store_true",
help="Include hints in the issue description",
)
parser.add_argument(
"--gen_report",
action="store_true",
default=False,
help="Generate a report after running evaluations",
)

args = parser.parse_args()

print("Starting evaluation")
run(args.test_split, args.print_only, args.include_hints)
if args.gen_report:
get_score()
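With this change, passing --gen_report to run_evaluation.py runs get_score() after the agent run: create_patches_file() gathers the predictions, evaluate() replays them in Docker, and generate_scorecard() writes the report. The dataset field comments shown above describe the princeton-nlp/SWE-bench_Lite records the runner iterates over; a minimal sketch of inspecting those fields, assuming the datasets library is available:

import json

from datasets import load_dataset

# Load a small slice of the test split, mirroring get_issues_dataset() above.
issues = load_dataset("princeton-nlp/SWE-bench_Lite", split="test[:5]")
for issue in issues:
    # FAIL_TO_PASS and PASS_TO_PASS are stored as JSON-encoded lists of test names.
    fail_to_pass = json.loads(issue["FAIL_TO_PASS"])
    print(issue["instance_id"], issue["repo"], issue["base_commit"])
    print(issue["problem_statement"][:80], "| tests to fix:", len(fail_to_pass))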

5 changes: 3 additions & 2 deletions python/swe/benchmark/setup_test_bed.py
@@ -62,7 +62,7 @@ def log_file(f_name):
return False


def main(predictions_dir, dataset_path_or_name):
def create_patches_file(predictions_dir, dataset_path_or_name):
all_patches = []
pred_total, pred_will_eval = 0, 0
download_and_store_dataset(
@@ -109,6 +109,7 @@ def main(predictions_dir, dataset_path_or_name):
print(
f"Found {pred_total} total predictions, will evaluate {pred_will_eval} ({pred_total-pred_will_eval} are empty)"
)
return pred_path_orig


if __name__ == "__main__":
@@ -132,7 +133,7 @@ def main(predictions_dir, dataset_path_or_name):
script_path = Path(__file__)
script_dir = script_path.parent
prediction_path_dir = Path(args.prediction_path_dir)
main(
create_patches_file(
predictions_dir=prediction_path_dir,
dataset_path_or_name=args.dataset_path_or_name,
)
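create_patches_file() (formerly main()) now returns the path of the patches file it writes, which is what lets get_score() in run_evaluation.py feed it straight into the Docker evaluation. A minimal sketch of that use, with a placeholder predictions directory:

from pathlib import Path

from swe.benchmark.setup_test_bed import create_patches_file

# Placeholder directory holding the agent's prediction logs.
predictions_dir = Path("~/.composio_coder/logs").expanduser()
patches_path = create_patches_file(
    predictions_dir=predictions_dir,
    dataset_path_or_name="princeton-nlp/SWE-bench_Lite",
)
print("Patches file written to:", patches_path)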
4 changes: 2 additions & 2 deletions python/swe/swe_bench_docker/evaulate_on_docker.py
@@ -56,7 +56,7 @@ async def run_docker_throttled(task_instance, namespace, log_dir, timeout, log_suffix):
return await run_docker_evaluation(task_instance, namespace, log_dir, timeout, log_suffix)


async def main(
async def evaluate(
predictions_path: str,
swe_bench_tasks: str,
namespace: str,
@@ -181,4 +181,4 @@ class EvaluateOnDockerArgs(BaseModel):
namespace="aorwall",
log_dir="~/.composio_coder/logs/logs/"
)
asyncio.run(main(**args.dict()))
asyncio.run(evaluate(**args.dict()))
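Renaming the coroutine from main to evaluate matches the other files and lets run_evaluation.py import it alongside EvaluateOnDockerArgs. The __main__ block above already shows the call pattern; a trimmed sketch of the same invocation with placeholder paths:

import asyncio

from swe.swe_bench_docker.evaulate_on_docker import EvaluateOnDockerArgs, evaluate

# Placeholder locations; substitute your own patches file and log directory.
eval_args = EvaluateOnDockerArgs(
    predictions_path="patches.json",
    docker_dir="./docker",
    swe_bench_tasks="princeton-nlp/SWE-bench_Lite",
    namespace="aorwall",
    log_dir="./logs",
)
asyncio.run(evaluate(**eval_args.dict()))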
