Add TestGenEval benchmark #5534

Open · wants to merge 46 commits into base: main

Changes from 1 commit (46 commits total)
Commits:
7a4729c  initial TestGenEval code (Nov 29, 2024)
c6206f5  Initial pass for TestGenEval (Nov 29, 2024)
280baa2  Licensing (Nov 29, 2024)
75fba59  Readability metrics (Nov 29, 2024)
f7f2531  Fixing testing dependencies (Dec 4, 2024)
30197e6  Add option for starting point (Dec 4, 2024)
791b7f9  Cleaning to not OOM (Dec 5, 2024)
bd66d09  Merge pull request #1 from All-Hands-AI/main (kjain14, Dec 6, 2024)
585dba9  TestGenEval MVP (Dec 11, 2024)
3af6025  + mutation testing (Dec 11, 2024)
b19f735  Merge pull request #2 from All-Hands-AI/main (kjain14, Dec 11, 2024)
7c81deb  Update README (Dec 11, 2024)
2cd64bc  Merge branch 'main' of https://github.com/kjain14/OpenHands (Dec 11, 2024)
b685c67  reset (Dec 12, 2024)
fb9bc87  testgeneval deps (Dec 12, 2024)
77a153e  Final update, now working on all projects (Dec 16, 2024)
3401bd6  Update TestGenEval README with comprehensive information (openhands-agent, Dec 25, 2024)
b47da9e  Merge branch 'main' of github.com:All-Hands-AI/OpenHands into kjain14… (neubig, Dec 25, 2024)
90422e5  Update lock file (neubig, Dec 25, 2024)
31b6967  Any and all pass (Jan 8, 2025)
1ded123  Reset to normal time (Jan 8, 2025)
efb525a  Refine postprocessing (Jan 9, 2025)
219a134  Refine prompt (Jan 10, 2025)
3f0f13d  Update prompt (Jan 10, 2025)
d1e8409  Update filtering (Jan 17, 2025)
8848e60  Only top level filtering (Jan 17, 2025)
3355bae  Merge branch 'main' of github.com:kjain14/OpenHands into kjain14-main (neubig, Jan 20, 2025)
9f9a65c  More updates (Jan 20, 2025)
f781bc8  Fix prompting (Jan 28, 2025)
c7d575b  Removing duplicate script (Jan 28, 2025)
64abd4a  Ablation outputs (Jan 30, 2025)
e7a8daf  Fixing code to handle ablations (Feb 4, 2025)
eef0ed3  Final prompt for final experiments (Feb 6, 2025)
8782e3a  Merge pull request #3 from All-Hands-AI/main (kjain14, Feb 6, 2025)
d8ad8ba  Merge branch 'main' into main (neubig, Feb 8, 2025)
326e75e  Remove prompt truncation (neubig, Feb 9, 2025)
4471002  Remove unneeded input (neubig, Feb 9, 2025)
fd53378  Rename eval-infer (neubig, Feb 10, 2025)
513dd97  Restore testgeneval poetry group (openhands-agent, Feb 10, 2025)
1290a25  Update lock (neubig, Feb 10, 2025)
ace9e6e  Update TestGenEval README to include dependency installation (openhands-agent, Feb 10, 2025)
eb36426  Adding so file for codebleu (kjain14, Feb 11, 2025)
09335d6  Merging (Feb 16, 2025)
367c8a9  Merging (Feb 16, 2025)
92ddc1b  Merge branch 'All-Hands-AI-main' (Feb 16, 2025)
4984bf6  Merge pull request #6 from All-Hands-AI/main (kjain14, Feb 17, 2025)
+ mutation testing
Kush Dave Jain committed Dec 11, 2024
commit 3af60253039a4f22fba6fe6c4c2d1d3fe5a8baca
2,410 changes: 1,214 additions & 1,196 deletions evaluation/benchmarks/testgeneval/constants.py

Large diffs are not rendered by default.
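The collapsed constants.py diff adds (at least) the MUTATION_BUFFER and MUTATION_TEMPLATE constants imported below. Judging from the MUTATION_TEMPLATE.format(test_cmd=..., source_fp=..., timeout=...) call site in eval_infer.py and from cosmic-ray's documented TOML config keys, the new template plausibly looks like this sketch; only the three format fields are known from this PR, everything else is an assumption:

```python
# Hypothetical sketch of the new constant; constants.py is collapsed above,
# so only the format fields are known from the call in eval_infer.py.
# The keys follow cosmic-ray's documented TOML configuration.
MUTATION_TEMPLATE = """\
[cosmic-ray]
module-path = "{source_fp}"
timeout = {timeout}
excluded-modules = []
test-command = "{test_cmd}"

[cosmic-ray.distributor]
name = "local"
"""
```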

35 changes: 30 additions & 5 deletions evaluation/benchmarks/testgeneval/eval_infer.py
@@ -14,6 +14,8 @@
from evaluation.benchmarks.testgeneval.compute_readability import compute_readability
from evaluation.benchmarks.testgeneval.constants import (
COVERAGE_PREFIX,
MUTATION_BUFFER,
MUTATION_TEMPLATE,
MUTATION_TIMEOUT,
TESTS_FAILED,
TESTS_SUFFIX,
@@ -32,7 +34,9 @@
TestSpec,
make_test_spec,
)
from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset
from evaluation.benchmarks.testgeneval.utils import (
load_testgeneval_dataset,
)
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
@@ -146,7 +150,7 @@ def run_tests(runtime, instance, test_script, log_file='/tmp/test_output.log'):
test_action.timeout = 300
test_obs = runtime.run_action(test_action)
assert isinstance(test_obs, CmdOutputObservation), 'Failed to retrieve test output.'
return test_obs.exit_code, test_obs.content
return test_obs.exit_code, test_obs.content, elapsed_time


def run_mutation_testing(
@@ -304,7 +308,7 @@ def process_instance(
run_command(runtime, 'chmod +x /tmp/test.sh /tmp/mutation.sh')
run_command(runtime, f'cp /tmp/test_suite.py /testbed/{test_spec.test_file}')

_, test_output = run_tests(runtime, instance, '/tmp/test.sh')
_, test_output, test_time = run_tests(runtime, instance, '/tmp/test.sh')
instance['test_result']['report']['test_output'] = test_output
if TESTS_FAILED not in test_output:
coverage_success, coverage, unit_test_output, coverage_output = (
@@ -321,6 +325,21 @@ def process_instance(
)

if not args.skip_mutation and coverage_success:
mutation_timeout = max(10, 1.5 * test_time)
mutation_toml = MUTATION_TEMPLATE.format(
test_cmd=test_spec.test_cmd,
source_fp=test_spec.code_file,
timeout=mutation_timeout,
)

with tempfile.TemporaryDirectory() as temp_dir:
mutation_toml_path = os.path.join(temp_dir, 'mutation.toml')
with open(mutation_toml_path, 'w') as f:
f.write(mutation_toml)
runtime.copy_to(mutation_toml_path, '/tmp')

run_command(runtime, 'cp /tmp/mutation.toml /testbed/mutation.toml')

mutation_code, mutation_output = run_mutation_testing(
runtime, instance, '/tmp/mutation.sh'
)
@@ -348,7 +367,7 @@ def process_instance(
logger.error(f'Error processing instance {instance.id}: {e}')
raise RuntimeError(
instance.id,
f'Unexpected output when running test suite:\n{test_suite[:1000]}...',
'Unexpected output...',
logger,
)

@@ -413,6 +432,12 @@ def count_field(row, field):
default=MUTATION_TIMEOUT,
help='Mutation timeout',
)
parser.add_argument(
'--mutation_buffer',
type=int,
default=MUTATION_BUFFER,
help='Mutation buffer',
)
args, _ = parser.parse_known_args()

dataset: list[TestGenEvalInstance] = load_testgeneval_dataset(
@@ -476,7 +501,7 @@ def count_field(row, field):
), 'Input file must contain id, instance_id and test_suite columns.'

predictions['test_spec'] = predictions['instance'].apply(
lambda x: make_test_spec(x, args.mutation_timeout)
lambda x: make_test_spec(x, args.mutation_timeout, args.mutation_buffer)
)

output_file = args.input_file.replace('.jsonl', '.testgeneval.jsonl')
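Taken together, the mutation changes in this file do three things: time the clean test run, derive a per-mutant timeout from it, and stage a rendered mutation.toml into the sandbox. A minimal sketch of that flow using the names visible in this diff (stage_mutation_config itself is hypothetical; the PR inlines this logic in process_instance):

```python
import os
import tempfile


def stage_mutation_config(runtime, test_spec, test_time, template):
    """Hypothetical helper mirroring the inlined staging logic above."""
    # Per-mutant budget scales with the observed clean run, floored at 10s,
    # exactly as in the diff: max(10, 1.5 * test_time).
    mutation_timeout = max(10, 1.5 * test_time)
    mutation_toml = template.format(
        test_cmd=test_spec.test_cmd,
        source_fp=test_spec.code_file,
        timeout=mutation_timeout,
    )
    # Render on the host, copy into the sandbox's /tmp; the PR then moves it
    # to /testbed/mutation.toml with a separate run_command call.
    with tempfile.TemporaryDirectory() as temp_dir:
        path = os.path.join(temp_dir, 'mutation.toml')
        with open(path, 'w') as f:
            f.write(mutation_toml)
        runtime.copy_to(path, '/tmp')
    return mutation_timeout
```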
30 changes: 17 additions & 13 deletions evaluation/benchmarks/testgeneval/report_utils.py
@@ -1,20 +1,24 @@
import re
import json
import re


def check_coverage(coverage_output, code_file):
json_cov = json.loads(coverage_output)
if code_file in json_cov["files"].keys():
file_data = json_cov["files"][code_file]
return True, file_data["summary"]["percent_covered"]
if code_file in json_cov['files'].keys():
file_data = json_cov['files'][code_file]
return True, file_data['summary']['percent_covered']

return False, 0


def check_mutation(mutation_output):
if "total jobs: " in mutation_output:
num_mutants = int(mutation_output.split("total jobs: ")[1].split("\n")[0])
final_conf = mutation_output.split("\n")[-1]
if len(final_conf.strip().split(" ")) == 3:
low, val, high = final_conf.split(" ")
print(mutation_output)
input()
if 'total jobs: ' in mutation_output:
num_mutants = int(mutation_output.split('total jobs: ')[1].split('\n')[0])
final_conf = mutation_output.split('\n')[-1]
if len(final_conf.strip().split(' ')) == 3:
low, val, high = final_conf.split(' ')
low = float(low)
val = float(val)
high = float(high)
@@ -24,8 +28,9 @@ def check_mutation(mutation_output):

return True, num_mutants, mutation_score, confidence_range


return False, -1, 0, -1


def count_methods(code_str):
"""
Counts the number of methods/functions in a given string of code.
@@ -37,7 +42,7 @@ def count_methods(code_str):
int: The number of methods/functions found.
"""
# Regular expression to find Python function definitions
pattern = r"\bdef\b\s+\w+\s*\("
pattern = r'\bdef\b\s+\w+\s*\('
matches = re.findall(pattern, code_str)
return len(matches)

@@ -52,5 +57,4 @@ def get_lines_of_code(code_str):
Returns:
list: A list of lines of code.
"""
return len(code_str.strip().split("\n"))

return len(code_str.strip().split('\n'))
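check_mutation is tightly coupled to cosmic-ray's textual output: it expects a "total jobs: N" line (printed by cr-report) and a final whitespace-separated "low val high" triple (printed by cr-rate --estimate, which is why that command must come last in the mutation script). A sketch of just that parsing contract; the function name is hypothetical, and note that the print()/input() debug calls added above would block a direct call to check_mutation:

```python
def parse_mutation_output(text: str) -> tuple[bool, int, float, float]:
    """Hypothetical re-statement of the check_mutation parsing contract."""
    if 'total jobs: ' not in text:
        return False, -1, 0.0, -1.0
    # cr-report prints the mutant count as "total jobs: N".
    num_mutants = int(text.split('total jobs: ')[1].split('\n')[0])
    # cr-rate --estimate prints "low point high" as the last log line.
    parts = text.strip().split('\n')[-1].strip().split(' ')
    if len(parts) != 3:
        return False, -1, 0.0, -1.0
    low, val, high = (float(p) for p in parts)
    # The elided lines of the hunk derive the score and confidence range from
    # this triple; midpoint and half-width are one plausible reading.
    return True, num_mutants, val, (high - low) / 2
```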
Changes to the benchmark's eval runner shell script (file path not shown in this view):
@@ -5,6 +5,7 @@ INPUT_FILE=$1
NUM_WORKERS=$2
DATASET=$3
SPLIT=$4
SKIP_MUTATION=$5

if [ -z "$INPUT_FILE" ]; then
echo "INPUT_FILE not specified (should be a path to a jsonl file)"
@@ -32,9 +33,13 @@ COMMAND="poetry run python evaluation/benchmarks/testgeneval/eval_infer.py \
--eval-num-workers $NUM_WORKERS \
--input-file $INPUT_FILE \
--dataset $DATASET \
--skip_mutation \
--split $SPLIT"

if [ "$SKIP_MUTATION" == "true" ]; then
echo "Skipping mutation evaluation"
COMMAND="$COMMAND --skip-mutation"
fi

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
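With the new fifth positional argument, skipping mutation analysis becomes a per-run choice instead of the previously hard-coded --skip_mutation flag. A hypothetical invocation (the script path, output path, and dataset id below are assumptions for illustration, not taken from this view):

```bash
# Evaluate a predictions file with 4 workers on the test split,
# skipping the mutation stage (pass anything but "true" to keep it).
bash evaluation/benchmarks/testgeneval/scripts/eval_infer.sh \
  outputs/testgeneval/output.jsonl 4 kjain14/testgeneval test true
```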
35 changes: 20 additions & 15 deletions evaluation/benchmarks/testgeneval/test_spec.py
@@ -28,6 +28,7 @@ class TestSpec:
id: str
repo: str
version: str
test_cmd: str
code_file: str
test_file: str
baseline_covs: dict
@@ -83,23 +84,15 @@ def make_test_setup(specs, env_name, repo_directory, includes_tox=False):
return eval_commands


def make_test_script_list(instance, specs, env_name, repo_directory):
def make_test_script_list(test_cmd, specs, env_name, repo_directory):
"""
Runs the tests.
"""
test_command = ' '.join(
[
MAP_REPO_VERSION_TO_SPECS[instance['repo']][instance['version']][
'test_cmd'
],
*get_test_directives(instance),
]
)

includes_tox = 'tox' in test_command
includes_tox = 'tox' in test_cmd
eval_commands = make_test_setup(specs, env_name, repo_directory, includes_tox)
eval_commands += [
f'{test_command} || {{ echo "{TESTS_FAILED}" && exit 1; }}',
f'{test_cmd} || {{ echo "{TESTS_FAILED}" && exit 1; }}',
f'echo "{TESTS_SUFFIX}"\n',
'coverage json -o coverage.json',
f'echo "{COVERAGE_PREFIX}"\n',
@@ -118,13 +111,15 @@ def make_mutation_script_list(specs, env_name, repo_directory, mutation_timeout)
eval_commands += [
'cosmic-ray init mutation.toml mutation.sqlite',
f'timeout {mutation_timeout}s cosmic-ray exec mutation.toml mutation.sqlite',
'cr-rate mutation.sqlite --estimate --confidence 95.0',
'cr-report mutation.sqlite',
'cr-rate mutation.sqlite --estimate --confidence 95.0',
]
return eval_commands


def make_test_spec(instance: TestGenEvalInstance, mutation_timeout: int) -> TestSpec:
def make_test_spec(
instance: TestGenEvalInstance, mutation_timeout: int, buffer: int
) -> TestSpec:
if isinstance(instance, TestSpec):
return instance
instance_id = instance[KEY_INSTANCE_ID]
@@ -139,17 +134,27 @@ def make_test_spec(instance: TestGenEvalInstance, mutation_timeout: int) -> Test
repo_directory = f'/{env_name}'
specs = MAP_REPO_VERSION_TO_SPECS[repo][version]

test_script_list = make_test_script_list(instance, specs, env_name, repo_directory)
test_cmd = ' '.join(
[
MAP_REPO_VERSION_TO_SPECS[instance['repo']][instance['version']][
'test_cmd'
],
*get_test_directives(instance),
]
)

test_script_list = make_test_script_list(test_cmd, specs, env_name, repo_directory)

mutation_script_list = make_mutation_script_list(
specs, env_name, repo_directory, mutation_timeout
specs, env_name, repo_directory, mutation_timeout - buffer
)

return TestSpec(
instance_id=instance_id,
id=id,
repo=repo,
test_script_list=test_script_list,
test_cmd=test_cmd,
mutation_script_list=mutation_script_list,
code_file=code_file,
test_file=test_file,
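Two details in this file are easy to miss. Moving cr-rate after cr-report matters because check_mutation in report_utils.py reads the last line of the log as the confidence triple, so the estimate has to be printed last. And passing mutation_timeout - buffer into the script reduces only the exec budget, presumably reserving headroom for the init/report/rate steps around it. A small sketch of the resulting command list (function name hypothetical; values assumed):

```python
def mutation_commands(mutation_timeout: int, buffer: int) -> list[str]:
    # Only `cosmic-ray exec` runs under the reduced budget; the bookkeeping
    # steps share the remaining `buffer` seconds of the overall timeout.
    exec_budget = mutation_timeout - buffer
    return [
        'cosmic-ray init mutation.toml mutation.sqlite',
        f'timeout {exec_budget}s cosmic-ray exec mutation.toml mutation.sqlite',
        'cr-report mutation.sqlite',
        # --estimate must stay last: check_mutation parses its "low val high"
        # output as the final line of the log.
        'cr-rate mutation.sqlite --estimate --confidence 95.0',
    ]
```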