Add TestGenEval benchmark #5534

Open · wants to merge 46 commits into base: main

Changes from 1 commit (46 commits total)
Commits:
7a4729c  initial TestGenEval code (Nov 29, 2024)
c6206f5  Initial pass for TestGenEval (Nov 29, 2024)
280baa2  Licensing (Nov 29, 2024)
75fba59  Readability metrics (Nov 29, 2024)
f7f2531  Fixing testing dependencies (Dec 4, 2024)
30197e6  Add option for starting point (Dec 4, 2024)
791b7f9  Cleaning to not OOM (Dec 5, 2024)
bd66d09  Merge pull request #1 from All-Hands-AI/main (kjain14, Dec 6, 2024)
585dba9  TestGenEval MVP (Dec 11, 2024)
3af6025  + mutation testing (Dec 11, 2024)
b19f735  Merge pull request #2 from All-Hands-AI/main (kjain14, Dec 11, 2024)
7c81deb  Update README (Dec 11, 2024)
2cd64bc  Merge branch 'main' of https://github.com/kjain14/OpenHands (Dec 11, 2024)
b685c67  reset (Dec 12, 2024)
fb9bc87  testgeneval deps (Dec 12, 2024)
77a153e  Final update, now working on all projects (Dec 16, 2024)
3401bd6  Update TestGenEval README with comprehensive information (openhands-agent, Dec 25, 2024)
b47da9e  Merge branch 'main' of github.com:All-Hands-AI/OpenHands into kjain14… (neubig, Dec 25, 2024)
90422e5  Update lock file (neubig, Dec 25, 2024)
31b6967  Any and all pass (Jan 8, 2025)
1ded123  Reset to normal time (Jan 8, 2025)
efb525a  Refine postprocessing (Jan 9, 2025)
219a134  Refine prompt (Jan 10, 2025)
3f0f13d  Update prompt (Jan 10, 2025)
d1e8409  Update filtering (Jan 17, 2025)
8848e60  Only top level filtering (Jan 17, 2025)
3355bae  Merge branch 'main' of github.com:kjain14/OpenHands into kjain14-main (neubig, Jan 20, 2025)
9f9a65c  More updates (Jan 20, 2025)
f781bc8  Fix prompting (Jan 28, 2025)
c7d575b  Removing duplicate script (Jan 28, 2025)
64abd4a  Ablation outputs (Jan 30, 2025)
e7a8daf  Fixing code to handle ablations (Feb 4, 2025)
eef0ed3  Final prompt for final experiments (Feb 6, 2025)
8782e3a  Merge pull request #3 from All-Hands-AI/main (kjain14, Feb 6, 2025)
d8ad8ba  Merge branch 'main' into main (neubig, Feb 8, 2025)
326e75e  Remove prompt truncation (neubig, Feb 9, 2025)
4471002  Remove unneeded input (neubig, Feb 9, 2025)
fd53378  Rename eval-infer (neubig, Feb 10, 2025)
513dd97  Restore testgeneval poetry group (openhands-agent, Feb 10, 2025)
1290a25  Update lock (neubig, Feb 10, 2025)
ace9e6e  Update TestGenEval README to include dependency installation (openhands-agent, Feb 10, 2025)
eb36426  Adding so file for codebleu (kjain14, Feb 11, 2025)
09335d6  Merging (Feb 16, 2025)
367c8a9  Merging (Feb 16, 2025)
92ddc1b  Merge branch 'All-Hands-AI-main' (Feb 16, 2025)
4984bf6  Merge pull request #6 from All-Hands-AI/main (kjain14, Feb 17, 2025)
+ mutation testing
Kush Dave Jain committed Dec 11, 2024
commit 3af60253039a4f22fba6fe6c4c2d1d3fe5a8baca
2,410 changes: 1,214 additions & 1,196 deletions evaluation/benchmarks/testgeneval/constants.py

Large diffs are not rendered by default.
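The collapsed constants.py diff adds (at least) the MUTATION_BUFFER and MUTATION_TEMPLATE constants imported below. Judging from the MUTATION_TEMPLATE.format(test_cmd=..., source_fp=..., timeout=...) call site in eval_infer.py and from cosmic-ray's documented TOML config keys, the new template plausibly looks like this sketch; only the three format fields are known from this PR, everything else is an assumption:

```python
# Hypothetical sketch of the new constant; constants.py is collapsed above,
# so only the format fields are known from the call in eval_infer.py.
# The keys follow cosmic-ray's documented TOML configuration.
MUTATION_TEMPLATE = """\
[cosmic-ray]
module-path = "{source_fp}"
timeout = {timeout}
excluded-modules = []
test-command = "{test_cmd}"

[cosmic-ray.distributor]
name = "local"
"""
```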

35 changes: 30 additions & 5 deletions evaluation/benchmarks/testgeneval/eval_infer.py
@@ -14,6 +14,8 @@
from evaluation.benchmarks.testgeneval.compute_readability import compute_readability
from evaluation.benchmarks.testgeneval.constants import (
COVERAGE_PREFIX,
MUTATION_BUFFER,
MUTATION_TEMPLATE,
MUTATION_TIMEOUT,
TESTS_FAILED,
TESTS_SUFFIX,
@@ -32,7 +34,9 @@
TestSpec,
make_test_spec,
)
from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset
from evaluation.benchmarks.testgeneval.utils import (
load_testgeneval_dataset,
)
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
@@ -146,7 +150,7 @@ def run_tests(runtime, instance, test_script, log_file='/tmp/test_output.log'):
test_action.timeout = 300
test_obs = runtime.run_action(test_action)
assert isinstance(test_obs, CmdOutputObservation), 'Failed to retrieve test output.'
return test_obs.exit_code, test_obs.content
return test_obs.exit_code, test_obs.content, elapsed_time


def run_mutation_testing(
@@ -304,7 +308,7 @@ def process_instance(
run_command(runtime, 'chmod +x /tmp/test.sh /tmp/mutation.sh')
run_command(runtime, f'cp /tmp/test_suite.py /testbed/{test_spec.test_file}')

_, test_output = run_tests(runtime, instance, '/tmp/test.sh')
_, test_output, test_time = run_tests(runtime, instance, '/tmp/test.sh')
instance['test_result']['report']['test_output'] = test_output
if TESTS_FAILED not in test_output:
coverage_success, coverage, unit_test_output, coverage_output = (
@@ -321,6 +325,21 @@ def process_instance(
)

if not args.skip_mutation and coverage_success:
mutation_timeout = max(10, 1.5 * test_time)
mutation_toml = MUTATION_TEMPLATE.format(
test_cmd=test_spec.test_cmd,
source_fp=test_spec.code_file,
timeout=mutation_timeout,
)

with tempfile.TemporaryDirectory() as temp_dir:
mutation_toml_path = os.path.join(temp_dir, 'mutation.toml')
with open(mutation_toml_path, 'w') as f:
f.write(mutation_toml)
runtime.copy_to(mutation_toml_path, '/tmp')

run_command(runtime, 'cp /tmp/mutation.toml /testbed/mutation.toml')

mutation_code, mutation_output = run_mutation_testing(
runtime, instance, '/tmp/mutation.sh'
)
@@ -348,7 +367,7 @@ def process_instance(
logger.error(f'Error processing instance {instance.id}: {e}')
raise RuntimeError(
instance.id,
f'Unexpected output when running test suite:\n{test_suite[:1000]}...',
'Unexpected output...',
logger,
)

@@ -413,6 +432,12 @@ def count_field(row, field):
default=MUTATION_TIMEOUT,
help='Mutation timeout',
)
parser.add_argument(
'--mutation_buffer',
type=int,
default=MUTATION_BUFFER,
help='Mutation buffer',
)
args, _ = parser.parse_known_args()

dataset: list[TestGenEvalInstance] = load_testgeneval_dataset(
@@ -476,7 +501,7 @@ def count_field(row, field):
), 'Input file must contain id, instance_id and test_suite columns.'

predictions['test_spec'] = predictions['instance'].apply(
lambda x: make_test_spec(x, args.mutation_timeout)
lambda x: make_test_spec(x, args.mutation_timeout, args.mutation_buffer)
)

output_file = args.input_file.replace('.jsonl', '.testgeneval.jsonl')
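Taken together, the mutation changes in this file do three things: time the clean test run, derive a per-mutant timeout from it, and stage a rendered mutation.toml into the sandbox. A minimal sketch of that flow using the names visible in this diff (stage_mutation_config itself is hypothetical; the PR inlines this logic in process_instance):

```python
import os
import tempfile


def stage_mutation_config(runtime, test_spec, test_time, template):
    """Hypothetical helper mirroring the inlined staging logic above."""
    # Per-mutant budget scales with the observed clean run, floored at 10s,
    # exactly as in the diff: max(10, 1.5 * test_time).
    mutation_timeout = max(10, 1.5 * test_time)
    mutation_toml = template.format(
        test_cmd=test_spec.test_cmd,
        source_fp=test_spec.code_file,
        timeout=mutation_timeout,
    )
    # Render on the host, copy into the sandbox's /tmp; the PR then moves it
    # to /testbed/mutation.toml with a separate run_command call.
    with tempfile.TemporaryDirectory() as temp_dir:
        path = os.path.join(temp_dir, 'mutation.toml')
        with open(path, 'w') as f:
            f.write(mutation_toml)
        runtime.copy_to(path, '/tmp')
    return mutation_timeout
```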
30 changes: 17 additions & 13 deletions evaluation/benchmarks/testgeneval/report_utils.py
@@ -1,20 +1,24 @@
import re
import json
import re


def check_coverage(coverage_output, code_file):
json_cov = json.loads(coverage_output)
if code_file in json_cov["files"].keys():
file_data = json_cov["files"][code_file]
return True, file_data["summary"]["percent_covered"]
if code_file in json_cov['files'].keys():
file_data = json_cov['files'][code_file]
return True, file_data['summary']['percent_covered']

return False, 0


def check_mutation(mutation_output):
if "total jobs: " in mutation_output:
num_mutants = int(mutation_output.split("total jobs: ")[1].split("\n")[0])
final_conf = mutation_output.split("\n")[-1]
if len(final_conf.strip().split(" ")) == 3:
low, val, high = final_conf.split(" ")
print(mutation_output)
input()
if 'total jobs: ' in mutation_output:
num_mutants = int(mutation_output.split('total jobs: ')[1].split('\n')[0])
final_conf = mutation_output.split('\n')[-1]
if len(final_conf.strip().split(' ')) == 3:
low, val, high = final_conf.split(' ')
low = float(low)
val = float(val)
high = float(high)
@@ -24,8 +28,9 @@ def check_mutation(mutation_output):

return True, num_mutants, mutation_score, confidence_range


return False, -1, 0, -1


def count_methods(code_str):
"""
Counts the number of methods/functions in a given string of code.
@@ -37,7 +42,7 @@ def count_methods(code_str):
int: The number of methods/functions found.
"""
# Regular expression to find Python function definitions
pattern = r"\bdef\b\s+\w+\s*\("
pattern = r'\bdef\b\s+\w+\s*\('
matches = re.findall(pattern, code_str)
return len(matches)

@@ -52,5 +57,4 @@ def get_lines_of_code(code_str):
Returns:
list: A list of lines of code.
"""
return len(code_str.strip().split("\n"))

return len(code_str.strip().split('\n'))
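check_mutation is tightly coupled to cosmic-ray's textual output: it expects a "total jobs: N" line (printed by cr-report) and a final whitespace-separated "low val high" triple (printed by cr-rate --estimate, which is why that command must come last in the mutation script). A sketch of just that parsing contract; the function name is hypothetical, and note that the print()/input() debug calls added above would block a direct call to check_mutation:

```python
def parse_mutation_output(text: str) -> tuple[bool, int, float, float]:
    """Hypothetical re-statement of the check_mutation parsing contract."""
    if 'total jobs: ' not in text:
        return False, -1, 0.0, -1.0
    # cr-report prints the mutant count as "total jobs: N".
    num_mutants = int(text.split('total jobs: ')[1].split('\n')[0])
    # cr-rate --estimate prints "low point high" as the last log line.
    parts = text.strip().split('\n')[-1].strip().split(' ')
    if len(parts) != 3:
        return False, -1, 0.0, -1.0
    low, val, high = (float(p) for p in parts)
    # The elided lines of the hunk derive the score and confidence range from
    # this triple; midpoint and half-width are one plausible reading.
    return True, num_mutants, val, (high - low) / 2
```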
Changes to the benchmark's eval runner shell script (file path not shown in this view):
@@ -5,6 +5,7 @@ INPUT_FILE=$1
NUM_WORKERS=$2
DATASET=$3
SPLIT=$4
SKIP_MUTATION=$5

if [ -z "$INPUT_FILE" ]; then
echo "INPUT_FILE not specified (should be a path to a jsonl file)"
@@ -32,9 +33,13 @@ COMMAND="poetry run python evaluation/benchmarks/testgeneval/eval_infer.py \
--eval-num-workers $NUM_WORKERS \
--input-file $INPUT_FILE \
--dataset $DATASET \
--skip_mutation \
--split $SPLIT"

if [ "$SKIP_MUTATION" == "true" ]; then
echo "Skipping mutation evaluation"
COMMAND="$COMMAND --skip-mutation"
fi

if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
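With the new fifth positional argument, skipping mutation analysis becomes a per-run choice instead of the previously hard-coded --skip_mutation flag. A hypothetical invocation (the script path, output path, and dataset id below are assumptions for illustration, not taken from this view):

```bash
# Evaluate a predictions file with 4 workers on the test split,
# skipping the mutation stage (pass anything but "true" to keep it).
bash evaluation/benchmarks/testgeneval/scripts/eval_infer.sh \
  outputs/testgeneval/output.jsonl 4 kjain14/testgeneval test true
```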
35 changes: 20 additions & 15 deletions evaluation/benchmarks/testgeneval/test_spec.py
@@ -28,6 +28,7 @@ class TestSpec:
id: str
repo: str
version: str
test_cmd: str
code_file: str
test_file: str
baseline_covs: dict
@@ -83,23 +84,15 @@ def make_test_setup(specs, env_name, repo_directory, includes_tox=False):
return eval_commands


def make_test_script_list(instance, specs, env_name, repo_directory):
def make_test_script_list(test_cmd, specs, env_name, repo_directory):
"""
Runs the tests.
"""
test_command = ' '.join(
[
MAP_REPO_VERSION_TO_SPECS[instance['repo']][instance['version']][
'test_cmd'
],
*get_test_directives(instance),
]
)

includes_tox = 'tox' in test_command
includes_tox = 'tox' in test_cmd
eval_commands = make_test_setup(specs, env_name, repo_directory, includes_tox)
eval_commands += [
f'{test_command} || {{ echo "{TESTS_FAILED}" && exit 1; }}',
f'{test_cmd} || {{ echo "{TESTS_FAILED}" && exit 1; }}',
f'echo "{TESTS_SUFFIX}"\n',
'coverage json -o coverage.json',
f'echo "{COVERAGE_PREFIX}"\n',
@@ -118,13 +111,15 @@ def make_mutation_script_list(specs, env_name, repo_directory, mutation_timeout)
eval_commands += [
'cosmic-ray init mutation.toml mutation.sqlite',
f'timeout {mutation_timeout}s cosmic-ray exec mutation.toml mutation.sqlite',
'cr-rate mutation.sqlite --estimate --confidence 95.0',
'cr-report mutation.sqlite',
'cr-rate mutation.sqlite --estimate --confidence 95.0',
]
return eval_commands


def make_test_spec(instance: TestGenEvalInstance, mutation_timeout: int) -> TestSpec:
def make_test_spec(
instance: TestGenEvalInstance, mutation_timeout: int, buffer: int
) -> TestSpec:
if isinstance(instance, TestSpec):
return instance
instance_id = instance[KEY_INSTANCE_ID]
@@ -139,17 +134,27 @@ def make_test_spec(instance: TestGenEvalInstance, mutation_timeout: int) -> Test
repo_directory = f'/{env_name}'
specs = MAP_REPO_VERSION_TO_SPECS[repo][version]

test_script_list = make_test_script_list(instance, specs, env_name, repo_directory)
test_cmd = ' '.join(
[
MAP_REPO_VERSION_TO_SPECS[instance['repo']][instance['version']][
'test_cmd'
],
*get_test_directives(instance),
]
)

test_script_list = make_test_script_list(test_cmd, specs, env_name, repo_directory)

mutation_script_list = make_mutation_script_list(
specs, env_name, repo_directory, mutation_timeout
specs, env_name, repo_directory, mutation_timeout - buffer
)

return TestSpec(
instance_id=instance_id,
id=id,
repo=repo,
test_script_list=test_script_list,
test_cmd=test_cmd,
mutation_script_list=mutation_script_list,
code_file=code_file,
test_file=test_file,
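Two details in this file are easy to miss. Moving cr-rate after cr-report matters because check_mutation in report_utils.py reads the last line of the log as the confidence triple, so the estimate has to be printed last. And passing mutation_timeout - buffer into the script reduces only the exec budget, presumably reserving headroom for the init/report/rate steps around it. A small sketch of the resulting command list (function name hypothetical; values assumed):

```python
def mutation_commands(mutation_timeout: int, buffer: int) -> list[str]:
    # Only `cosmic-ray exec` runs under the reduced budget; the bookkeeping
    # steps share the remaining `buffer` seconds of the overall timeout.
    exec_budget = mutation_timeout - buffer
    return [
        'cosmic-ray init mutation.toml mutation.sqlite',
        f'timeout {exec_budget}s cosmic-ray exec mutation.toml mutation.sqlite',
        'cr-report mutation.sqlite',
        # --estimate must stay last: check_mutation parses its "low val high"
        # output as the final line of the log.
        'cr-rate mutation.sqlite --estimate --confidence 95.0',
    ]
```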