Add TestGenEval benchmark #5534

Open · wants to merge 46 commits into base: main

Changes from 1 commit

Commits (46)
7a4729c  initial TestGenEval code (Nov 29, 2024)
c6206f5  Initial pass for TestGenEval (Nov 29, 2024)
280baa2  Licensing (Nov 29, 2024)
75fba59  Readability metrics (Nov 29, 2024)
f7f2531  Fixing testing dependencies (Dec 4, 2024)
30197e6  Add option for starting point (Dec 4, 2024)
791b7f9  Cleaning to not OOM (Dec 5, 2024)
bd66d09  Merge pull request #1 from All-Hands-AI/main (kjain14, Dec 6, 2024)
585dba9  TestGenEval MVP (Dec 11, 2024)
3af6025  + mutation testing (Dec 11, 2024)
b19f735  Merge pull request #2 from All-Hands-AI/main (kjain14, Dec 11, 2024)
7c81deb  Update README (Dec 11, 2024)
2cd64bc  Merge branch 'main' of https://github.com/kjain14/OpenHands (Dec 11, 2024)
b685c67  reset (Dec 12, 2024)
fb9bc87  testgeneval deps (Dec 12, 2024)
77a153e  Final update, now working on all projects (Dec 16, 2024)
3401bd6  Update TestGenEval README with comprehensive information (openhands-agent, Dec 25, 2024)
b47da9e  Merge branch 'main' of github.com:All-Hands-AI/OpenHands into kjain14… (neubig, Dec 25, 2024)
90422e5  Update lock file (neubig, Dec 25, 2024)
31b6967  Any and all pass (Jan 8, 2025)
1ded123  Reset to normal time (Jan 8, 2025)
efb525a  Refine postprocessing (Jan 9, 2025)
219a134  Refine prompt (Jan 10, 2025)
3f0f13d  Update prompt (Jan 10, 2025)
d1e8409  Update filtering (Jan 17, 2025)
8848e60  Only top level filtering (Jan 17, 2025)
3355bae  Merge branch 'main' of github.com:kjain14/OpenHands into kjain14-main (neubig, Jan 20, 2025)
9f9a65c  More updates (Jan 20, 2025)
f781bc8  Fix prompting (Jan 28, 2025)
c7d575b  Removing duplicate script (Jan 28, 2025)
64abd4a  Ablation outputs (Jan 30, 2025)
e7a8daf  Fixing code to handle ablations (Feb 4, 2025)
eef0ed3  Final prompt for final experiments (Feb 6, 2025)
8782e3a  Merge pull request #3 from All-Hands-AI/main (kjain14, Feb 6, 2025)
d8ad8ba  Merge branch 'main' into main (neubig, Feb 8, 2025)
326e75e  Remove prompt truncation (neubig, Feb 9, 2025)
4471002  Remove unneeded input (neubig, Feb 9, 2025)
fd53378  Rename eval-infer (neubig, Feb 10, 2025)
513dd97  Restore testgeneval poetry group (openhands-agent, Feb 10, 2025)
1290a25  Update lock (neubig, Feb 10, 2025)
ace9e6e  Update TestGenEval README to include dependency installation (openhands-agent, Feb 10, 2025)
eb36426  Adding so file for codebleu (kjain14, Feb 11, 2025)
09335d6  Merging (Feb 16, 2025)
367c8a9  Merging (Feb 16, 2025)
92ddc1b  Merge branch 'All-Hands-AI-main' (Feb 16, 2025)
4984bf6  Merge pull request #6 from All-Hands-AI/main (kjain14, Feb 17, 2025)

Final prompt for final experiments
Kush Dave Jain committed Feb 6, 2025
commit eef0ed3410d4edc00e4d16880e28dd22ededf52e
9 changes: 9 additions & 0 deletions evaluation/benchmarks/testgeneval/NOTES.md
@@ -1,3 +1,12 @@
codamosa_ids = ['pydata__xarray-4750-16496', 'pydata__xarray-3239-16458', 'pydata__xarray-4966-16515', 'pydata__xarray-3302-16459', 'pydata__xarray-5126-16518', 'pydata__xarray-4994-16516', 'pydata__xarray-3905-16478', 'pydata__xarray-4182-16484', 'pydata__xarray-5131-16520', 'pydata__xarray-5662-16532', 'pydata__xarray-3364-16461', 'pydata__xarray-5731-16534', 'pydata__xarray-3239-16457', 'pydata__xarray-7203-16577', 'pydata__xarray-3156-16454', 'pydata__xarray-5126-16519', 'pydata__xarray-5365-16529', 'pydata__xarray-4629-16492', 'pydata__xarray-4248-16486', 'pydata__xarray-4339-16487', 'pydata__xarray-3151-16453', 'pydata__xarray-3114-16452', 'pydata__xarray-5033-16517', 'pydata__xarray-4802-16505', 'pydata__xarray-5455-16530', 'pydata__xarray-6400-16539', 'pydata__xarray-3239-16456', 'pydata__xarray-4419-16488']

pynguin_ids = ['pydata__xarray-6548-16541', 'pydata__xarray-7003-16557', 'pydata__xarray-3114-16452', 'pydata__xarray-4339-16487', 'pydata__xarray-6889-16549', 'pydata__xarray-3239-16458', 'pydata__xarray-3364-16461', 'pydata__xarray-3239-16457', 'pydata__xarray-5365-16529', 'pydata__xarray-5131-16520', 'pydata__xarray-7229-16578', 'pydata__xarray-6461-16540', 'pydata__xarray-4419-16488', 'pydata__xarray-7147-16571', 'pydata__xarray-3151-16453', 'pydata__xarray-4966-16515', 'pydata__xarray-4629-16492', 'pydata__xarray-3239-16456', 'pydata__xarray-7400-16582', 'pydata__xarray-4994-16516', 'pydata__xarray-3302-16459', 'pydata__xarray-6601-16544', 'pydata__xarray-6882-16548', 'pydata__xarray-6135-16535', 'pydata__xarray-7393-16581', 'pydata__xarray-5731-16534', 'pydata__xarray-7203-16577']

ids = ['pydata__xarray-3114-16452', 'pydata__xarray-3151-16453', 'pydata__xarray-3156-16454', 'pydata__xarray-3239-16456', 'pydata__xarray-3239-16457', 'pydata__xarray-3239-16458', 'pydata__xarray-3302-16459', 'pydata__xarray-3364-16461', 'pydata__xarray-3677-16471', 'pydata__xarray-3905-16478', 'pydata__xarray-4182-16484', 'pydata__xarray-4248-16486', 'pydata__xarray-4339-16487', 'pydata__xarray-4419-16488', 'pydata__xarray-4629-16492', 'pydata__xarray-4750-16496', 'pydata__xarray-4802-16505', 'pydata__xarray-4966-16515', 'pydata__xarray-4994-16516', 'pydata__xarray-5033-16517', 'pydata__xarray-5126-16518', 'pydata__xarray-5126-16519', 'pydata__xarray-5131-16520', 'pydata__xarray-5365-16529', 'pydata__xarray-5455-16530', 'pydata__xarray-5662-16532', 'pydata__xarray-5731-16534', 'pydata__xarray-6135-16535', 'pydata__xarray-6135-16536', 'pydata__xarray-6386-16537', 'pydata__xarray-6394-16538', 'pydata__xarray-6400-16539', 'pydata__xarray-6461-16540', 'pydata__xarray-6548-16541', 'pydata__xarray-6599-16543', 'pydata__xarray-6601-16544', 'pydata__xarray-6882-16548', 'pydata__xarray-6889-16549', 'pydata__xarray-7003-16557', 'pydata__xarray-7147-16571', 'pydata__xarray-7150-16572', 'pydata__xarray-7203-16577', 'pydata__xarray-7229-16578', 'pydata__xarray-7393-16581', 'pydata__xarray-7400-16582']


Command eval (our approach):
poetry run ./evaluation/benchmarks/testgeneval/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/kjain14__testgeneval-test/CodeActAgent/gpt-4o_maxiter_25_N_v0.20.0-no-hint-run_1/output.jsonl 10 kjain14/testgeneval test true

Command run (our approach):
./evaluation/benchmarks/testgeneval/scripts/run_infer.sh llm.eval_gpt HEAD CodeActAgent -1 25 10 kjain14/testgeneval test 1 ../TestGenEval/results/testgeneval/preds/gpt-4o-2024-08-06__testgeneval__0.2__test.jsonl
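
The ID lists above presumably pin down the instance subsets used when comparing against CodaMosa and Pynguin. A minimal, hypothetical sketch (not part of the PR) of selecting such a subset from the kjain14/testgeneval dataset referenced in the commands, assuming the split exposes an instance_id column:

# Hypothetical helper, not part of the PR: select a subset of TestGenEval
# instances by ID, e.g. to mirror the CodaMosa comparison subset above.
from datasets import load_dataset

codamosa_ids = {
    'pydata__xarray-4750-16496',
    'pydata__xarray-3239-16458',
    # ... remaining IDs from the list above
}

dataset = load_dataset('kjain14/testgeneval', split='test')
subset = dataset.filter(lambda row: row['instance_id'] in codamosa_ids)
print(f'selected {len(subset)} of {len(dataset)} instances')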
4 changes: 2 additions & 2 deletions evaluation/benchmarks/testgeneval/eval_infer.py
@@ -28,7 +28,7 @@
)
from evaluation.benchmarks.testgeneval.pygments_utils import tokenize_code
from evaluation.benchmarks.testgeneval.run_infer import get_instance_docker_image
from evaluation.benchmarks.testgeneval.test_filter import filter_passing_tests
from evaluation.benchmarks.testgeneval.test_filter import filter_tests
from evaluation.benchmarks.testgeneval.test_spec import (
TestGenEvalInstance,
TestSpec,
@@ -221,7 +221,7 @@ def grade_test_output(
)

logger.info('Calling filter unit tests')
filtered_content, passing_tests, failing_tests = filter_passing_tests(
filtered_content, passing_tests, failing_tests = filter_tests(
test_suite, unit_test_output, test_spec.repo
)

4 changes: 2 additions & 2 deletions evaluation/benchmarks/testgeneval/prompt.py
@@ -74,7 +74,7 @@
NOTE: if there is an error executing tests you MUST fix it before exiting. DO NOT install new packages.
NOTE: if outputting a revised test suite REPLACE {test_file} with the revised suite

**Output the final test suite** (20+ tests) for {test_file} in a single code block, no extra commentary.
**Output the final test suite** (20+ tests) for {test_file} in a single code block, no extra commentary. MAKE SURE you run the tests and ensure you can see which tests passed and failed BEFORE exiting.
"""

CODEACT_TESTGEN_PROMPT_ITERATE = """
@@ -110,5 +110,5 @@
NOTE: if there is an error executing tests you MUST fix it before exiting. DO NOT install new packages.
NOTE: if outputting a revised test suite REPLACE {test_file} with the revised suite

**Output the final test suite** (20+ tests) for {test_file} in a single code block, no extra commentary.
**Output the final test suite** (20+ tests) for {test_file} in a single code block, no extra commentary. MAKE SURE you run the tests and ensure you can see which tests passed and failed BEFORE exiting.
"""
58 changes: 32 additions & 26 deletions evaluation/benchmarks/testgeneval/run_infer.py
@@ -354,33 +354,37 @@ def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info('-' * 30)
logger.info('BEGIN Runtime Completion Fn')
logger.info('-' * 30)
obs: CmdOutputObservation
workspace_dir_name = _get_swebench_workspace_dir_name(instance)

action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
)
try:
logger.info('-' * 30)
logger.info('BEGIN Runtime Completion Fn')
logger.info('-' * 30)
obs: CmdOutputObservation
workspace_dir_name = _get_swebench_workspace_dir_name(instance)

action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
)

action = CmdRunAction(command=f'cat {instance.test_file}')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to find file: {instance.test_file} in /workspace/{workspace_dir_name}',
)
action = CmdRunAction(command=f'cat {instance.test_file}')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to find file: {instance.test_file} in /workspace/{workspace_dir_name}',
)

test_suite = obs.content.strip()
test_suite = obs.content.strip()
except Exception:
print('Skipping, exeception in complete_runtime')
test_suite = instance['full_pred'] if instance['full_pred'] is not None else ''

# action = CmdRunAction(command='git add -A')
# action.set_hard_timeout(600)
@@ -471,7 +475,7 @@ def process_instance(

# Save the output
output = EvalOutput(
instance_id=instance.instance_id,
instance_id=instance.id,
instruction=instruction,
instance=_preprocess_instance(instance.to_dict()), # SWE Bench specific
test_result=test_result,
@@ -480,6 +484,8 @@ def process_instance(
metrics=metrics,
error=state.last_error if state and state.last_error else None,
)
print(output)
input()
return output
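
The net effect of the complete_runtime change above is a read-with-fallback: try to cat the generated test file from the sandbox, and fall back to the model's full prediction if anything fails. A simplified, hypothetical sketch of that pattern (condensed from the diff, not the actual function; assumes runtime.run_action returns an observation with exit_code and content):

# Simplified sketch of the fallback logic in complete_runtime: read the test
# file from the sandbox, fall back to the stored full_pred on any error.
def read_test_suite(runtime, instance) -> str:
    try:
        action = CmdRunAction(command=f"cat {instance['test_file']}")
        action.set_hard_timeout(600)
        obs = runtime.run_action(action)
        if obs.exit_code != 0:
            raise RuntimeError(f"failed to read {instance['test_file']}")
        return obs.content.strip()
    except Exception:
        # Skip the sandbox read and use the agent's full predicted test suite.
        return instance['full_pred'] if instance['full_pred'] is not None else ''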


86 changes: 84 additions & 2 deletions evaluation/benchmarks/testgeneval/test_filter.py
@@ -1,3 +1,4 @@
import ast
import re
from typing import List, Tuple

@@ -203,10 +204,10 @@ def filter_passing_tests(
method_full_name = (
method_name.split('.')[-1].split('(')[0].strip().split(' ')[-1]
)
# Check if the method name is in passing_tests or if any passing_test is in the method name
# Check if the method name is in failing_tests or if any failing_test is in the method name
if not (
any(method_full_name in failing_test for failing_test in failing_tests)
and not any(
or any(
failing_test in method_full_name for failing_test in failing_tests
)
):
@@ -241,3 +242,84 @@ def filter_passing_tests(
content_parts.append(func_body)

return '\n\n'.join(content_parts), passing_tests, failing_tests


def filter_tests(
test_content: str, test_output: str, repo: str
) -> Tuple[str, List[str], List[str]]:
"""
Filter tests using AST parsing to remove failing test functions from the test file.
Non-test functions (e.g. setup or helper methods) and classes (even if all test methods are failing)
are preserved.

If AST processing fails (for example, because the test file cannot be parsed),
this function falls back on the existing regex-based filtering (filter_passing_tests).

Returns:
Tuple containing:
- Modified test content (as a string) containing only passing tests.
- List of passing test names.
- List of failing test names.
"""
try:
# Attempt to parse the test file using the AST.
tree = ast.parse(test_content)

# Parse test results using the appropriate parser.
parser = MAP_REPO_TO_PARSER.get(repo, parse_log_pytest)
test_results = parser(test_output)
passing_tests = [
name
for name, status in test_results.items()
if status == TestStatus.PASSED.value
]
failing_tests = [
name
for name, status in test_results.items()
if status != TestStatus.PASSED.value
]

# Helper function to decide if a test name should be considered failing.
def is_failing(name: str) -> bool:
for ft in failing_tests:
if name in ft or ft in name:
return True
return False

new_body = []
for node in tree.body:
# For top-level function definitions, only filter those that look like tests.
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
if node.name.startswith('test') and is_failing(node.name):
continue
new_body.append(node)
# For classes, filter out failing test methods but preserve other methods (e.g. setup).
elif isinstance(node, ast.ClassDef):
new_class_body = []
for subnode in node.body:
if isinstance(subnode, (ast.FunctionDef, ast.AsyncFunctionDef)):
# Only consider filtering if the method is a test.
qualified_name = f'{node.name}.{subnode.name}'
if is_failing(subnode.name) or is_failing(qualified_name):
continue
new_class_body.append(subnode)
else:
new_class_body.append(subnode)
# Always include the class even if no test methods remain, as it might contain
# setup, teardown, or other necessary logic.
node.body = new_class_body
new_body.append(node)
else:
new_body.append(node)

tree.body = new_body

# Reconstruct the source code from the filtered AST.
# (Requires Python 3.9+ for ast.unparse; otherwise an exception will trigger the fallback.)
new_test_content = ast.unparse(tree)
return new_test_content, passing_tests, failing_tests

except Exception:
print('AST processing failed; falling back on regex-based filtering.')
# If AST processing fails for any reason, fall back on the original regex-based filtering.
return filter_passing_tests(test_content, test_output, repo)
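
As a self-contained illustration of the AST-based idea (standard library only, without the module's log parsers), the following sketch drops a failing top-level test and a failing test method while keeping helpers and setup code:

# Standalone sketch of the AST filtering idea; assumes Python 3.9+ for ast.unparse.
import ast

source = '''
def helper():
    return 42

def test_ok():
    assert helper() == 42

class TestMath:
    def setup_method(self):
        self.x = 1

    def test_bad(self):
        assert self.x == 2
'''

failing = {'test_bad'}

def is_failing_test(node) -> bool:
    return (isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
            and node.name.startswith('test')
            and node.name in failing)

tree = ast.parse(source)
for node in tree.body:
    if isinstance(node, ast.ClassDef):
        # Drop failing test methods but keep setup/helper methods.
        node.body = [n for n in node.body if not is_failing_test(n)] or [ast.Pass()]
tree.body = [n for n in tree.body if not is_failing_test(n)]

print(ast.unparse(tree))  # test_bad is gone; helper, test_ok and setup_method remain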
1 change: 1 addition & 0 deletions evaluation/benchmarks/testgeneval/utils.py
@@ -28,6 +28,7 @@ def get_test_directives(instance: TestGenEvalInstance) -> list:

# For Django tests, remove extension + "tests/" prefix and convert slashes to dots (module referencing)
if instance['repo'] == 'django/django':
directives = [instance['test_file']]
directives_transformed = []
for d in directives:
d = d[: -len('.py')] if d.endswith('.py') else d
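
The hunk is truncated here, but the comment above describes the intended transformation: path-to-module conversion for Django test files. A hypothetical standalone sketch of that conversion (not the PR's code) might look like this:

# Hypothetical helper mirroring the Django directive conversion described above:
# strip the '.py' extension, drop a leading 'tests/' prefix, and turn path
# separators into dots for module-style referencing.
def to_django_directive(test_file: str) -> str:
    d = test_file[: -len('.py')] if test_file.endswith('.py') else test_file
    if d.startswith('tests/'):
        d = d[len('tests/'):]
    return d.replace('/', '.')

print(to_django_directive('tests/model_fields/test_integerfield.py'))  # model_fields.test_integerfield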