
Commit df0dc34

Merge pull request #101 from commit-0/analysis_in_docs
submissions analysis + webpage rendering
2 parents 1b7780d + fc70756 commit df0dc34

6 files changed: +198 -45 lines changed

docs/README.md (+24)

```diff
@@ -0,0 +1,24 @@
+
+Update the HF dataset, then run:
+```
+python docs/update_submissions_dataset.py
+```
+
+Run the submissions analysis on SPLIT (the setup flags below are needed only once, when first setting up the environment):
+```
+python docs/render_submissions.py \
+    --do_setup --get_blank_details --get_reference_details \
+    --analyze_submissions \
+    --split SPLIT
+```
+
+Render webpages for the submissions:
+```
+python docs/render_submissions.py --render_webpages --overwrite_previous_eval
+```
+
+Deploy to the website:
+```
+cd ../commit-0.github.io
+mkdocs gh-deploy --config-file ../commit0/mkdocs.yml --remote-branch main
+```
```

docs/javascripts/tablesort.js (+19 -5)

```diff
@@ -1,6 +1,20 @@
 document$.subscribe(function() {
-  var tables = document.querySelectorAll("article table:not([class])")
-  tables.forEach(function(table) {
-    new Tablesort(table)
-  })
-})
+  var tables = document.querySelectorAll("article table:not([class])")
+  tables.forEach(function(table) {
+    new Tablesort(table);
+    // Automatically sort the table by the specified column
+    var defaultSortColumn = 2; // Index of the column to sort (0-based)
+    var isAscending = false; // Set to false for descending order
+
+    // Delay to ensure Tablesort is fully initialized
+    setTimeout(function () {
+      var header = table.querySelectorAll("thead th")[defaultSortColumn];
+      if (header) {
+        header.click(); // Simulate a click on the header
+        if (!isAscending) {
+          header.click(); // Click again for descending order
+        }
+      }
+    }, 100);
+  });
+});
```

docs/javascripts/tablesort.number.js (+26)

```diff
@@ -0,0 +1,26 @@
+(function(){
+  var cleanNumber = function(i) {
+      return i.replace(/[^\-?0-9.]/g, '');
+    },
+
+    compareNumber = function(a, b) {
+      a = parseFloat(a);
+      b = parseFloat(b);
+
+      a = isNaN(a) ? 0 : a;
+      b = isNaN(b) ? 0 : b;
+
+      return a - b;
+    };
+
+  Tablesort.extend('number', function(item) {
+    return item.match(/^[-+]?[£\x24Û¢´]?\d+\s*([,\.]\d{0,2})/) || // Prefixed currency
+      item.match(/^[-+]?\d+\s*([,\.]\d{0,2})?[£\x24Û¢´]/) || // Suffixed currency
+      item.match(/^[-+]?(\d)*-?([,\.]){0,1}-?(\d)+([E,e][\-+][\d]+)?%?$/); // Number
+  }, function(a, b) {
+    a = cleanNumber(a);
+    b = cleanNumber(b);
+
+    return compareNumber(b, a);
+  });
+}());
```
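For intuition only (this helper is not part of the commit), the number sorter can be mirrored in Python: cells such as "42.17%" or "$1,234" are stripped down to a bare numeral and compared as floats, with unparseable cells counting as zero.

```python
import re

def clean_number(cell: str) -> str:
    # Mirrors the JS cleanNumber: keep only digits, minus signs, and dots.
    return re.sub(r"[^-?0-9.]", "", cell)

def compare_number(a: str, b: str) -> float:
    # Mirrors the JS compareNumber: parseFloat-like, with NaN treated as 0.
    def to_float(x: str) -> float:
        try:
            return float(x)
        except ValueError:
            return 0.0
    return to_float(a) - to_float(b)

print(clean_number("42.17%"))            # "42.17"
print(clean_number("$1,234"))            # "1234"
print(compare_number("42.17", "55.03"))  # negative: 42.17 orders below 55.03
```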

docs/render_submissions.py (+96 -32)

```diff
@@ -13,10 +13,12 @@
 from transformers import AutoTokenizer
 
 from commit0.harness.constants import SPLIT
+from commit0.harness.get_pytest_ids import main as get_tests
 from commit0.harness.utils import clone_repo
 from commit0.cli import write_commit0_config_file
 
 import logging
+from typing import Any, NoReturn
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -26,9 +28,13 @@
 analysis_files_path = "/share/rush/commit0_analysis_temp"
 
 
-def get_pytest_info(path_to_logs, repo_name, branch_name):
+def get_pytest_info(
+    path_to_logs: str, repo_name: str, branch_name: str
+) -> dict[str, dict[str, Any]]:
     pytest_info = {}
     for pytest_hash in os.listdir(path_to_logs):
+        if not os.path.exists(os.path.join(path_to_logs, pytest_hash, "eval.sh")):
+            continue
         eval_script = open(os.path.join(path_to_logs, pytest_hash, "eval.sh")).read()
         testname = re.search(r"([\S]+) > test_output", eval_script).group(1)
         patch_diff = open(os.path.join(path_to_logs, pytest_hash, "patch.diff")).read()
@@ -84,19 +90,19 @@ def get_pytest_info(path_to_logs, repo_name, branch_name):
                 "failure_string": failure_string,
                 "duration": duration,
             }
-    return pytest_info
+    return pytest_info if len(pytest_info) else "Could not evaluate"
 
 
-def get_coverage_info(path_to_logs, repo_name, branch_name):
+def get_coverage_info(path_to_logs: str, repo_name: str, branch_name: str) -> Any:
     raise NotImplementedError
 
 
 def get_blank_repo_metrics(
-    blank_source_code_folder,
-    spec_filename,
+    blank_source_code_folder: str,
+    spec_filename: str,
     tokenizer,
     code_file_filter=lambda filename: filename,
-):
+) -> dict[str, Any]:
     blank_repo_metrics = {
         "functions_to_edit": [],
     }
@@ -164,7 +170,7 @@ def get_blank_repo_metrics(
 
 
 leaderboard_header = """\n\n## Leaderboard ({split})
-| Name | Repos Resolved (/{num_repos}) | Total Tests Passed (/{total_num_tests}) | Test Duration (s) | Date | Analysis | Github |
+| Name | Repos Resolved (/{num_repos}) | Avg. pass rate | Test Duration (s) | Date | Analysis | Github |
 |------|:-------------------------:|:--------------------:|:--------------------:|:----------:|----|----| """
 
 submission_table_header = """# Submission Name: **{display_name}** (split: {split})
@@ -178,33 +184,44 @@ def get_blank_repo_metrics(
 """
 
 
-def render_mds(overwrite_previous, subfolder="docs"):
+def render_mds(overwrite_previous: bool, subfolder: str = "docs") -> NoReturn:
     leaderboard = {}
 
     split_to_total_tests = {
         "lite": 3628,
         "all": 140926,
     } # hard-coded to skip running it later
-    for split in tqdm.tqdm(["lite", "all"]):
+    for split in ["lite", "all"]:
         num_repos = len(SPLIT[split])
         # total_num_tests = 0
         # for repo_name in SPLIT[split]:
         #     repo_tests = subprocess.run(['commit0', 'get-tests', repo_name], capture_output=True, text=True).stdout.strip()
         #     total_num_tests += len(repo_tests.splitlines())
-        leaderboard[split] = leaderboard_header.format(
-            split=split,
-            num_repos=num_repos,
-            total_num_tests=split_to_total_tests[split],
+        leaderboard[split] = []
+        leaderboard[split].append(
+            (
+                split_to_total_tests[split] + 1,
+                leaderboard_header.format(
+                    split=split,
+                    num_repos=num_repos,
+                    total_num_tests=split_to_total_tests[split],
+                ),
+            )
         )
 
     for org_path in tqdm.tqdm(glob.glob(os.path.join(analysis_files_path, "*"))):
         org_name = os.path.basename(org_path)
         if org_name in {"blank", "repos", "submission_repos"}:
             continue
         for branch_path in glob.glob(os.path.join(org_path, "*.json")):
-            cum_tests_passed = 0
+            evaluate_numbers = []
+            lite_evaluate_numbers = []
+            # cum_tests_passed = 0
             repos_resolved = 0
             total_duration = 0.0
+            # lite_cum_tests_passed = 0
+            lite_repos_resolved = 0
+            lite_total_duration = 0.0
             branch_metrics = json.load(open(branch_path))
             submission_info = branch_metrics["submission_info"]
             split = submission_info["split"]
@@ -234,7 +251,7 @@ def render_mds(overwrite_previous, subfolder="docs"):
                     subfolder, f"analysis_{org_name}_{branch_name}_{repo_name}.md"
                 )
                 if isinstance(repo_pytest_results, str):
-                    submission_repo_page = f"# **{display_name}**: {repo_name}\n\n## Failed to clone\n\n{repo_pytest_results}"
+                    submission_repo_page = f"# **{display_name}**: {repo_name}\n\n## Failed\n\n{repo_pytest_results}"
                     org_branch_repo_filepath = os.path.join(
                         subfolder, f"analysis_{org_name}_{branch_name}_{repo_name}.md"
                     )
@@ -246,7 +263,7 @@ def render_mds(overwrite_previous, subfolder="docs"):
                     submission_page = submission_table_header.format(
                         display_name=display_name, split=split
                     ) + (
-                        f"\n| {repo_name} | No; Failed to clone. | - | - | "
+                        f"\n| {repo_name} | No; {repo_pytest_results} | - | - | "
                         f"[Analysis](/{f'analysis_{org_name}_{branch_name}_{repo_name}'}) | "
                        f"[Github]({github_hyperlink}) |"
                     )
@@ -267,13 +284,23 @@ def render_mds(overwrite_previous, subfolder="docs"):
                     )
                     pytest_details = "Pytest failed"
                     duration = "Failed."
+                    evaluate_numbers.append(0.0)
+                    if split == "all" and repo_name in SPLIT["lite"]:
+                        lite_evaluate_numbers.append(0.0)
                 else:
                     resolved = False
                     if "passed" in pytest_info["summary"]:
                         if "skipped" in pytest_info["summary"]:
-                            resolved = pytest_info["summary"]["passed"] + pytest_info["summary"]["skipped"] == pytest_info["summary"]["total"]
+                            resolved = (
+                                pytest_info["summary"]["passed"]
+                                + pytest_info["summary"]["skipped"]
+                                == pytest_info["summary"]["total"]
+                            )
                         else:
-                            resolved = pytest_info["summary"]["passed"] == pytest_info["summary"]["total"]
+                            resolved = (
+                                pytest_info["summary"]["passed"]
+                                == pytest_info["summary"]["total"]
+                            )
                     if write_submission:
                         submission_repo_page += pytest_summary_table_header.format(
                             pytest_group=pytest_group
@@ -295,9 +322,21 @@ def render_mds(overwrite_previous, subfolder="docs"):
                             f"### {shortened_testname}\n\n<details><summary> <pre>{shortened_testname}"
                             f"</pre></summary><pre>\n{failure['failure_string']}\n</pre>\n</details>\n"
                         )
-                    cum_tests_passed += pytest_info["summary"]["passed"]
+                    # cum_tests_passed += pytest_info["summary"]["passed"]
+                    num_tests = len(get_tests(repo_name, verbose=0))
+                    evaluate_numbers.append(
+                        pytest_info["summary"]["passed"] / num_tests
+                    )
                     total_duration += pytest_info["duration"]
                     repos_resolved += int(resolved)
+                    if split == "all" and repo_name in SPLIT["lite"]:
+                        lite_evaluate_numbers.append(
+                            pytest_info["summary"]["passed"] / num_tests
+                        )
+                        # lite_cum_tests_passed += pytest_info["summary"]["passed"]
+                        lite_total_duration += pytest_info["duration"]
+                        lite_repos_resolved += int(resolved)
+
                 if write_submission:
                     pytest_details = f"{pytest_info['summary']['passed']} / {pytest_info['summary']['total']}"
                     duration = f"{pytest_info['duration']:.2f}"
@@ -322,22 +361,46 @@ def render_mds(overwrite_previous, subfolder="docs"):
                 wf.write(back_button + "\n" + submission_page)
             analysis_link = f"[Analysis](/{f'analysis_{org_name}_{branch_name}'})"
             github_link = f"[Github]({project_page_link})"
-            leaderboard[split] += (
-                f"\n|{display_name}|"
-                f"{repos_resolved}|"
-                f"{cum_tests_passed}|"
-                f"{total_duration:.2f}|"
-                f"{submission_date}|"
-                f"{analysis_link}|"
-                f"{github_link}|"
+            avg_pass_rate = sum(evaluate_numbers) / len(evaluate_numbers)
+            leaderboard[split].append(
+                (
+                    avg_pass_rate * 100,
+                    f"\n|{display_name}|"
+                    f"{repos_resolved}|"
+                    f"{avg_pass_rate*100:.2f}%|"
+                    f"{total_duration:.2f}|"
+                    f"{submission_date}|"
+                    f"{analysis_link}|"
+                    f"{github_link}|",
+                )
             )
+            if (split == "all") and ("Reference (Gold)" not in display_name):
+                avg_lite_pass_rate = sum(lite_evaluate_numbers) / len(
+                    lite_evaluate_numbers
+                )
+                leaderboard["lite"].append(
+                    (
+                        avg_lite_pass_rate * 100,
+                        f"\n|{display_name} (subset of `all`)|"
+                        f"{lite_repos_resolved}|"
+                        f"{avg_lite_pass_rate*100:.2f}%|"
+                        f"{lite_total_duration:.2f}|"
+                        f"{submission_date}|"
+                        f"{analysis_link}|"
+                        f"{github_link}|",
+                    )
+                )
 
     leaderboard_filepath = os.path.join(subfolder, "analysis.md")
+    for split in ["lite", "all"]:
+        leaderboard[split] = sorted(leaderboard[split], key=lambda elt: -elt[0])
     with open(leaderboard_filepath, "w") as wf:
-        wf.write(leaderboard["lite"] + "\n\n" + leaderboard["all"])
+        lite_leaderboard_string = "".join(string for (_, string) in leaderboard["lite"])
+        all_leaderboard_string = "".join(string for (_, string) in leaderboard["all"])
+        wf.write(lite_leaderboard_string + "\n\n" + all_leaderboard_string)
 
 
-def get_args():
+def get_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--do_setup", action="store_true", help="Run commit0 setup with specified split"
@@ -366,14 +429,14 @@ def get_args():
     parser.add_argument(
         "--overwrite_previous_eval",
         action="store_true",
-        help="Overwrite cached pytest info"
+        help="Overwrite cached pytest info",
         # TODO add finer granularity so can specify which ones to overwrite
     )
 
     return parser.parse_args()
 
 
-def main(args):
+def main(args: argparse.Namespace) -> NoReturn:
     global analysis_files_path
 
     commit0_dataset_name = "wentingzhao/commit0_combined"
@@ -493,6 +556,7 @@ def main(args):
             )
             if os.path.exists(submission_repos_path):
                 shutil.rmtree(submission_repos_path)
+                print(f"Removed existing at {submission_repos_path}")
             os.makedirs(os.path.join(analysis_files_path, org_name), exist_ok=True)
             commit0_config_file = os.path.join(
                 analysis_files_path,
@@ -530,7 +594,7 @@ def main(args):
             )
             # run pytests
             os.system(
-                f"commit0 evaluate --branch {branch_name} "
+                f"commit0 evaluate --branch {branch_name} --timeout 1800 "
                 f"--commit0-config-file {commit0_config_file}"
             )
             for example in dataset:
```
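The core change above replaces the per-split markdown string with a list of `(sort_key, markdown_row)` tuples, so rows can be ordered by average pass rate before the table is written out; the header row gets a sentinel key (`split_to_total_tests[split] + 1`) larger than any percentage, so it stays on top after the descending sort. A minimal standalone sketch of that pattern, with invented scores for illustration:

```python
# Standalone sketch of the (sort_key, markdown_row) leaderboard pattern.
# Scores here are made up; in the real script the key is avg_pass_rate * 100.
header_sentinel = 3628 + 1  # exceeds any percentage key, so the header sorts first
leaderboard = [
    (42.17, "\n|Submission A|7|42.17%|"),
    (header_sentinel, "\n| Name | Repos Resolved | Avg. pass rate |\n|---|---|---|"),
    (55.03, "\n|Submission B|11|55.03%|"),
]
leaderboard = sorted(leaderboard, key=lambda elt: -elt[0])  # descending by key
print("".join(row for (_, row) in leaderboard))  # header, then 55.03%, then 42.17%
```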

docs/update_submissions_dataset.py (+31 -7)

```diff
@@ -1,12 +1,36 @@
 from datasets import Dataset
 
 submissions = {
-    "org_name": ["test-save-commit0", "commit0-lite-with-test", "commit0-lite-plain", "commit0-all-plain"],
-    "branch": ["baseline", "fillin", "fillin", "fillin"],
-    "display_name": ["Claude Sonnet 3.5 - Base", "Claude Sonnet 3.5 - Fill-in + Unit Test Feedback", "Claude Sonnet 3.5 - Fill-in", "Claude Sonnet 3.5 - Fill-in"],
-    "submission_date": ["09/25/2024", "09/25/2024", "09/25/2024", "09/25/2024"],
-    "split": ["lite", "lite", "lite", "all"],
-    "project_page": ["https://github.com/test-save-commit0", "https://github.com/commit0-lite-with-test", "https://github.com/commit0-lite-plain", "https://github.com/commit0-all-plain"]
+    "org_name": [
+        "test-save-commit0",
+        "commit0-fillin",
+        "commit0-lite-test",
+        "openhands-commit0",
+        "sweagent-commit0",
+    ],
+    "branch": ["baseline", "sonnet", "sonnet", "openhands", "sweagent"],
+    "display_name": [
+        "Claude Sonnet 3.5 - Base",
+        "Claude Sonnet 3.5 - Fill-in",
+        "Claude Sonnet 3.5 - Fill-in + Lint & Unit Test Feedback",
+        "OpenHands",
+        "SWE-Agent",
+    ],
+    "submission_date": [
+        "09/25/2024",
+        "09/25/2024",
+        "09/25/2024",
+        "11/25/2024",
+        "11/26/2024",
+    ],
+    "split": ["lite", "all", "lite", "all", "lite"],
+    "project_page": [
+        "https://github.com/test-save-commit0",
+        "https://github.com/commit0-fillin",
+        "https://github.com/commit0-lite-test",
+        "https://github.com/openhands-commit0",
+        "https://github.com/sweagent-commit0",
+    ],
 }
 
-Dataset.from_dict(submissions).push_to_hub("celinelee/commit0_submissions")
+Dataset.from_dict(submissions).push_to_hub("celinelee/commit0_submissions")
```
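To sanity-check a push like this, the dataset can be loaded straight back from the Hub. A minimal sketch using the standard `datasets` API, assuming the default "train" split name that `push_to_hub` assigns when a single `Dataset` is pushed:

```python
from datasets import load_dataset

# Load the submissions table back from the Hub to verify the push.
ds = load_dataset("celinelee/commit0_submissions", split="train")
print(ds.column_names)  # org_name, branch, display_name, submission_date, split, project_page
for row in ds:
    print(row["display_name"], row["split"], row["submission_date"])
```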
