
Commit df0dc34

Merge pull request #101 from commit-0/analysis_in_docs
submissions analysis + webpage rendering
2 parents 1b7780d + fc70756 commit df0dc34

6 files changed: +198 -45 lines changed

docs/README.md (+24)

```diff
@@ -0,0 +1,24 @@
+
+Update the HF dataset, then run:
+```
+python docs/update_submissions_dataset.py
+```
+
+Run the submissions analysis on SPLIT (the setup flags below are needed only once, when first setting up the environment):
+```
+python docs/render_submissions.py \
+    --do_setup --get_blank_details --get_reference_details \
+    --analyze_submissions \
+    --split SPLIT
+```
+
+Render webpages for the submissions:
+```
+python docs/render_submissions.py --render_webpages --overwrite_previous_eval
+```
+
+Deploy to the website:
+```
+cd ../commit-0.github.io
+mkdocs gh-deploy --config-file ../commit0/mkdocs.yml --remote-branch main
+```
```

docs/javascripts/tablesort.js (+19 -5)

```diff
@@ -1,6 +1,20 @@
 document$.subscribe(function() {
-  var tables = document.querySelectorAll("article table:not([class])")
-  tables.forEach(function(table) {
-    new Tablesort(table)
-  })
-})
+  var tables = document.querySelectorAll("article table:not([class])")
+  tables.forEach(function(table) {
+    new Tablesort(table);
+    // Automatically sort the table by the specified column
+    var defaultSortColumn = 2; // Index of the column to sort (0-based)
+    var isAscending = false; // Set to false for descending order
+
+    // Delay to ensure Tablesort is fully initialized
+    setTimeout(function () {
+      var header = table.querySelectorAll("thead th")[defaultSortColumn];
+      if (header) {
+        header.click(); // Simulate a click on the header
+        if (!isAscending) {
+          header.click(); // Click again for descending order
+        }
+      }
+    }, 100);
+  });
+});
```

docs/javascripts/tablesort.number.js (+26)

```diff
@@ -0,0 +1,26 @@
+(function(){
+  var cleanNumber = function(i) {
+      return i.replace(/[^\-?0-9.]/g, '');
+    },
+
+    compareNumber = function(a, b) {
+      a = parseFloat(a);
+      b = parseFloat(b);
+
+      a = isNaN(a) ? 0 : a;
+      b = isNaN(b) ? 0 : b;
+
+      return a - b;
+    };
+
+  Tablesort.extend('number', function(item) {
+    return item.match(/^[-+]?[£\x24Û¢´]?\d+\s*([,\.]\d{0,2})/) || // Prefixed currency
+      item.match(/^[-+]?\d+\s*([,\.]\d{0,2})?[£\x24Û¢´]/) || // Suffixed currency
+      item.match(/^[-+]?(\d)*-?([,\.]){0,1}-?(\d)+([E,e][\-+][\d]+)?%?$/); // Number
+  }, function(a, b) {
+    a = cleanNumber(a);
+    b = cleanNumber(b);
+
+    return compareNumber(b, a);
+  });
+}());
```
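For intuition only (this helper is not part of the commit), the number sorter can be mirrored in Python: cells such as "42.17%" or "$1,234" are stripped down to a bare numeral and compared as floats, with unparseable cells counting as zero.

```python
import re

def clean_number(cell: str) -> str:
    # Mirrors the JS cleanNumber: keep only digits, minus signs, and dots.
    return re.sub(r"[^-?0-9.]", "", cell)

def compare_number(a: str, b: str) -> float:
    # Mirrors the JS compareNumber: parseFloat-like, with NaN treated as 0.
    def to_float(x: str) -> float:
        try:
            return float(x)
        except ValueError:
            return 0.0
    return to_float(a) - to_float(b)

print(clean_number("42.17%"))            # "42.17"
print(clean_number("$1,234"))            # "1234"
print(compare_number("42.17", "55.03"))  # negative: 42.17 orders below 55.03
```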

docs/render_submissions.py (+96 -32)

```diff
@@ -13,10 +13,12 @@
 from transformers import AutoTokenizer
 
 from commit0.harness.constants import SPLIT
+from commit0.harness.get_pytest_ids import main as get_tests
 from commit0.harness.utils import clone_repo
 from commit0.cli import write_commit0_config_file
 
 import logging
+from typing import Any, NoReturn
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -26,9 +28,13 @@
 analysis_files_path = "/share/rush/commit0_analysis_temp"
 
 
-def get_pytest_info(path_to_logs, repo_name, branch_name):
+def get_pytest_info(
+    path_to_logs: str, repo_name: str, branch_name: str
+) -> dict[str, dict[str, Any]]:
     pytest_info = {}
     for pytest_hash in os.listdir(path_to_logs):
+        if not os.path.exists(os.path.join(path_to_logs, pytest_hash, "eval.sh")):
+            continue
         eval_script = open(os.path.join(path_to_logs, pytest_hash, "eval.sh")).read()
         testname = re.search(r"([\S]+) > test_output", eval_script).group(1)
         patch_diff = open(os.path.join(path_to_logs, pytest_hash, "patch.diff")).read()
@@ -84,19 +90,19 @@ def get_pytest_info(path_to_logs, repo_name, branch_name):
                 "failure_string": failure_string,
                 "duration": duration,
             }
-    return pytest_info
+    return pytest_info if len(pytest_info) else "Could not evaluate"
 
 
-def get_coverage_info(path_to_logs, repo_name, branch_name):
+def get_coverage_info(path_to_logs: str, repo_name: str, branch_name: str) -> Any:
     raise NotImplementedError
 
 
 def get_blank_repo_metrics(
-    blank_source_code_folder,
-    spec_filename,
+    blank_source_code_folder: str,
+    spec_filename: str,
     tokenizer,
     code_file_filter=lambda filename: filename,
-):
+) -> dict[str, Any]:
     blank_repo_metrics = {
         "functions_to_edit": [],
     }
@@ -164,7 +170,7 @@ def get_blank_repo_metrics(
 
 
 leaderboard_header = """\n\n## Leaderboard ({split})
-| Name | Repos Resolved (/{num_repos}) | Total Tests Passed (/{total_num_tests}) | Test Duration (s) | Date | Analysis | Github |
+| Name | Repos Resolved (/{num_repos}) | Avg. pass rate | Test Duration (s) | Date | Analysis | Github |
 |------|:-------------------------:|:--------------------:|:--------------------:|:----------:|----|----| """
 
 submission_table_header = """# Submission Name: **{display_name}** (split: {split})
@@ -178,33 +184,44 @@ def get_blank_repo_metrics(
 """
 
 
-def render_mds(overwrite_previous, subfolder="docs"):
+def render_mds(overwrite_previous: bool, subfolder: str = "docs") -> NoReturn:
     leaderboard = {}
 
     split_to_total_tests = {
         "lite": 3628,
         "all": 140926,
     } # hard-coded to skip running it later
-    for split in tqdm.tqdm(["lite", "all"]):
+    for split in ["lite", "all"]:
         num_repos = len(SPLIT[split])
         # total_num_tests = 0
         # for repo_name in SPLIT[split]:
         #     repo_tests = subprocess.run(['commit0', 'get-tests', repo_name], capture_output=True, text=True).stdout.strip()
         #     total_num_tests += len(repo_tests.splitlines())
-        leaderboard[split] = leaderboard_header.format(
-            split=split,
-            num_repos=num_repos,
-            total_num_tests=split_to_total_tests[split],
+        leaderboard[split] = []
+        leaderboard[split].append(
+            (
+                split_to_total_tests[split] + 1,
+                leaderboard_header.format(
+                    split=split,
+                    num_repos=num_repos,
+                    total_num_tests=split_to_total_tests[split],
+                ),
+            )
         )
 
     for org_path in tqdm.tqdm(glob.glob(os.path.join(analysis_files_path, "*"))):
         org_name = os.path.basename(org_path)
         if org_name in {"blank", "repos", "submission_repos"}:
             continue
         for branch_path in glob.glob(os.path.join(org_path, "*.json")):
-            cum_tests_passed = 0
+            evaluate_numbers = []
+            lite_evaluate_numbers = []
+            # cum_tests_passed = 0
             repos_resolved = 0
             total_duration = 0.0
+            # lite_cum_tests_passed = 0
+            lite_repos_resolved = 0
+            lite_total_duration = 0.0
             branch_metrics = json.load(open(branch_path))
             submission_info = branch_metrics["submission_info"]
             split = submission_info["split"]
@@ -234,7 +251,7 @@ def render_mds(overwrite_previous, subfolder="docs"):
                     subfolder, f"analysis_{org_name}_{branch_name}_{repo_name}.md"
                 )
                 if isinstance(repo_pytest_results, str):
-                    submission_repo_page = f"# **{display_name}**: {repo_name}\n\n## Failed to clone\n\n{repo_pytest_results}"
+                    submission_repo_page = f"# **{display_name}**: {repo_name}\n\n## Failed\n\n{repo_pytest_results}"
                     org_branch_repo_filepath = os.path.join(
                         subfolder, f"analysis_{org_name}_{branch_name}_{repo_name}.md"
                     )
@@ -246,7 +263,7 @@ def render_mds(overwrite_previous, subfolder="docs"):
                     submission_page = submission_table_header.format(
                         display_name=display_name, split=split
                     ) + (
-                        f"\n| {repo_name} | No; Failed to clone. | - | - | "
+                        f"\n| {repo_name} | No; {repo_pytest_results} | - | - | "
                         f"[Analysis](/{f'analysis_{org_name}_{branch_name}_{repo_name}'}) | "
                        f"[Github]({github_hyperlink}) |"
                     )
@@ -267,13 +284,23 @@ def render_mds(overwrite_previous, subfolder="docs"):
                     )
                     pytest_details = "Pytest failed"
                     duration = "Failed."
+                    evaluate_numbers.append(0.0)
+                    if split == "all" and repo_name in SPLIT["lite"]:
+                        lite_evaluate_numbers.append(0.0)
                 else:
                     resolved = False
                     if "passed" in pytest_info["summary"]:
                         if "skipped" in pytest_info["summary"]:
-                            resolved = pytest_info["summary"]["passed"] + pytest_info["summary"]["skipped"] == pytest_info["summary"]["total"]
+                            resolved = (
+                                pytest_info["summary"]["passed"]
+                                + pytest_info["summary"]["skipped"]
+                                == pytest_info["summary"]["total"]
+                            )
                         else:
-                            resolved = pytest_info["summary"]["passed"] == pytest_info["summary"]["total"]
+                            resolved = (
+                                pytest_info["summary"]["passed"]
+                                == pytest_info["summary"]["total"]
+                            )
                     if write_submission:
                         submission_repo_page += pytest_summary_table_header.format(
                             pytest_group=pytest_group
@@ -295,9 +322,21 @@ def render_mds(overwrite_previous, subfolder="docs"):
                             f"### {shortened_testname}\n\n<details><summary> <pre>{shortened_testname}"
                             f"</pre></summary><pre>\n{failure['failure_string']}\n</pre>\n</details>\n"
                         )
-                    cum_tests_passed += pytest_info["summary"]["passed"]
+                    # cum_tests_passed += pytest_info["summary"]["passed"]
+                    num_tests = len(get_tests(repo_name, verbose=0))
+                    evaluate_numbers.append(
+                        pytest_info["summary"]["passed"] / num_tests
+                    )
                     total_duration += pytest_info["duration"]
                     repos_resolved += int(resolved)
+                    if split == "all" and repo_name in SPLIT["lite"]:
+                        lite_evaluate_numbers.append(
+                            pytest_info["summary"]["passed"] / num_tests
+                        )
+                        # lite_cum_tests_passed += pytest_info["summary"]["passed"]
+                        lite_total_duration += pytest_info["duration"]
+                        lite_repos_resolved += int(resolved)
+
                 if write_submission:
                     pytest_details = f"{pytest_info['summary']['passed']} / {pytest_info['summary']['total']}"
                     duration = f"{pytest_info['duration']:.2f}"
@@ -322,22 +361,46 @@ def render_mds(overwrite_previous, subfolder="docs"):
                 wf.write(back_button + "\n" + submission_page)
             analysis_link = f"[Analysis](/{f'analysis_{org_name}_{branch_name}'})"
             github_link = f"[Github]({project_page_link})"
-            leaderboard[split] += (
-                f"\n|{display_name}|"
-                f"{repos_resolved}|"
-                f"{cum_tests_passed}|"
-                f"{total_duration:.2f}|"
-                f"{submission_date}|"
-                f"{analysis_link}|"
-                f"{github_link}|"
+            avg_pass_rate = sum(evaluate_numbers) / len(evaluate_numbers)
+            leaderboard[split].append(
+                (
+                    avg_pass_rate * 100,
+                    f"\n|{display_name}|"
+                    f"{repos_resolved}|"
+                    f"{avg_pass_rate*100:.2f}%|"
+                    f"{total_duration:.2f}|"
+                    f"{submission_date}|"
+                    f"{analysis_link}|"
+                    f"{github_link}|",
+                )
             )
+            if (split == "all") and ("Reference (Gold)" not in display_name):
+                avg_lite_pass_rate = sum(lite_evaluate_numbers) / len(
+                    lite_evaluate_numbers
+                )
+                leaderboard["lite"].append(
+                    (
+                        avg_lite_pass_rate * 100,
+                        f"\n|{display_name} (subset of `all`)|"
+                        f"{lite_repos_resolved}|"
+                        f"{avg_lite_pass_rate*100:.2f}%|"
+                        f"{lite_total_duration:.2f}|"
+                        f"{submission_date}|"
+                        f"{analysis_link}|"
+                        f"{github_link}|",
+                    )
+                )
 
     leaderboard_filepath = os.path.join(subfolder, "analysis.md")
+    for split in ["lite", "all"]:
+        leaderboard[split] = sorted(leaderboard[split], key=lambda elt: -elt[0])
     with open(leaderboard_filepath, "w") as wf:
-        wf.write(leaderboard["lite"] + "\n\n" + leaderboard["all"])
+        lite_leaderboard_string = "".join(string for (_, string) in leaderboard["lite"])
+        all_leaderboard_string = "".join(string for (_, string) in leaderboard["all"])
+        wf.write(lite_leaderboard_string + "\n\n" + all_leaderboard_string)
 
 
-def get_args():
+def get_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--do_setup", action="store_true", help="Run commit0 setup with specified split"
@@ -366,14 +429,14 @@ def get_args():
     parser.add_argument(
         "--overwrite_previous_eval",
         action="store_true",
-        help="Overwrite cached pytest info"
+        help="Overwrite cached pytest info",
         # TODO add finer granularity so can specify which ones to overwrite
     )
 
     return parser.parse_args()
 
 
-def main(args):
+def main(args: argparse.Namespace) -> NoReturn:
     global analysis_files_path
 
     commit0_dataset_name = "wentingzhao/commit0_combined"
@@ -493,6 +556,7 @@ def main(args):
             )
             if os.path.exists(submission_repos_path):
                 shutil.rmtree(submission_repos_path)
+                print(f"Removed existing at {submission_repos_path}")
             os.makedirs(os.path.join(analysis_files_path, org_name), exist_ok=True)
             commit0_config_file = os.path.join(
                 analysis_files_path,
@@ -530,7 +594,7 @@ def main(args):
             )
             # run pytests
             os.system(
-                f"commit0 evaluate --branch {branch_name} "
+                f"commit0 evaluate --branch {branch_name} --timeout 1800 "
                 f"--commit0-config-file {commit0_config_file}"
             )
             for example in dataset:
```
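The core change above replaces the per-split markdown string with a list of `(sort_key, markdown_row)` tuples, so rows can be ordered by average pass rate before the table is written out; the header row gets a sentinel key (`split_to_total_tests[split] + 1`) larger than any percentage, so it stays on top after the descending sort. A minimal standalone sketch of that pattern, with invented scores for illustration:

```python
# Standalone sketch of the (sort_key, markdown_row) leaderboard pattern.
# Scores here are made up; in the real script the key is avg_pass_rate * 100.
header_sentinel = 3628 + 1  # exceeds any percentage key, so the header sorts first
leaderboard = [
    (42.17, "\n|Submission A|7|42.17%|"),
    (header_sentinel, "\n| Name | Repos Resolved | Avg. pass rate |\n|---|---|---|"),
    (55.03, "\n|Submission B|11|55.03%|"),
]
leaderboard = sorted(leaderboard, key=lambda elt: -elt[0])  # descending by key
print("".join(row for (_, row) in leaderboard))  # header, then 55.03%, then 42.17%
```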

docs/update_submissions_dataset.py (+31 -7)

```diff
@@ -1,12 +1,36 @@
 from datasets import Dataset
 
 submissions = {
-    "org_name": ["test-save-commit0", "commit0-lite-with-test", "commit0-lite-plain", "commit0-all-plain"],
-    "branch": ["baseline", "fillin", "fillin", "fillin"],
-    "display_name": ["Claude Sonnet 3.5 - Base", "Claude Sonnet 3.5 - Fill-in + Unit Test Feedback", "Claude Sonnet 3.5 - Fill-in", "Claude Sonnet 3.5 - Fill-in"],
-    "submission_date": ["09/25/2024", "09/25/2024", "09/25/2024", "09/25/2024"],
-    "split": ["lite", "lite", "lite", "all"],
-    "project_page": ["https://github.com/test-save-commit0", "https://github.com/commit0-lite-with-test", "https://github.com/commit0-lite-plain", "https://github.com/commit0-all-plain"]
+    "org_name": [
+        "test-save-commit0",
+        "commit0-fillin",
+        "commit0-lite-test",
+        "openhands-commit0",
+        "sweagent-commit0",
+    ],
+    "branch": ["baseline", "sonnet", "sonnet", "openhands", "sweagent"],
+    "display_name": [
+        "Claude Sonnet 3.5 - Base",
+        "Claude Sonnet 3.5 - Fill-in",
+        "Claude Sonnet 3.5 - Fill-in + Lint & Unit Test Feedback",
+        "OpenHands",
+        "SWE-Agent",
+    ],
+    "submission_date": [
+        "09/25/2024",
+        "09/25/2024",
+        "09/25/2024",
+        "11/25/2024",
+        "11/26/2024",
+    ],
+    "split": ["lite", "all", "lite", "all", "lite"],
+    "project_page": [
+        "https://github.com/test-save-commit0",
+        "https://github.com/commit0-fillin",
+        "https://github.com/commit0-lite-test",
+        "https://github.com/openhands-commit0",
+        "https://github.com/sweagent-commit0",
+    ],
 }
 
-Dataset.from_dict(submissions).push_to_hub("celinelee/commit0_submissions")
+Dataset.from_dict(submissions).push_to_hub("celinelee/commit0_submissions")
```
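To sanity-check a push like this, the dataset can be loaded straight back from the Hub. A minimal sketch using the standard `datasets` API, assuming the default "train" split name that `push_to_hub` assigns when a single `Dataset` is pushed:

```python
from datasets import load_dataset

# Load the submissions table back from the Hub to verify the push.
ds = load_dataset("celinelee/commit0_submissions", split="train")
print(ds.column_names)  # org_name, branch, display_name, submission_date, split, project_page
for row in ds:
    print(row["display_name"], row["split"], row["submission_date"])
```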
