From 9c60d55e77c2f60c2f43d0dddfdbcd1e6e96bf46 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 5 Nov 2024 16:50:33 -0500 Subject: [PATCH 01/24] Preliminary dataset creation script --- scripts/build_failure_data_collection.py | 91 ++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 scripts/build_failure_data_collection.py diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py new file mode 100644 index 0000000000..2f266df998 --- /dev/null +++ b/scripts/build_failure_data_collection.py @@ -0,0 +1,91 @@ +import logging + +from tqdm import tqdm + +from bugbug import bugzilla, db, phabricator, repository + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def download_databases(): + logger.info("Downloading bugs database...") + assert db.download(bugzilla.BUGS_DB) + + logger.info("Downloading commits database...") + assert db.download(repository.COMMITS_DB, support_files_too=True) + + +def preprocess_commits_and_bugs(): + logger.info("Preprocessing commits and bugs...") + bug_commits = {} + + for commit in tqdm( + repository.get_commits( + include_no_bug=True, include_backouts=True, include_ignored=True + ) + ): + commit_data = { + key: commit[key] + for key in ["node", "bug_id", "pushdate", "backedoutby", "backsout"] + } + + bug_commits.setdefault(commit["bug_id"], []).append(commit_data) + + return bug_commits + + +def find_bugs(bug_commits): + for bug in bugzilla.get_bugs(include_invalid=True): + if caused_build_failure(bug["comments"]) and check_backed_out( + bug_commits.get(bug["id"], None) + ): + print(f"BUG: {bug["id"]}") + print(f"REVISIONS: {bugzilla.get_revision_ids(bug)}") + + +def caused_build_failure(comments): + for comment in comments: + if "backed out" in comment["text"] and "build" in comment["text"]: + return True + return False + + +def check_backed_out(commits): + if not commits: + return False + + for commit in commits: + if commit["backedoutby"]: + return True + return False + + +def load_bug_to_revisions(): + bug_to_revisions = {} + + for revision in phabricator.get_revisions(): + bug_id = revision["fields"].get("bugzilla.bug-id") + if bug_id is not None: + if bug_id not in bug_to_revisions: + bug_to_revisions[bug_id] = [] + bug_to_revisions[bug_id].append(revision) + return bug_to_revisions + + +def main(): + download_databases() + # bug_to_revisions = load_bug_to_revisions() + bug_commits = preprocess_commits_and_bugs() + find_bugs(bug_commits) + + +if __name__ == "__main__": + main() + + +# collect bugs with build failures X +# identify the patch that was backed out + caused build failure +# identify the patch that backs it out +# identify the patch after the background +# include the initial diff/patch, the error, and the interdiff between the initial diff and the fix diff From c3c98c00c80156fcd0c569b495d1ff47616b7b0e Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 6 Nov 2024 16:35:13 -0500 Subject: [PATCH 02/24] Added revision finder for backed out commits --- scripts/build_failure_data_collection.py | 96 +++++++++++++++++------- 1 file changed, 70 insertions(+), 26 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 2f266df998..de3ded1920 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -1,5 +1,7 @@ import logging +import re +from libmozdata.hgmozilla import Revision from tqdm import tqdm from bugbug import bugzilla, db, phabricator, repository @@ -35,13 +37,37 @@ def preprocess_commits_and_bugs(): return bug_commits -def find_bugs(bug_commits): +def preprocess_revisions(): + logger.info("Preprocessing revisions...") + diff_id_to_phid = {} + + for revision in phabricator.get_revisions(): + diff_id_to_phid[revision["id"]] = revision["phid"] + + return diff_id_to_phid + + +def find_bugs(bug_commits, hg_client): for bug in bugzilla.get_bugs(include_invalid=True): - if caused_build_failure(bug["comments"]) and check_backed_out( - bug_commits.get(bug["id"], None) - ): + if caused_build_failure(bug["comments"]): + backing_out_commit = find_backing_out_commit( + bug_commits.get(bug["id"], None), hg_client + ) + if not backing_out_commit: + continue + print(f"BUG: {bug["id"]}") - print(f"REVISIONS: {bugzilla.get_revision_ids(bug)}") + print(f"BACKING OUT COMMIT: {backing_out_commit['node']}") + print(f"BACKED OUT COMMIT: {backing_out_commit['backsout']}") + + desc = hg_client.get_revision("nightly", backing_out_commit["backsout"])[ + "desc" + ] + # print(f"DESCRIPTION OF BACKED OUT COMMIT: {desc}") + + print(f"PHABRICATOR REVISION ID: {extract_revision_id(desc)}") + + # backed_out_bugs.append(bug, backing_out_commit, ) def caused_build_failure(comments): @@ -51,41 +77,59 @@ def caused_build_failure(comments): return False -def check_backed_out(commits): +def find_backing_out_commit(commits, hg_client): if not commits: - return False + return None for commit in commits: - if commit["backedoutby"]: - return True - return False + if not commit["backsout"]: + continue + desc = hg_client.get_revision("nightly", commit["node"])["desc"] + if "backed out" in desc.lower() and "build" in desc.lower(): + return commit + return None -def load_bug_to_revisions(): - bug_to_revisions = {} - for revision in phabricator.get_revisions(): - bug_id = revision["fields"].get("bugzilla.bug-id") - if bug_id is not None: - if bug_id not in bug_to_revisions: - bug_to_revisions[bug_id] = [] - bug_to_revisions[bug_id].append(revision) - return bug_to_revisions +def extract_revision_id(desc): + match = re.search(r"https://phabricator\.services\.mozilla\.com/(D\d+)", desc) + if match: + return match.group(1) + return None def main(): download_databases() - # bug_to_revisions = load_bug_to_revisions() + bug_commits = preprocess_commits_and_bugs() - find_bugs(bug_commits) + # rev_id_to_phid = preprocess_revisions() + + hg_client = Revision() + + find_bugs(bug_commits, hg_client) + + # test = Revision() + # rev = (test.get_revision("nightly", "2e49f991daa3e6b8fb0c1f3ff803ab06b4ec45d6")) + # if "backed out" in rev["desc"].lower() and "build" in rev["desc"].lower(): + # print("yes") if __name__ == "__main__": main() +# collect bugs with build failures, along with a list of their revisions X +# identify commit that backs out another commit due to build failure X +# from above, we can also get the node of the commit that caused the backout X +# extract the revision ID from initial commit description +# use this to associate the commit nodes to the diff IDS +# figure out a way to find the fix patch + + +# once we have the initial and fix patch IDs, we can get the interdiff between them +# we can also get the error message from the initial patch, to find the exact lines .... + -# collect bugs with build failures X -# identify the patch that was backed out + caused build failure -# identify the patch that backs it out -# identify the patch after the background -# include the initial diff/patch, the error, and the interdiff between the initial diff and the fix diff +# find the commit that happened most recently after the backout --> this is a fix commit +# convert commit id to commit phid (this is a thing in the revision object in phabricator.get_revisions) +# associate commit phid with its revision in phab --> and then associate it with its patch id +# get interdiff between the initial patch and the fix patch From cb312286f3bb60c0d03e6a9ba24ee1609204d5d7 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 12 Nov 2024 09:28:47 -0500 Subject: [PATCH 03/24] Added return bug array --- scripts/build_failure_data_collection.py | 36 +++++++++++++----------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index de3ded1920..0883505348 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -1,4 +1,5 @@ import logging +import os import re from libmozdata.hgmozilla import Revision @@ -48,6 +49,8 @@ def preprocess_revisions(): def find_bugs(bug_commits, hg_client): + backed_out_bugs = [] + for bug in bugzilla.get_bugs(include_invalid=True): if caused_build_failure(bug["comments"]): backing_out_commit = find_backing_out_commit( @@ -56,18 +59,15 @@ def find_bugs(bug_commits, hg_client): if not backing_out_commit: continue - print(f"BUG: {bug["id"]}") - print(f"BACKING OUT COMMIT: {backing_out_commit['node']}") - print(f"BACKED OUT COMMIT: {backing_out_commit['backsout']}") - desc = hg_client.get_revision("nightly", backing_out_commit["backsout"])[ "desc" ] - # print(f"DESCRIPTION OF BACKED OUT COMMIT: {desc}") - print(f"PHABRICATOR REVISION ID: {extract_revision_id(desc)}") + revision_id = extract_revision_id(desc) - # backed_out_bugs.append(bug, backing_out_commit, ) + backed_out_bugs.append((bug, backing_out_commit, revision_id)) + + return backed_out_bugs def caused_build_failure(comments): @@ -106,24 +106,26 @@ def main(): hg_client = Revision() - find_bugs(bug_commits, hg_client) + bugs = find_bugs(bug_commits, hg_client) - # test = Revision() - # rev = (test.get_revision("nightly", "2e49f991daa3e6b8fb0c1f3ff803ab06b4ec45d6")) - # if "backed out" in rev["desc"].lower() and "build" in rev["desc"].lower(): - # print("yes") + for bug in bugs: + print(bug[2]) if __name__ == "__main__": - main() + # main() + phabricator.set_api_key( + os.getenv("PHABRICATOR_URL", "default_url"), + os.getenv("PHABRICATOR_API_KEY", "default_key"), + ) + print(phabricator.get_transactions("D128537")) # collect bugs with build failures, along with a list of their revisions X # identify commit that backs out another commit due to build failure X # from above, we can also get the node of the commit that caused the backout X -# extract the revision ID from initial commit description -# use this to associate the commit nodes to the diff IDS -# figure out a way to find the fix patch - +# extract the revision ID from initial commit description X +# use this to associate the commit nodes to the diff IDS --> find the commit before and after the backout -- check diff description? +# or alternatively, check when the reverting change was made (specifically for the build failure backout) --> get the diff before and after this timestamp with a desc with a commit to MOZILLACENTRAL # once we have the initial and fix patch IDs, we can get the interdiff between them # we can also get the error message from the initial patch, to find the exact lines .... From 23ebc087629a50e30269485282eab938a0d88890 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 12 Nov 2024 12:12:09 -0500 Subject: [PATCH 04/24] Added commit node finder, removed phabricator requirement --- scripts/build_failure_data_collection.py | 92 ++++++++++++++++-------- 1 file changed, 62 insertions(+), 30 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 0883505348..8a64acc607 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -1,8 +1,6 @@ import logging -import os -import re +from collections import defaultdict -from libmozdata.hgmozilla import Revision from tqdm import tqdm from bugbug import bugzilla, db, phabricator, repository @@ -18,6 +16,9 @@ def download_databases(): logger.info("Downloading commits database...") assert db.download(repository.COMMITS_DB, support_files_too=True) + logger.info("Downloading revisions database...") + assert db.download(phabricator.REVISIONS_DB, support_files_too=True) + def preprocess_commits_and_bugs(): logger.info("Preprocessing commits and bugs...") @@ -59,11 +60,13 @@ def find_bugs(bug_commits, hg_client): if not backing_out_commit: continue - desc = hg_client.get_revision("nightly", backing_out_commit["backsout"])[ - "desc" - ] + commit = {} + + commit["desc"] = hg_client.get_revision( + "nightly", backing_out_commit["backsout"] + )["desc"] - revision_id = extract_revision_id(desc) + revision_id = repository.get_revision_id(commit) backed_out_bugs.append((bug, backing_out_commit, revision_id)) @@ -91,39 +94,68 @@ def find_backing_out_commit(commits, hg_client): return None -def extract_revision_id(desc): - match = re.search(r"https://phabricator\.services\.mozilla\.com/(D\d+)", desc) - if match: - return match.group(1) - return None - - def main(): download_databases() - bug_commits = preprocess_commits_and_bugs() - # rev_id_to_phid = preprocess_revisions() - - hg_client = Revision() - - bugs = find_bugs(bug_commits, hg_client) - - for bug in bugs: - print(bug[2]) + # bug_commits = preprocess_commits_and_bugs() + + # hg_client = Revision() + + # bugs = find_bugs(bug_commits, hg_client) + + # # for bug in bugs: + # # print(bug[2]) + + backout_revisions = [ + 27904, + 30744, + 128537, + 127218, + 153067, + 157855, + 161229, + 164203, + 173115, + 174921, + 174086, + 175742, + 20409, + 58102, + 91663, + 205936, + 178686, + 208953, + 211415, + 211106, + 89590, + 214412, + 216163, + 26390, + 219250, + 215371, + ] + + revisions_to_commits = defaultdict(list) + + for commit in repository.get_commits(): + revision_id = repository.get_revision_id(commit) + + if revision_id in backout_revisions: + revisions_to_commits[revision_id].append(commit["node"]) + + for revision_id, commits in revisions_to_commits.items(): + print(f"Revision: {revision_id}: {commits}") if __name__ == "__main__": - # main() - phabricator.set_api_key( - os.getenv("PHABRICATOR_URL", "default_url"), - os.getenv("PHABRICATOR_API_KEY", "default_key"), - ) - print(phabricator.get_transactions("D128537")) - + main() # collect bugs with build failures, along with a list of their revisions X # identify commit that backs out another commit due to build failure X # from above, we can also get the node of the commit that caused the backout X # extract the revision ID from initial commit description X + +# ---- Find another commit with the same revision mentioned in the commit description + # use this to associate the commit nodes to the diff IDS --> find the commit before and after the backout -- check diff description? # or alternatively, check when the reverting change was made (specifically for the build failure backout) --> get the diff before and after this timestamp with a desc with a commit to MOZILLACENTRAL From 1fce46494756bc1b3c83667669b67a29f843f3ea Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 12 Nov 2024 15:48:54 -0500 Subject: [PATCH 05/24] Added hg diff functions --- bugbug/repository.py | 63 ++++++++++++++++++++++++ scripts/build_failure_data_collection.py | 13 ++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/bugbug/repository.py b/bugbug/repository.py index 341115b1c7..07d58c0031 100644 --- a/bugbug/repository.py +++ b/bugbug/repository.py @@ -1543,6 +1543,69 @@ def trigger_pull() -> None: trigger_pull() +def get_diff(repo_path, original_hash, fix_hash) -> bytes: + client = hglib.open(repo_path) + + current_rev = client.identify(id=True) + + try: + client.rawcommand([b"shelve"]) + except hglib.error.CommandError as e: + if b"nothing changed" in e.out: + logger.info(f"Nothing to shelve: {e}") + else: + raise RuntimeError("Error occurred while shelving") from e + + parents = client.parents(rev=fix_hash) + parent_of_fix = parents[0][1] + client.update(rev=parent_of_fix, clean=True) + + graft_result = graft( + client, revs=[original_hash], no_commit=True, force=True, tool=":merge" + ) + + if not graft_result: + return b"" + + final_diff = client.diff( + revs=[fix_hash], ignoreallspace=True, ignorespacechange=True, reverse=True + ) + + client.update(rev=current_rev, clean=True) + + return final_diff + + +def graft(client, revs, no_commit=False, force=False, tool=":merge") -> bool: + """Graft changesets specified by revs into the current repository state. + + Args: + client: The hglib client. + revs: A list of the hashes of the commits to be applied to the current repository state. + no_commit: If True, does not commit and just applies changes in working directory. + force: If True, forces the grafts even if the revs are ancestors of the current repository state. + tool: A string representing a merge tool (see `hg help merge-tools`). + + Returns: + Boolean of graft operation result (True for success, False for failure). + """ + args = hglib.util.cmdbuilder( + str.encode("graft"), r=revs, no_commit=no_commit, f=force, tool=tool + ) + + eh = hglib.util.reterrorhandler(args) + + client.rawcommand(args, eh=eh, prompt=auto_resolve_conflict_prompt) + + return True + + +def auto_resolve_conflict_prompt(max_bytes, current_output): + if b"was deleted in" in current_output: + return b"c\n" # Return 'c' to use the changed version + return b"\n" # Default to doing nothing, just proceed + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("repository_dir", help="Path to the repository", action="store") diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 8a64acc607..36a93d8b84 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -10,6 +10,9 @@ def download_databases(): + logger.info("Cloning Mercurial database...") + repository.clone(repo_dir="hg_dir") + logger.info("Downloading bugs database...") assert db.download(bugzilla.BUGS_DB) @@ -144,7 +147,15 @@ def main(): revisions_to_commits[revision_id].append(commit["node"]) for revision_id, commits in revisions_to_commits.items(): - print(f"Revision: {revision_id}: {commits}") + commit_diff = repository.get_diff( + repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[1] + ) + if not commit_diff: + continue + + commit_diff_encoded = commit_diff.decode("utf-8") + + print(commit_diff_encoded) if __name__ == "__main__": From 6a9830c0cb93e2dfda51fdaaac197cf50cba41b0 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 13 Nov 2024 10:15:59 -0500 Subject: [PATCH 06/24] CSV creation --- scripts/build_failure_data_collection.py | 34 ++++++++++++++++++------ 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 36a93d8b84..a337367fb1 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -1,3 +1,4 @@ +import csv import logging from collections import defaultdict @@ -146,16 +147,33 @@ def main(): if revision_id in backout_revisions: revisions_to_commits[revision_id].append(commit["node"]) - for revision_id, commits in revisions_to_commits.items(): - commit_diff = repository.get_diff( - repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[1] - ) - if not commit_diff: - continue + with open("revisions.csv", mode="w", newline="", encoding="utf-8") as file: + writer = csv.writer(file) + + writer.writerow(["Revision ID", "Initial Commit", "Fix Commit", "Interdiff"]) + + for revision_id, commits in revisions_to_commits.items(): + commit_diff = repository.get_diff( + repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[1] + ) + if not commit_diff: + continue + + commit_diff_encoded = commit_diff.decode("utf-8") + + writer.writerow([revision_id, commits[0], commits[1], commit_diff_encoded]) - commit_diff_encoded = commit_diff.decode("utf-8") + # for revision_id, commits in revisions_to_commits.items(): + # commit_diff = repository.get_diff( + # repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[1] + # ) + # if not commit_diff: + # continue - print(commit_diff_encoded) + # commit_diff_encoded = commit_diff.decode("utf-8") + # print(f"Revision ID: {revision_id}") + # print(commit_diff_encoded) + # print("=====================================") if __name__ == "__main__": From 2287975cbc85f2d9ec5c1a40ea2f039caeae1667 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 18 Nov 2024 10:47:40 -0500 Subject: [PATCH 07/24] Fixed the revision collection script --- scripts/build_failure_data_collection.py | 75 ++++++++++++------------ 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index a337367fb1..4188bf5f80 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -6,13 +6,15 @@ from bugbug import bugzilla, db, phabricator, repository +# from libmozdata.hgmozilla import Revision + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def download_databases(): logger.info("Cloning Mercurial database...") - repository.clone(repo_dir="hg_dir") + assert repository.clone(repo_dir="hg_dir") logger.info("Downloading bugs database...") assert db.download(bugzilla.BUGS_DB) @@ -79,7 +81,11 @@ def find_bugs(bug_commits, hg_client): def caused_build_failure(comments): for comment in comments: - if "backed out" in comment["text"] and "build" in comment["text"]: + if ( + "backed out" in comment["text"] + and "for causing" in comment["text"] + and "build" in comment["text"] + ): return True return False @@ -107,38 +113,47 @@ def main(): # bugs = find_bugs(bug_commits, hg_client) - # # for bug in bugs: - # # print(bug[2]) + # for bug in bugs: + # print(bug[2]) + + # backout_revisions = [ + # 27904, + # 30744, + # 128537, + # 127218, + # 153067, + # 157855, + # 161229, + # 164203, + # 173115, + # 174921, + # 174086, + # 175742, + # 20409, + # 58102, + # 91663, + # 205936, + # 178686, + # 208953, + # 211415, + # 211106, + # 89590, + # 214412, + # 216163, + # 26390, + # 219250, + # 215371, + # ] backout_revisions = [ - 27904, - 30744, - 128537, - 127218, 153067, 157855, - 161229, 164203, - 173115, - 174921, - 174086, - 175742, - 20409, - 58102, - 91663, - 205936, 178686, 208953, - 211415, - 211106, - 89590, - 214412, 216163, - 26390, - 219250, 215371, ] - revisions_to_commits = defaultdict(list) for commit in repository.get_commits(): @@ -163,18 +178,6 @@ def main(): writer.writerow([revision_id, commits[0], commits[1], commit_diff_encoded]) - # for revision_id, commits in revisions_to_commits.items(): - # commit_diff = repository.get_diff( - # repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[1] - # ) - # if not commit_diff: - # continue - - # commit_diff_encoded = commit_diff.decode("utf-8") - # print(f"Revision ID: {revision_id}") - # print(commit_diff_encoded) - # print("=====================================") - if __name__ == "__main__": main() From 2ae3029c8b2dc09576fa173107cb6ff5b3c806d4 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Mon, 18 Nov 2024 10:51:31 -0500 Subject: [PATCH 08/24] Added matrix message reference for log collection --- scripts/build_failure_data_collection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 4188bf5f80..41e1fd3a72 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -192,7 +192,8 @@ def main(): # or alternatively, check when the reverting change was made (specifically for the build failure backout) --> get the diff before and after this timestamp with a desc with a commit to MOZILLACENTRAL # once we have the initial and fix patch IDs, we can get the interdiff between them -# we can also get the error message from the initial patch, to find the exact lines .... +# we can also get the error message from the initial patch, to find the exact lines +# take a look here https://matrix.to/#/!whDRjjSmICCgrhFHsQ:mozilla.org/$H93f5S5LisVMCEeM2-oB97mHXz6usNAJjWAMUSqQEQc?via=mozilla.org&via=matrix.org&via=braak.pro # find the commit that happened most recently after the backout --> this is a fix commit From 9a298aa569cdac0558d37de7330f6b594a0ef611 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 19 Nov 2024 09:28:24 -0500 Subject: [PATCH 09/24] Fixed assertion error --- scripts/build_failure_data_collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 41e1fd3a72..6a959ac2ae 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -14,7 +14,7 @@ def download_databases(): logger.info("Cloning Mercurial database...") - assert repository.clone(repo_dir="hg_dir") + repository.clone(repo_dir="hg_dir") logger.info("Downloading bugs database...") assert db.download(bugzilla.BUGS_DB) From df779ea3660d56e4ee836a34bf8f087c4d35e239 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 20 Nov 2024 15:05:55 -0500 Subject: [PATCH 10/24] Added TC API error line search --- scripts/build_failure_data_collection.py | 82 +++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 6a959ac2ae..bf624e552e 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -2,6 +2,8 @@ import logging from collections import defaultdict +import requests +import taskcluster from tqdm import tqdm from bugbug import bugzilla, db, phabricator, repository @@ -180,7 +182,85 @@ def main(): if __name__ == "__main__": - main() + # main() + + tc_test = True + + if tc_test: + import taskcluster + + index = taskcluster.Index( + { + "rootUrl": "https://firefox-ci-tc.services.mozilla.com", + "credentials": {"clientId": "mozilla-auth0/ad|Mozilla-LDAP|bmah"}, + } + ) + + queue = taskcluster.Queue( + { + "rootUrl": "https://firefox-ci-tc.services.mozilla.com", + } + ) + + # FINAL STEPS + # 1. list the tasks + # tasks = index.listTasks('gecko.v2.autoland.revision.04d0c38e624dc5fe830b67c0526aafa87d3a63ed.firefox') + commit_node = "448597bce69d9e173e0b6818a513b8dfd86f1765" + commit_node = "04d0c38e624dc5fe830b67c0526aafa87d3a63ed" + tasks = index.listTasks(f"gecko.v2.autoland.revision.{commit_node}.firefox") + print(tasks) + + # 2. get the task ID from one of the tasks (I think any is fine) + first_task_id = tasks["tasks"][0]["taskId"] + print(first_task_id) + + # 3. get the task group ID from the task ID + first_task = queue.task(first_task_id) + task_group_id = first_task["taskGroupId"] + print(task_group_id) + + # 4. extract the build task IDs from the task group ID + # "https://firefoxci.taskcluster-artifacts.net/YHg9SxOFQiKgWd4Cr9TuRw/0/public/label-to-taskid.json" + url = f"https://firefoxci.taskcluster-artifacts.net/{task_group_id}/0/public/label-to-taskid.json" + response = requests.get(url) + response.raise_for_status() + data = response.json() + + build_tasks = set() + + for label, taskId in data.items(): + if label[:5] == "build": + build_tasks.add(taskId) + + # 5 get failed tasks + failed_tasks = set() + + for task in queue.listTaskGroup(task_group_id)["tasks"]: + if task["status"]["state"] == "failed": + failed_tasks.add(task["status"]["taskId"]) + + # 6. find intersection between build tasks and failed tasks + failed_build_tasks = list(build_tasks & failed_tasks) + print(failed_build_tasks) + + # 7. get the url to access the log, load it, and extract the ERROR lines + for failed_build_task in failed_build_tasks: + artifact = queue.getArtifact( + taskId=failed_build_task, runId="0", name="public/logs/live.log" + ) + print(artifact) + url = artifact["url"] + break + + response = requests.get(url) + error_lines = [line for line in response.text.split("\n") if "ERROR - " in line] + + for error_line in error_lines: + print(error_line) + + # print(queue.listTaskGroup('04d0c38e624dc5fe830b67c0526aafa87d3a63ed')) + # print(queue.listTaskGroup('eNsrB5djSb6UZipZi3BPAQ')) + # collect bugs with build failures, along with a list of their revisions X # identify commit that backs out another commit due to build failure X # from above, we can also get the node of the commit that caused the backout X From f8376352868afc9f9f29740400e7c735676138c4 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 20 Nov 2024 16:04:41 -0500 Subject: [PATCH 11/24] Added error line retrieval in dataset creation --- scripts/build_failure_data_collection.py | 242 ++++++++++++++++------- 1 file changed, 166 insertions(+), 76 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index bf624e552e..ead5209315 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -92,6 +92,60 @@ def caused_build_failure(comments): return False +def find_error_lines(index_client, queue_client, commit_node): + # FINAL STEPS + # 1. list the tasks + tasks = index_client.listTasks(f"gecko.v2.autoland.revision.{commit_node}.firefox") + + if not tasks["tasks"]: + return [] + + # 2. get the task ID from one of the tasks (I think any is fine) + first_task_id = tasks["tasks"][0]["taskId"] + + # 3. get the task group ID from the task ID + first_task = queue_client.task(first_task_id) + task_group_id = first_task["taskGroupId"] + + # 4. extract the build task IDs from the task group ID + url = f"https://firefoxci.taskcluster-artifacts.net/{task_group_id}/0/public/label-to-taskid.json" + response = requests.get(url) + response.raise_for_status() + data = response.json() + + build_tasks = set() + + for label, taskId in data.items(): + if label[:5] == "build": + build_tasks.add(taskId) + + # 5 get failed tasks + failed_tasks = set() + + for task in queue_client.listTaskGroup(task_group_id)["tasks"]: + if task["status"]["state"] == "failed": + failed_tasks.add(task["status"]["taskId"]) + + # 6. find intersection between build tasks and failed tasks + failed_build_tasks = list(build_tasks & failed_tasks) + + # 7. get the url to access the log, load it, and extract the ERROR lines + error_lines = [] + + for failed_build_task in failed_build_tasks: + artifact = queue_client.getArtifact( + taskId=failed_build_task, runId="0", name="public/logs/live.log" + ) + url = artifact["url"] + + response = requests.get(url) + error_lines.extend( + [line for line in response.text.split("\n") if "ERROR - " in line] + ) + + return error_lines + + def find_backing_out_commit(commits, hg_client): if not commits: return None @@ -147,6 +201,19 @@ def main(): # 215371, # ] + index = taskcluster.Index( + { + "rootUrl": "https://firefox-ci-tc.services.mozilla.com", + "credentials": {"clientId": "mozilla-auth0/ad|Mozilla-LDAP|bmah"}, + } + ) + + queue = taskcluster.Queue( + { + "rootUrl": "https://firefox-ci-tc.services.mozilla.com", + } + ) + backout_revisions = [ 153067, 157855, @@ -167,7 +234,9 @@ def main(): with open("revisions.csv", mode="w", newline="", encoding="utf-8") as file: writer = csv.writer(file) - writer.writerow(["Revision ID", "Initial Commit", "Fix Commit", "Interdiff"]) + writer.writerow( + ["Revision ID", "Initial Commit", "Fix Commit", "Interdiff", "Error Lines"] + ) for revision_id, commits in revisions_to_commits.items(): commit_diff = repository.get_diff( @@ -178,85 +247,106 @@ def main(): commit_diff_encoded = commit_diff.decode("utf-8") - writer.writerow([revision_id, commits[0], commits[1], commit_diff_encoded]) - - -if __name__ == "__main__": - # main() - - tc_test = True - - if tc_test: - import taskcluster - - index = taskcluster.Index( - { - "rootUrl": "https://firefox-ci-tc.services.mozilla.com", - "credentials": {"clientId": "mozilla-auth0/ad|Mozilla-LDAP|bmah"}, - } - ) - - queue = taskcluster.Queue( - { - "rootUrl": "https://firefox-ci-tc.services.mozilla.com", - } - ) - - # FINAL STEPS - # 1. list the tasks - # tasks = index.listTasks('gecko.v2.autoland.revision.04d0c38e624dc5fe830b67c0526aafa87d3a63ed.firefox') - commit_node = "448597bce69d9e173e0b6818a513b8dfd86f1765" - commit_node = "04d0c38e624dc5fe830b67c0526aafa87d3a63ed" - tasks = index.listTasks(f"gecko.v2.autoland.revision.{commit_node}.firefox") - print(tasks) - - # 2. get the task ID from one of the tasks (I think any is fine) - first_task_id = tasks["tasks"][0]["taskId"] - print(first_task_id) - - # 3. get the task group ID from the task ID - first_task = queue.task(first_task_id) - task_group_id = first_task["taskGroupId"] - print(task_group_id) - - # 4. extract the build task IDs from the task group ID - # "https://firefoxci.taskcluster-artifacts.net/YHg9SxOFQiKgWd4Cr9TuRw/0/public/label-to-taskid.json" - url = f"https://firefoxci.taskcluster-artifacts.net/{task_group_id}/0/public/label-to-taskid.json" - response = requests.get(url) - response.raise_for_status() - data = response.json() - - build_tasks = set() - - for label, taskId in data.items(): - if label[:5] == "build": - build_tasks.add(taskId) + error_lines = find_error_lines(index, queue, commits[0]) - # 5 get failed tasks - failed_tasks = set() - - for task in queue.listTaskGroup(task_group_id)["tasks"]: - if task["status"]["state"] == "failed": - failed_tasks.add(task["status"]["taskId"]) - - # 6. find intersection between build tasks and failed tasks - failed_build_tasks = list(build_tasks & failed_tasks) - print(failed_build_tasks) - - # 7. get the url to access the log, load it, and extract the ERROR lines - for failed_build_task in failed_build_tasks: - artifact = queue.getArtifact( - taskId=failed_build_task, runId="0", name="public/logs/live.log" + writer.writerow( + [revision_id, commits[0], commits[1], commit_diff_encoded, error_lines] ) - print(artifact) - url = artifact["url"] - break - response = requests.get(url) - error_lines = [line for line in response.text.split("\n") if "ERROR - " in line] - for error_line in error_lines: - print(error_line) +if __name__ == "__main__": + main() + + # # ==== experimental purposes ==== + + # index = taskcluster.Index( + # { + # "rootUrl": "https://firefox-ci-tc.services.mozilla.com", + # "credentials": {"clientId": "mozilla-auth0/ad|Mozilla-LDAP|bmah"}, + # } + # ) + + # queue = taskcluster.Queue( + # { + # "rootUrl": "https://firefox-ci-tc.services.mozilla.com", + # } + # ) + + # print(find_error_lines(index, queue, "448597bce69d9e173e0b6818a513b8dfd86f1765")) + + # tc_test = True + + # if tc_test: + + # index = taskcluster.Index( + # { + # "rootUrl": "https://firefox-ci-tc.services.mozilla.com", + # "credentials": {"clientId": "mozilla-auth0/ad|Mozilla-LDAP|bmah"}, + # } + # ) + + # queue = taskcluster.Queue( + # { + # "rootUrl": "https://firefox-ci-tc.services.mozilla.com", + # } + # ) + + # # FINAL STEPS + # # 1. list the tasks + # # tasks = index.listTasks('gecko.v2.autoland.revision.04d0c38e624dc5fe830b67c0526aafa87d3a63ed.firefox') + # commit_node = "448597bce69d9e173e0b6818a513b8dfd86f1765" + # commit_node = "04d0c38e624dc5fe830b67c0526aafa87d3a63ed" + # commit_node = "759d4948ed8b468dfc03d2ca35e7c8e54b62ae75" + # tasks = index.listTasks(f"gecko.v2.autoland.revision.{commit_node}.firefox") + # print(tasks) + + # # 2. get the task ID from one of the tasks (I think any is fine) + # first_task_id = tasks["tasks"][0]["taskId"] + # print(first_task_id) + + # # 3. get the task group ID from the task ID + # first_task = queue.task(first_task_id) + # task_group_id = first_task["taskGroupId"] + # print(task_group_id) + + # # 4. extract the build task IDs from the task group ID + # # "https://firefoxci.taskcluster-artifacts.net/YHg9SxOFQiKgWd4Cr9TuRw/0/public/label-to-taskid.json" + # url = f"https://firefoxci.taskcluster-artifacts.net/{task_group_id}/0/public/label-to-taskid.json" + # response = requests.get(url) + # response.raise_for_status() + # data = response.json() + + # build_tasks = set() + + # for label, taskId in data.items(): + # if label[:5] == "build": + # build_tasks.add(taskId) + + # # 5 get failed tasks + # failed_tasks = set() + + # for task in queue.listTaskGroup(task_group_id)["tasks"]: + # if task["status"]["state"] == "failed": + # failed_tasks.add(task["status"]["taskId"]) + + # # 6. find intersection between build tasks and failed tasks + # failed_build_tasks = list(build_tasks & failed_tasks) + # print(failed_build_tasks) + + # # 7. get the url to access the log, load it, and extract the ERROR lines + # for failed_build_task in failed_build_tasks: + # artifact = queue.getArtifact( + # taskId=failed_build_task, runId="0", name="public/logs/live.log" + # ) + # print(artifact) + # url = artifact["url"] + # break + + # response = requests.get(url) + # error_lines = [line for line in response.text.split("\n") if "ERROR - " in line] + + # for error_line in error_lines: + # print(error_line) # print(queue.listTaskGroup('04d0c38e624dc5fe830b67c0526aafa87d3a63ed')) # print(queue.listTaskGroup('eNsrB5djSb6UZipZi3BPAQ')) From dc5a83a461a48c88ab6cc77f478094e94155cea6 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 20 Nov 2024 16:09:43 -0500 Subject: [PATCH 12/24] Replaced client ID with environment variable --- scripts/build_failure_data_collection.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index ead5209315..b082a8b2cf 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -1,5 +1,6 @@ import csv import logging +import os from collections import defaultdict import requests @@ -201,10 +202,12 @@ def main(): # 215371, # ] + client_id = os.getenv("TC_CLIENT_ID") + index = taskcluster.Index( { "rootUrl": "https://firefox-ci-tc.services.mozilla.com", - "credentials": {"clientId": "mozilla-auth0/ad|Mozilla-LDAP|bmah"}, + "credentials": {"clientId": client_id}, } ) From bbfd499e2ab0edc8ef728490420ed9595b9c24c1 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 20 Nov 2024 16:54:42 -0500 Subject: [PATCH 13/24] Fixed comments --- scripts/build_failure_data_collection.py | 76 ++++++++++++------------ 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index b082a8b2cf..803fb812ba 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -120,7 +120,7 @@ def find_error_lines(index_client, queue_client, commit_node): if label[:5] == "build": build_tasks.add(taskId) - # 5 get failed tasks + # 5. get failed tasks failed_tasks = set() for task in queue_client.listTaskGroup(task_group_id)["tasks"]: @@ -173,34 +173,34 @@ def main(): # for bug in bugs: # print(bug[2]) - # backout_revisions = [ - # 27904, - # 30744, - # 128537, - # 127218, - # 153067, - # 157855, - # 161229, - # 164203, - # 173115, - # 174921, - # 174086, - # 175742, - # 20409, - # 58102, - # 91663, - # 205936, - # 178686, - # 208953, - # 211415, - # 211106, - # 89590, - # 214412, - # 216163, - # 26390, - # 219250, - # 215371, - # ] + backout_revisions = [ + 27904, + 30744, + 128537, + 127218, + 153067, + 157855, + 161229, + 164203, + 173115, + 174921, + 174086, + 175742, + 20409, + 58102, + 91663, + 205936, + 178686, + 208953, + 211415, + 211106, + 89590, + 214412, + 216163, + 26390, + 219250, + 215371, + ] client_id = os.getenv("TC_CLIENT_ID") @@ -217,15 +217,15 @@ def main(): } ) - backout_revisions = [ - 153067, - 157855, - 164203, - 178686, - 208953, - 216163, - 215371, - ] + # backout_revisions = [ + # 153067, + # 157855, + # 164203, + # 178686, + # 208953, + # 216163, + # 215371, + # ] revisions_to_commits = defaultdict(list) for commit in repository.get_commits(): From 5f4c2afe1de38fb1760bce67221750c81eebeb63 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 26 Nov 2024 12:51:32 -0500 Subject: [PATCH 14/24] Uncommented revision finder --- scripts/build_failure_data_collection.py | 79 ++++++++++++------------ 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 803fb812ba..d55100952d 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -5,6 +5,7 @@ import requests import taskcluster +from libmozdata.hgmozilla import Revision from tqdm import tqdm from bugbug import bugzilla, db, phabricator, repository @@ -86,8 +87,9 @@ def caused_build_failure(comments): for comment in comments: if ( "backed out" in comment["text"] - and "for causing" in comment["text"] + # and "for causing" in comment["text"] and "build" in comment["text"] + and "bustages" in comment["text"] ): return True return False @@ -164,43 +166,44 @@ def find_backing_out_commit(commits, hg_client): def main(): download_databases() - # bug_commits = preprocess_commits_and_bugs() - - # hg_client = Revision() - - # bugs = find_bugs(bug_commits, hg_client) - - # for bug in bugs: - # print(bug[2]) - - backout_revisions = [ - 27904, - 30744, - 128537, - 127218, - 153067, - 157855, - 161229, - 164203, - 173115, - 174921, - 174086, - 175742, - 20409, - 58102, - 91663, - 205936, - 178686, - 208953, - 211415, - 211106, - 89590, - 214412, - 216163, - 26390, - 219250, - 215371, - ] + bug_commits = preprocess_commits_and_bugs() + + hg_client = Revision() + + bugs = find_bugs(bug_commits, hg_client) + + backout_revisions = [] + for bug in bugs: + backout_revisions.append(bug[2]) + + # backout_revisions = [ + # 27904, + # 30744, + # 128537, + # 127218, + # 153067, + # 157855, + # 161229, + # 164203, + # 173115, + # 174921, + # 174086, + # 175742, + # 20409, + # 58102, + # 91663, + # 205936, + # 178686, + # 208953, + # 211415, + # 211106, + # 89590, + # 214412, + # 216163, + # 26390, + # 219250, + # 215371, + # ] client_id = os.getenv("TC_CLIENT_ID") From 529067c1eb4fce812aff5094cb93d15b3cf307c3 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Fri, 29 Nov 2024 13:21:21 -0500 Subject: [PATCH 15/24] Refactored code --- scripts/build_failure_data_collection.py | 321 ++++++++--------------- 1 file changed, 108 insertions(+), 213 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index d55100952d..80115462f4 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -2,16 +2,17 @@ import logging import os from collections import defaultdict +from datetime import datetime import requests import taskcluster +from dateutil.relativedelta import relativedelta +from libmozdata.bugzilla import Bugzilla from libmozdata.hgmozilla import Revision from tqdm import tqdm from bugbug import bugzilla, db, phabricator, repository -# from libmozdata.hgmozilla import Revision - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -30,8 +31,40 @@ def download_databases(): assert db.download(phabricator.REVISIONS_DB, support_files_too=True) -def preprocess_commits_and_bugs(): - logger.info("Preprocessing commits and bugs...") +def get_bz_params(): + fields = ["id"] + one_year_ago = (datetime.now() - relativedelta(years=1)).strftime("%Y-%m-%d") + params = { + "include_fields": fields, + "resolution": "---", + "f1": "creation_ts", + "o1": "greaterthan", + "v1": one_year_ago, + "f2": "longdesc", + "o2": "allwordssubstr", + "v2": "backed out for causing build", + } + return params + + +def get_backed_out_build_failure_bugs(date="today", bug_ids=[], chunk_size=None): + params = get_bz_params() + bugs = {} + + def bug_handler(bug, data): + data[bug["id"]] = bug + + Bugzilla( + params, + bughandler=bug_handler, + bugdata=bugs, + ).get_data().wait() + + return bugs + + +def map_bugs_to_commit(bug_ids): + logger.info("Mapping bugs to their commits...") bug_commits = {} for commit in tqdm( @@ -39,9 +72,12 @@ def preprocess_commits_and_bugs(): include_no_bug=True, include_backouts=True, include_ignored=True ) ): + if commit["bug_id"] not in bug_ids: + continue + commit_data = { key: commit[key] - for key in ["node", "bug_id", "pushdate", "backedoutby", "backsout"] + for key in ["node", "bug_id", "pushdate", "backedoutby", "backsout", "desc"] } bug_commits.setdefault(commit["bug_id"], []).append(commit_data) @@ -49,50 +85,59 @@ def preprocess_commits_and_bugs(): return bug_commits -def preprocess_revisions(): - logger.info("Preprocessing revisions...") - diff_id_to_phid = {} +def find_bugs(hg_client, bug_ids, bug_commits): + logger.info("Finding bugs...") + backed_out_revisions = [] - for revision in phabricator.get_revisions(): - diff_id_to_phid[revision["id"]] = revision["phid"] + for bug_id in bug_ids: + bug_id_commits = bug_commits.get(bug_id, None) + backing_out_commit = find_backing_out_commit(bug_id_commits, hg_client) - return diff_id_to_phid + if not backing_out_commit: + continue + logger.info("Backing out commit found!") -def find_bugs(bug_commits, hg_client): - backed_out_bugs = [] + commit = {} - for bug in bugzilla.get_bugs(include_invalid=True): - if caused_build_failure(bug["comments"]): - backing_out_commit = find_backing_out_commit( - bug_commits.get(bug["id"], None), hg_client - ) - if not backing_out_commit: - continue + commit["desc"] = next( + ( + c["desc"] + for c in bug_id_commits + if any( + c["node"].startswith(node) + for node in backing_out_commit["backsout"] + ) + ), + None, + ) + if commit["desc"] is None: + continue - commit = {} + revision_id = repository.get_revision_id(commit) - commit["desc"] = hg_client.get_revision( - "nightly", backing_out_commit["backsout"] - )["desc"] + backed_out_revisions.append(revision_id) - revision_id = repository.get_revision_id(commit) + return backed_out_revisions - backed_out_bugs.append((bug, backing_out_commit, revision_id)) - return backed_out_bugs +def find_backing_out_commit(commits, hg_client): + logger.info("Finding backing out commit...") + if not commits: + return None + for commit in commits: + if not commit["backsout"]: + continue -def caused_build_failure(comments): - for comment in comments: + desc = commit["desc"] if ( - "backed out" in comment["text"] - # and "for causing" in comment["text"] - and "build" in comment["text"] - and "bustages" in comment["text"] + "backed out" in desc.lower() + and "for causing" in desc.lower() + and "build" in desc.lower() ): - return True - return False + return commit + return None def find_error_lines(index_client, queue_client, commit_node): @@ -149,61 +194,31 @@ def find_error_lines(index_client, queue_client, commit_node): return error_lines -def find_backing_out_commit(commits, hg_client): - if not commits: - return None +def main(): + # 0. + download_databases() - for commit in commits: - if not commit["backsout"]: - continue + # 1. + bugs = get_backed_out_build_failure_bugs() + bug_ids = list(bugs.keys()) - desc = hg_client.get_revision("nightly", commit["node"])["desc"] - if "backed out" in desc.lower() and "build" in desc.lower(): - return commit - return None + # 2. + bug_commits = map_bugs_to_commit(bug_ids) + # 3. + hg_client = Revision() + backed_out_revisions = find_bugs(hg_client, bug_ids, bug_commits) -def main(): - download_databases() + # 4. + revisions_to_commits = defaultdict(list) - bug_commits = preprocess_commits_and_bugs() + for commit in repository.get_commits(): + revision_id = repository.get_revision_id(commit) - hg_client = Revision() + if revision_id in backed_out_revisions: + revisions_to_commits[revision_id].append(commit["node"]) - bugs = find_bugs(bug_commits, hg_client) - - backout_revisions = [] - for bug in bugs: - backout_revisions.append(bug[2]) - - # backout_revisions = [ - # 27904, - # 30744, - # 128537, - # 127218, - # 153067, - # 157855, - # 161229, - # 164203, - # 173115, - # 174921, - # 174086, - # 175742, - # 20409, - # 58102, - # 91663, - # 205936, - # 178686, - # 208953, - # 211415, - # 211106, - # 89590, - # 214412, - # 216163, - # 26390, - # 219250, - # 215371, - # ] + # 5. and 6. client_id = os.getenv("TC_CLIENT_ID") @@ -220,23 +235,6 @@ def main(): } ) - # backout_revisions = [ - # 153067, - # 157855, - # 164203, - # 178686, - # 208953, - # 216163, - # 215371, - # ] - revisions_to_commits = defaultdict(list) - - for commit in repository.get_commits(): - revision_id = repository.get_revision_id(commit) - - if revision_id in backout_revisions: - revisions_to_commits[revision_id].append(commit["node"]) - with open("revisions.csv", mode="w", newline="", encoding="utf-8") as file: writer = csv.writer(file) @@ -245,6 +243,9 @@ def main(): ) for revision_id, commits in revisions_to_commits.items(): + if len(commits) < 2: + continue + commit_diff = repository.get_diff( repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[1] ) @@ -263,116 +264,10 @@ def main(): if __name__ == "__main__": main() - # # ==== experimental purposes ==== - - # index = taskcluster.Index( - # { - # "rootUrl": "https://firefox-ci-tc.services.mozilla.com", - # "credentials": {"clientId": "mozilla-auth0/ad|Mozilla-LDAP|bmah"}, - # } - # ) - - # queue = taskcluster.Queue( - # { - # "rootUrl": "https://firefox-ci-tc.services.mozilla.com", - # } - # ) - - # print(find_error_lines(index, queue, "448597bce69d9e173e0b6818a513b8dfd86f1765")) - - # tc_test = True - - # if tc_test: - - # index = taskcluster.Index( - # { - # "rootUrl": "https://firefox-ci-tc.services.mozilla.com", - # "credentials": {"clientId": "mozilla-auth0/ad|Mozilla-LDAP|bmah"}, - # } - # ) - - # queue = taskcluster.Queue( - # { - # "rootUrl": "https://firefox-ci-tc.services.mozilla.com", - # } - # ) - - # # FINAL STEPS - # # 1. list the tasks - # # tasks = index.listTasks('gecko.v2.autoland.revision.04d0c38e624dc5fe830b67c0526aafa87d3a63ed.firefox') - # commit_node = "448597bce69d9e173e0b6818a513b8dfd86f1765" - # commit_node = "04d0c38e624dc5fe830b67c0526aafa87d3a63ed" - # commit_node = "759d4948ed8b468dfc03d2ca35e7c8e54b62ae75" - # tasks = index.listTasks(f"gecko.v2.autoland.revision.{commit_node}.firefox") - # print(tasks) - - # # 2. get the task ID from one of the tasks (I think any is fine) - # first_task_id = tasks["tasks"][0]["taskId"] - # print(first_task_id) - - # # 3. get the task group ID from the task ID - # first_task = queue.task(first_task_id) - # task_group_id = first_task["taskGroupId"] - # print(task_group_id) - - # # 4. extract the build task IDs from the task group ID - # # "https://firefoxci.taskcluster-artifacts.net/YHg9SxOFQiKgWd4Cr9TuRw/0/public/label-to-taskid.json" - # url = f"https://firefoxci.taskcluster-artifacts.net/{task_group_id}/0/public/label-to-taskid.json" - # response = requests.get(url) - # response.raise_for_status() - # data = response.json() - - # build_tasks = set() - - # for label, taskId in data.items(): - # if label[:5] == "build": - # build_tasks.add(taskId) - - # # 5 get failed tasks - # failed_tasks = set() - - # for task in queue.listTaskGroup(task_group_id)["tasks"]: - # if task["status"]["state"] == "failed": - # failed_tasks.add(task["status"]["taskId"]) - - # # 6. find intersection between build tasks and failed tasks - # failed_build_tasks = list(build_tasks & failed_tasks) - # print(failed_build_tasks) - - # # 7. get the url to access the log, load it, and extract the ERROR lines - # for failed_build_task in failed_build_tasks: - # artifact = queue.getArtifact( - # taskId=failed_build_task, runId="0", name="public/logs/live.log" - # ) - # print(artifact) - # url = artifact["url"] - # break - - # response = requests.get(url) - # error_lines = [line for line in response.text.split("\n") if "ERROR - " in line] - - # for error_line in error_lines: - # print(error_line) - - # print(queue.listTaskGroup('04d0c38e624dc5fe830b67c0526aafa87d3a63ed')) - # print(queue.listTaskGroup('eNsrB5djSb6UZipZi3BPAQ')) - -# collect bugs with build failures, along with a list of their revisions X -# identify commit that backs out another commit due to build failure X -# from above, we can also get the node of the commit that caused the backout X -# extract the revision ID from initial commit description X - -# ---- Find another commit with the same revision mentioned in the commit description - -# use this to associate the commit nodes to the diff IDS --> find the commit before and after the backout -- check diff description? -# or alternatively, check when the reverting change was made (specifically for the build failure backout) --> get the diff before and after this timestamp with a desc with a commit to MOZILLACENTRAL - -# once we have the initial and fix patch IDs, we can get the interdiff between them -# we can also get the error message from the initial patch, to find the exact lines -# take a look here https://matrix.to/#/!whDRjjSmICCgrhFHsQ:mozilla.org/$H93f5S5LisVMCEeM2-oB97mHXz6usNAJjWAMUSqQEQc?via=mozilla.org&via=matrix.org&via=braak.pro - - -# find the commit that happened most recently after the backout --> this is a fix commit -# convert commit id to commit phid (this is a thing in the revision object in phabricator.get_revisions) -# associate commit phid with its revision in phab --> and then associate it with its patch id -# get interdiff between the initial patch and the fix patch +# 0. Download databases +# 1. Identify bugs in Bugzilla that have a backout due to build failures X +# 2. Map only these bugs' commits to the bug ID in a dict +# 3. Find the revision from the bug +# 4. Map the revision to the commits +# 5. Get the interdiff +# 6. Find error lines in the interdiff From dc4544e3257efa9dbbc53dde2b86f9fc7ef5033f Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 3 Dec 2024 14:15:08 -0500 Subject: [PATCH 16/24] Included all revisions of a push --- scripts/build_failure_data_collection.py | 65 ++++++++++++++++-------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 80115462f4..a3127782c6 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -98,25 +98,39 @@ def find_bugs(hg_client, bug_ids, bug_commits): logger.info("Backing out commit found!") - commit = {} - - commit["desc"] = next( - ( - c["desc"] - for c in bug_id_commits - if any( - c["node"].startswith(node) - for node in backing_out_commit["backsout"] - ) - ), - None, - ) - if commit["desc"] is None: - continue + # commit = {} + + # commit["desc"] = next( + # ( + # c["desc"] + # for c in bug_id_commits + # if any( + # c["node"].startswith(node) + # for node in backing_out_commit["backsout"] + # ) + # ), + # None, + # ) + + # if commit["desc"] is None: + # continue + + commits = [ + { + "desc": c["desc"], + } + for c in bug_id_commits + if any( + c["node"].startswith(node) for node in backing_out_commit["backsout"] + ) + ] - revision_id = repository.get_revision_id(commit) + if commits is None: + continue - backed_out_revisions.append(revision_id) + for commit in commits: + revision_id = repository.get_revision_id(commit) + backed_out_revisions.append(revision_id) return backed_out_revisions @@ -246,15 +260,22 @@ def main(): if len(commits) < 2: continue + for commit in commits: + error_lines = find_error_lines(index, queue, commit) + + if error_lines: + break + + # if not error_lines: + # continue + commit_diff = repository.get_diff( repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[1] ) - if not commit_diff: - continue - - commit_diff_encoded = commit_diff.decode("utf-8") + # if not commit_diff: + # continue - error_lines = find_error_lines(index, queue, commits[0]) + commit_diff_encoded = commit_diff.decode("utf-8", errors="replace") writer.writerow( [revision_id, commits[0], commits[1], commit_diff_encoded, error_lines] From 460e66994e6ae0ea57b279ff8d5f6736aa2fba94 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Thu, 23 Jan 2025 10:00:51 -0500 Subject: [PATCH 17/24] Added commit information when identifying backing out commit --- scripts/build_failure_data_collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index a3127782c6..88f2c2edad 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -96,7 +96,7 @@ def find_bugs(hg_client, bug_ids, bug_commits): if not backing_out_commit: continue - logger.info("Backing out commit found!") + logger.info(f"Backing out commit found for bug {bug_id}: {backing_out_commit}") # commit = {} From 2131fe649037144b292d18b4a9f7cc72b129d4fd Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Thu, 23 Jan 2025 10:56:53 -0500 Subject: [PATCH 18/24] Skipping bugs with multiple backouts --- scripts/build_failure_data_collection.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 88f2c2edad..afce865795 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -2,11 +2,9 @@ import logging import os from collections import defaultdict -from datetime import datetime import requests import taskcluster -from dateutil.relativedelta import relativedelta from libmozdata.bugzilla import Bugzilla from libmozdata.hgmozilla import Revision from tqdm import tqdm @@ -33,16 +31,16 @@ def download_databases(): def get_bz_params(): fields = ["id"] - one_year_ago = (datetime.now() - relativedelta(years=1)).strftime("%Y-%m-%d") + # one_year_ago = (datetime.now() - relativedelta(years=1)).strftime("%Y-%m-%d") params = { "include_fields": fields, "resolution": "---", - "f1": "creation_ts", - "o1": "greaterthan", - "v1": one_year_ago, - "f2": "longdesc", - "o2": "allwordssubstr", - "v2": "backed out for causing build", + # "f1": "creation_ts", + # "o1": "greaterthan", + # "v1": one_year_ago, + "f1": "longdesc", + "o1": "allwordssubstr", + "v1": "backed out for causing build", } return params @@ -140,6 +138,11 @@ def find_backing_out_commit(commits, hg_client): if not commits: return None + backout_commits = [commit for commit in commits if commit["backsout"]] + if len(backout_commits) > 1: + logger.info("Multiple backouts detected, skipping this bug.") + return None + for commit in commits: if not commit["backsout"]: continue From 3d21373d6a670968851fb83201524c49c4c76c66 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 22 Apr 2025 10:58:25 -0400 Subject: [PATCH 19/24] Changed the commit access to the last commit, to handle cases where there are more than two commits --- scripts/build_failure_data_collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index afce865795..851965ee32 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -273,7 +273,7 @@ def main(): # continue commit_diff = repository.get_diff( - repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[1] + repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[-1] ) # if not commit_diff: # continue From f81879d0d2f17090071e519a757ed962e5abe7cb Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 23 Apr 2025 15:08:17 -0400 Subject: [PATCH 20/24] Removed comments --- scripts/build_failure_data_collection.py | 33 ++++++++---------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 851965ee32..788ff36a53 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -96,23 +96,6 @@ def find_bugs(hg_client, bug_ids, bug_commits): logger.info(f"Backing out commit found for bug {bug_id}: {backing_out_commit}") - # commit = {} - - # commit["desc"] = next( - # ( - # c["desc"] - # for c in bug_id_commits - # if any( - # c["node"].startswith(node) - # for node in backing_out_commit["backsout"] - # ) - # ), - # None, - # ) - - # if commit["desc"] is None: - # continue - commits = [ { "desc": c["desc"], @@ -219,13 +202,19 @@ def main(): bugs = get_backed_out_build_failure_bugs() bug_ids = list(bugs.keys()) + print(f"NUMBER OF BUGS FOUND THAT HAVE A BACKOUT: {len(bug_ids)}") + # 2. bug_commits = map_bugs_to_commit(bug_ids) + print(f"NUMBER OF BUGS THAT WERE MAPPED TO A COMMIT: {len(bug_commits)}") + # 3. hg_client = Revision() backed_out_revisions = find_bugs(hg_client, bug_ids, bug_commits) + print(f"NUMBER OF BACKED OUT REVISIONS FOUND: {len(backed_out_revisions)}") + # 4. revisions_to_commits = defaultdict(list) @@ -235,6 +224,10 @@ def main(): if revision_id in backed_out_revisions: revisions_to_commits[revision_id].append(commit["node"]) + print( + f"NUMBER OF REVISIONS MAPPED TO BACKED OUT COMMITS: {len(revisions_to_commits)}" + ) + # 5. and 6. client_id = os.getenv("TC_CLIENT_ID") @@ -261,6 +254,7 @@ def main(): for revision_id, commits in revisions_to_commits.items(): if len(commits) < 2: + print("yo") continue for commit in commits: @@ -269,14 +263,9 @@ def main(): if error_lines: break - # if not error_lines: - # continue - commit_diff = repository.get_diff( repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[-1] ) - # if not commit_diff: - # continue commit_diff_encoded = commit_diff.decode("utf-8", errors="replace") From e47cc1bb01109cd973932c661c6decb06170b362 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Sun, 27 Apr 2025 16:46:59 -0400 Subject: [PATCH 21/24] Fixed params to include all bug types --- scripts/build_failure_data_collection.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 788ff36a53..c707e5a67f 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -2,9 +2,11 @@ import logging import os from collections import defaultdict +from datetime import datetime import requests import taskcluster +from dateutil.relativedelta import relativedelta from libmozdata.bugzilla import Bugzilla from libmozdata.hgmozilla import Revision from tqdm import tqdm @@ -31,16 +33,16 @@ def download_databases(): def get_bz_params(): fields = ["id"] - # one_year_ago = (datetime.now() - relativedelta(years=1)).strftime("%Y-%m-%d") + one_year_ago = (datetime.now() - relativedelta(years=1)).strftime("%Y-%m-%d") params = { "include_fields": fields, - "resolution": "---", - # "f1": "creation_ts", - # "o1": "greaterthan", - # "v1": one_year_ago, - "f1": "longdesc", - "o1": "allwordssubstr", - "v1": "backed out for causing build", + # "resolution": "---", + "f1": "creation_ts", + "o1": "greaterthan", + "v1": one_year_ago, + "f2": "longdesc", + "o2": "allwords", + "v2": "backed out causing build", } return params @@ -203,6 +205,7 @@ def main(): bug_ids = list(bugs.keys()) print(f"NUMBER OF BUGS FOUND THAT HAVE A BACKOUT: {len(bug_ids)}") + print(f"bug ids: {bug_ids}") # 2. bug_commits = map_bugs_to_commit(bug_ids) From 4d7d8073165bf6b82faf634878c6dc47c1dfbba2 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 29 Apr 2025 12:44:35 -0400 Subject: [PATCH 22/24] Changed limit to 2 years --- scripts/build_failure_data_collection.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index c707e5a67f..8733b71df2 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -33,10 +33,9 @@ def download_databases(): def get_bz_params(): fields = ["id"] - one_year_ago = (datetime.now() - relativedelta(years=1)).strftime("%Y-%m-%d") + one_year_ago = (datetime.now() - relativedelta(years=2)).strftime("%Y-%m-%d") params = { "include_fields": fields, - # "resolution": "---", "f1": "creation_ts", "o1": "greaterthan", "v1": one_year_ago, @@ -204,20 +203,13 @@ def main(): bugs = get_backed_out_build_failure_bugs() bug_ids = list(bugs.keys()) - print(f"NUMBER OF BUGS FOUND THAT HAVE A BACKOUT: {len(bug_ids)}") - print(f"bug ids: {bug_ids}") - # 2. bug_commits = map_bugs_to_commit(bug_ids) - print(f"NUMBER OF BUGS THAT WERE MAPPED TO A COMMIT: {len(bug_commits)}") - # 3. hg_client = Revision() backed_out_revisions = find_bugs(hg_client, bug_ids, bug_commits) - print(f"NUMBER OF BACKED OUT REVISIONS FOUND: {len(backed_out_revisions)}") - # 4. revisions_to_commits = defaultdict(list) From b9e64ddea7f17b1a2f7ae56a9dee01e7168829ca Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Tue, 29 Apr 2025 12:45:14 -0400 Subject: [PATCH 23/24] Fixed variable name --- scripts/build_failure_data_collection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index 8733b71df2..ea34ef384d 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -33,12 +33,12 @@ def download_databases(): def get_bz_params(): fields = ["id"] - one_year_ago = (datetime.now() - relativedelta(years=2)).strftime("%Y-%m-%d") + two_years_ago = (datetime.now() - relativedelta(years=2)).strftime("%Y-%m-%d") params = { "include_fields": fields, "f1": "creation_ts", "o1": "greaterthan", - "v1": one_year_ago, + "v1": two_years_ago, "f2": "longdesc", "o2": "allwords", "v2": "backed out causing build", From 3c85c4b26a5348767da8d17af9c1e40ea3b10601 Mon Sep 17 00:00:00 2001 From: Benjamin Mah Date: Wed, 30 Apr 2025 09:41:46 -0400 Subject: [PATCH 24/24] Removed print statement --- scripts/build_failure_data_collection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py index ea34ef384d..a2ad2fed67 100644 --- a/scripts/build_failure_data_collection.py +++ b/scripts/build_failure_data_collection.py @@ -219,10 +219,6 @@ def main(): if revision_id in backed_out_revisions: revisions_to_commits[revision_id].append(commit["node"]) - print( - f"NUMBER OF REVISIONS MAPPED TO BACKED OUT COMMITS: {len(revisions_to_commits)}" - ) - # 5. and 6. client_id = os.getenv("TC_CLIENT_ID")