diff --git a/bugbug/repository.py b/bugbug/repository.py
index 341115b1c7..07d58c0031 100644
--- a/bugbug/repository.py
+++ b/bugbug/repository.py
@@ -1543,6 +1543,69 @@ def trigger_pull() -> None:
     trigger_pull()
 
 
+def get_diff(repo_path, original_hash, fix_hash) -> bytes:
+    client = hglib.open(repo_path)
+    # NOTE(review): client is never closed; consider `with hglib.open(...)`.
+    current_rev = client.identify(id=True)
+
+    try:
+        client.rawcommand([b"shelve"])
+    except hglib.error.CommandError as e:
+        if b"nothing changed" in e.out:
+            logger.info(f"Nothing to shelve: {e}")
+        else:
+            raise RuntimeError("Error occurred while shelving") from e
+
+    parents = client.parents(rev=fix_hash)
+    parent_of_fix = parents[0][1]
+    client.update(rev=parent_of_fix, clean=True)
+
+    graft_result = graft(
+        client, revs=[original_hash], no_commit=True, force=True, tool=":merge"
+    )
+    # NOTE(review): on failure the working dir is left at parent_of_fix.
+    if not graft_result:
+        return b""
+
+    final_diff = client.diff(
+        revs=[fix_hash], ignoreallspace=True, ignorespacechange=True, reverse=True
+    )
+
+    client.update(rev=current_rev, clean=True)
+
+    return final_diff
+
+
+def graft(client, revs, no_commit=False, force=False, tool=":merge") -> bool:
+    """Graft changesets specified by revs into the current repository state.
+
+    Args:
+        client: The hglib client.
+        revs: A list of the hashes of the commits to be applied to the current repository state.
+        no_commit: If True, does not commit and just applies changes in working directory.
+        force: If True, forces the grafts even if the revs are ancestors of the current repository state.
+        tool: A string representing a merge tool (see `hg help merge-tools`).
+
+    Returns:
+        Boolean of graft operation result (True for success, False for failure).
+    """
+    args = hglib.util.cmdbuilder(
+        b"graft", r=revs, no_commit=no_commit, f=force, tool=tool
+    )
+
+    eh = hglib.util.reterrorhandler(args)
+
+    client.rawcommand(args, eh=eh, prompt=auto_resolve_conflict_prompt)
+
+    return True
+
+
+def auto_resolve_conflict_prompt(max_bytes, current_output):
+    if b"was deleted in" in current_output:
+        return b"c\n"  # Return 'c' to use the changed version
+    return b"\n"  # Default to doing nothing, just proceed
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("repository_dir", help="Path to the repository", action="store")
diff --git a/scripts/build_failure_data_collection.py b/scripts/build_failure_data_collection.py
new file mode 100644
index 0000000000..a2ad2fed67
--- /dev/null
+++ b/scripts/build_failure_data_collection.py
@@ -0,0 +1,277 @@
+import csv
+import logging
+import os
+from collections import defaultdict
+from datetime import datetime
+
+import requests
+import taskcluster
+from dateutil.relativedelta import relativedelta
+from libmozdata.bugzilla import Bugzilla
+from libmozdata.hgmozilla import Revision
+from tqdm import tqdm
+
+from bugbug import bugzilla, db, phabricator, repository
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def download_databases():
+    logger.info("Cloning Mercurial database...")
+    repository.clone(repo_dir="hg_dir")
+
+    logger.info("Downloading bugs database...")
+    assert db.download(bugzilla.BUGS_DB)
+
+    logger.info("Downloading commits database...")
+    assert db.download(repository.COMMITS_DB, support_files_too=True)
+
+    logger.info("Downloading revisions database...")
+    assert db.download(phabricator.REVISIONS_DB, support_files_too=True)
+
+
+def get_bz_params():
+    fields = ["id"]
+    two_years_ago = (datetime.now() - relativedelta(years=2)).strftime("%Y-%m-%d")
+    params = {
+        "include_fields": fields,
+        "f1": "creation_ts",
+        "o1": "greaterthan",
+        "v1": two_years_ago,
+        "f2": "longdesc",
+        "o2": "allwords",
+        "v2": "backed out causing build",
+    }
+    return params
+
+
+def get_backed_out_build_failure_bugs(date="today", bug_ids=(), chunk_size=None):
+    params = get_bz_params()
+    bugs = {}
+
+    def bug_handler(bug, data):
+        data[bug["id"]] = bug
+
+    Bugzilla(
+        params,
+        bughandler=bug_handler,
+        bugdata=bugs,
+    ).get_data().wait()
+
+    return bugs
+
+
+def map_bugs_to_commit(bug_ids):
+    logger.info("Mapping bugs to their commits...")
+    bug_commits = {}
+
+    for commit in tqdm(
+        repository.get_commits(
+            include_no_bug=True, include_backouts=True, include_ignored=True
+        )
+    ):
+        if commit["bug_id"] not in bug_ids:
+            continue
+
+        commit_data = {
+            key: commit[key]
+            for key in ["node", "bug_id", "pushdate", "backedoutby", "backsout", "desc"]
+        }
+
+        bug_commits.setdefault(commit["bug_id"], []).append(commit_data)
+
+    return bug_commits
+
+
+def find_bugs(hg_client, bug_ids, bug_commits):
+    logger.info("Finding bugs...")
+    backed_out_revisions = []
+
+    for bug_id in bug_ids:
+        bug_id_commits = bug_commits.get(bug_id, None)
+        backing_out_commit = find_backing_out_commit(bug_id_commits, hg_client)
+
+        if not backing_out_commit:
+            continue
+
+        logger.info(f"Backing out commit found for bug {bug_id}: {backing_out_commit}")
+
+        commits = [
+            {
+                "desc": c["desc"],
+            }
+            for c in bug_id_commits
+            if any(
+                c["node"].startswith(node) for node in backing_out_commit["backsout"]
+            )
+        ]
+
+        if not commits:
+            continue
+
+        for commit in commits:
+            revision_id = repository.get_revision_id(commit)
+            backed_out_revisions.append(revision_id)
+
+    return backed_out_revisions
+
+
+def find_backing_out_commit(commits, hg_client):
+    logger.info("Finding backing out commit...")
+    if not commits:
+        return None
+
+    backout_commits = [commit for commit in commits if commit["backsout"]]
+    if len(backout_commits) > 1:
+        logger.info("Multiple backouts detected, skipping this bug.")
+        return None
+
+    for commit in commits:
+        if not commit["backsout"]:
+            continue
+
+        desc = commit["desc"]
+        if (
+            "backed out" in desc.lower()
+            and "for causing" in desc.lower()
+            and "build" in desc.lower()
+        ):
+            return commit
+    return None
+
+
+def find_error_lines(index_client, queue_client, commit_node):
+    # Walk from the commit's index entry to the logs of its failed build tasks.
+    # 1. list the tasks
+    tasks = index_client.listTasks(f"gecko.v2.autoland.revision.{commit_node}.firefox")
+
+    if not tasks["tasks"]:
+        return []
+
+    # 2. get the task ID from one of the tasks (I think any is fine)
+    first_task_id = tasks["tasks"][0]["taskId"]
+
+    # 3. get the task group ID from the task ID
+    first_task = queue_client.task(first_task_id)
+    task_group_id = first_task["taskGroupId"]
+
+    # 4. extract the build task IDs from the task group ID
+    url = f"https://firefoxci.taskcluster-artifacts.net/{task_group_id}/0/public/label-to-taskid.json"
+    response = requests.get(url)
+    response.raise_for_status()
+    data = response.json()
+
+    build_tasks = set()
+
+    for label, taskId in data.items():
+        if label.startswith("build"):
+            build_tasks.add(taskId)
+
+    # 5. get failed tasks
+    failed_tasks = set()
+
+    for task in queue_client.listTaskGroup(task_group_id)["tasks"]:
+        if task["status"]["state"] == "failed":
+            failed_tasks.add(task["status"]["taskId"])
+
+    # 6. find intersection between build tasks and failed tasks
+    failed_build_tasks = list(build_tasks & failed_tasks)
+
+    # 7. get the url to access the log, load it, and extract the ERROR lines
+    error_lines = []
+
+    for failed_build_task in failed_build_tasks:
+        artifact = queue_client.getArtifact(
+            taskId=failed_build_task, runId="0", name="public/logs/live.log"
+        )
+        url = artifact["url"]
+
+        response = requests.get(url)
+        error_lines.extend(
+            [line for line in response.text.split("\n") if "ERROR - " in line]
+        )
+
+    return error_lines
+
+
+def main():
+    # 0. Download the bugs/commits/revisions databases.
+    download_databases()
+
+    # 1. Find bugs that were backed out for causing build failures.
+    bugs = get_backed_out_build_failure_bugs()
+    bug_ids = list(bugs.keys())
+
+    # 2. Map those bugs to their commits.
+    bug_commits = map_bugs_to_commit(bug_ids)
+
+    # 3. Resolve the backed-out revision IDs from the bugs.
+    hg_client = Revision()
+    backed_out_revisions = find_bugs(hg_client, bug_ids, bug_commits)
+
+    # 4. Map each backed-out revision to its landed commits.
+    revisions_to_commits = defaultdict(list)
+
+    for commit in repository.get_commits():
+        revision_id = repository.get_revision_id(commit)
+
+        if revision_id in backed_out_revisions:
+            revisions_to_commits[revision_id].append(commit["node"])
+
+    # 5. and 6. Compute the interdiff and collect build error lines per revision.
+
+    client_id = os.getenv("TC_CLIENT_ID")
+
+    index = taskcluster.Index(
+        {
+            "rootUrl": "https://firefox-ci-tc.services.mozilla.com",
+            "credentials": {"clientId": client_id},
+        }
+    )
+
+    queue = taskcluster.Queue(
+        {
+            "rootUrl": "https://firefox-ci-tc.services.mozilla.com",
+        }
+    )
+
+    with open("revisions.csv", mode="w", newline="", encoding="utf-8") as file:
+        writer = csv.writer(file)
+
+        writer.writerow(
+            ["Revision ID", "Initial Commit", "Fix Commit", "Interdiff", "Error Lines"]
+        )
+
+        for revision_id, commits in revisions_to_commits.items():
+            if len(commits) < 2:
+                # Need both an initial commit and a fix commit to interdiff.
+                continue
+
+            for commit in commits:
+                error_lines = find_error_lines(index, queue, commit)
+
+                if error_lines:
+                    break
+
+            commit_diff = repository.get_diff(
+                repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[-1]
+            )
+
+            commit_diff_encoded = commit_diff.decode("utf-8", errors="replace")
+
+            writer.writerow(
+                [revision_id, commits[0], commits[-1], commit_diff_encoded, error_lines]
+            )
+
+
+if __name__ == "__main__":
+    main()
+
+# 0. Download databases
+# 1. Identify bugs in Bugzilla that have a backout due to build failures
+# 2. Map only these bugs' commits to the bug ID in a dict
+# 3. Find the revision from the bug
+# 4. Map the revision to the commits
+# 5. Get the interdiff
+# 6. Find error lines in the interdiff