Commits (24)
9c60d55
Preliminary dataset creation script
benjaminmah Nov 5, 2024
c3c98c0
Added revision finder for backed out commits
benjaminmah Nov 6, 2024
cb31228
Added return bug array
benjaminmah Nov 12, 2024
23ebc08
Added commit node finder, removed phabricator requirement
benjaminmah Nov 12, 2024
1fce464
Added hg diff functions
benjaminmah Nov 12, 2024
6a9830c
CSV creation
benjaminmah Nov 13, 2024
2287975
Fixed the revision collection script
benjaminmah Nov 18, 2024
2ae3029
Added matrix message reference for log collection
benjaminmah Nov 18, 2024
9a298aa
Fixed assertion error
benjaminmah Nov 19, 2024
df779ea
Added TC API error line search
benjaminmah Nov 20, 2024
f837635
Added error line retrieval in dataset creation
benjaminmah Nov 20, 2024
dc5a83a
Replaced client ID with environment variable
benjaminmah Nov 20, 2024
bbfd499
Fixed comments
benjaminmah Nov 20, 2024
5f4c2af
Uncommented revision finder
benjaminmah Nov 26, 2024
529067c
Refactored code
benjaminmah Nov 29, 2024
dc4544e
Included all revisions of a push
benjaminmah Dec 3, 2024
460e669
Added commit information when identifying backing out commit
benjaminmah Jan 23, 2025
2131fe6
Skipping bugs with multiple backouts
benjaminmah Jan 23, 2025
3d21373
Changed the commit access to the last commit, to handle cases where t…
benjaminmah Apr 22, 2025
f81879d
Removed comments
benjaminmah Apr 23, 2025
e47cc1b
Fixed params to include all bug types
benjaminmah Apr 27, 2025
4d7d807
Changed limit to 2 years
benjaminmah Apr 29, 2025
b9e64dd
Fixed variable name
benjaminmah Apr 29, 2025
3c85c4b
Removed print statement
benjaminmah Apr 30, 2025
63 changes: 63 additions & 0 deletions bugbug/repository.py
@@ -1543,6 +1543,69 @@ def trigger_pull() -> None:
    trigger_pull()


def get_diff(repo_path, original_hash, fix_hash) -> bytes:
    client = hglib.open(repo_path)

    # Remember where the working directory was so we can restore it afterwards.
    # identify() returns bytes with a trailing newline (and a "+" when dirty).
    current_rev = client.identify(id=True).strip(b"+\n")

    try:
        client.rawcommand([b"shelve"])
    except hglib.error.CommandError as e:
        if b"nothing changed" in e.out:
            logger.info(f"Nothing to shelve: {e}")
        else:
            raise RuntimeError("Error occurred while shelving") from e

    # Update to the parent of the fix, then graft the original (backed-out)
    # commit onto it, so that diffing against the fix yields the interdiff.
    parents = client.parents(rev=fix_hash)
    parent_of_fix = parents[0][1]
    client.update(rev=parent_of_fix, clean=True)

    graft_result = graft(
        client, revs=[original_hash], no_commit=True, force=True, tool=":merge"
    )

    if not graft_result:
        return b""

    final_diff = client.diff(
        revs=[fix_hash], ignoreallspace=True, ignorespacechange=True, reverse=True
    )

    client.update(rev=current_rev, clean=True)

    return final_diff
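
# A minimal usage sketch for get_diff() (not part of the patch): it assumes a
# local clone at "hg_dir"; both hashes are hypothetical placeholders.
def _example_get_diff_usage():
    interdiff = get_diff(
        repo_path="hg_dir",
        original_hash="0123456789ab",  # hypothetical backed-out commit
        fix_hash="ba9876543210",  # hypothetical relanded fix
    )
    print(interdiff.decode("utf-8", errors="replace"))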


def graft(client, revs, no_commit=False, force=False, tool=":merge") -> bool:
    """Graft changesets specified by revs into the current repository state.

    Args:
        client: The hglib client.
        revs: A list of the hashes of the commits to be applied to the current repository state.
        no_commit: If True, does not commit and just applies changes in the working directory.
        force: If True, forces the grafts even if the revs are ancestors of the current repository state.
        tool: A string representing a merge tool (see `hg help merge-tools`).

    Returns:
        True on success; a failing graft raises a CommandError instead of returning False.
    """
    args = hglib.util.cmdbuilder(
        b"graft", r=revs, no_commit=no_commit, f=force, tool=tool
    )

    eh = hglib.util.reterrorhandler(args)

    client.rawcommand(args, eh=eh, prompt=auto_resolve_conflict_prompt)

    return True


def auto_resolve_conflict_prompt(max_bytes, current_output):
    # hglib invokes this callback whenever hg prompts interactively during the
    # graft; we answer based on the prompt text accumulated so far.
    if b"was deleted in" in current_output:
        return b"c\n"  # Use the changed version when a file was deleted on one side
    return b"\n"  # Default: accept the prompt's default and proceed


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("repository_dir", help="Path to the repository", action="store")
277 changes: 277 additions & 0 deletions scripts/build_failure_data_collection.py
@@ -0,0 +1,277 @@
import csv
import logging
import os
from collections import defaultdict
from datetime import datetime

import requests
import taskcluster
from dateutil.relativedelta import relativedelta
from libmozdata.bugzilla import Bugzilla
from libmozdata.hgmozilla import Revision
from tqdm import tqdm

from bugbug import bugzilla, db, phabricator, repository

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def download_databases():
    logger.info("Cloning Mercurial repository...")
    repository.clone(repo_dir="hg_dir")

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB, support_files_too=True)


def get_bz_params():
    fields = ["id"]
    two_years_ago = (datetime.now() - relativedelta(years=2)).strftime("%Y-%m-%d")
    params = {
        "include_fields": fields,
        "f1": "creation_ts",
        "o1": "greaterthan",
        "v1": two_years_ago,
        "f2": "longdesc",
        "o2": "allwords",
        "v2": "backed out causing build",
    }
    return params
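
# For illustration only: the same custom-search parameters can be sent straight
# to the Bugzilla REST API (a sketch; the libmozdata Bugzilla wrapper used
# below handles chunking and retries for real runs). Relies on the
# module-level `requests` import.
def _example_direct_bugzilla_query():
    resp = requests.get("https://bugzilla.mozilla.org/rest/bug", params=get_bz_params())
    resp.raise_for_status()
    return [bug["id"] for bug in resp.json()["bugs"]]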


def get_backed_out_build_failure_bugs():
    params = get_bz_params()
    bugs = {}

    def bug_handler(bug, data):
        data[bug["id"]] = bug

    Bugzilla(
        params,
        bughandler=bug_handler,
        bugdata=bugs,
    ).get_data().wait()

    return bugs


def map_bugs_to_commit(bug_ids):
    logger.info("Mapping bugs to their commits...")
    bug_commits = {}

    for commit in tqdm(
        repository.get_commits(
            include_no_bug=True, include_backouts=True, include_ignored=True
        )
    ):
        if commit["bug_id"] not in bug_ids:
            continue

        commit_data = {
            key: commit[key]
            for key in ["node", "bug_id", "pushdate", "backedoutby", "backsout", "desc"]
        }

        bug_commits.setdefault(commit["bug_id"], []).append(commit_data)
Reviewer: How will you make this work when there are multiple landings and backouts? In general, do you care about each transaction, or the end state? Sometimes the end state is a secondary patch that hot-fixes the situation, or a rebase which changes other files/code blocks not originally edited.

I think any solution should account for >1 backout.

benjaminmah (author): Sorry for the late response! The current version of the script skips any revision with multiple backouts. Despite skipping these cases, there are still 1000+ data points within the last 2 years.


    return bug_commits


def find_bugs(hg_client, bug_ids, bug_commits):
    logger.info("Finding bugs...")
    backed_out_revisions = []

    for bug_id in bug_ids:
        bug_id_commits = bug_commits.get(bug_id, None)
        backing_out_commit = find_backing_out_commit(bug_id_commits, hg_client)

        if not backing_out_commit:
            continue

        logger.info(f"Backing out commit found for bug {bug_id}: {backing_out_commit}")

        commits = [
            {
                "desc": c["desc"],
            }
            for c in bug_id_commits
            if any(
                c["node"].startswith(node) for node in backing_out_commit["backsout"]
            )
        ]

        # A list comprehension is never None; skip when nothing matched.
        if not commits:
            continue

        for commit in commits:
            revision_id = repository.get_revision_id(commit)
            backed_out_revisions.append(revision_id)

    return backed_out_revisions


def find_backing_out_commit(commits, hg_client):
    logger.info("Finding backing out commit...")
    if not commits:
        return None

    backout_commits = [commit for commit in commits if commit["backsout"]]
    if len(backout_commits) > 1:
        logger.info("Multiple backouts detected, skipping this bug.")
        return None

    for commit in commits:
        if not commit["backsout"]:
            continue

        desc = commit["desc"]
        if (
            "backed out" in desc.lower()
            and "for causing" in desc.lower()
            and "build" in desc.lower()
        ):
            return commit
Reviewer: in this case why do you need Bugzilla data?

benjaminmah (author): We use the Bugzilla entry to identify backed-out commits. Is there a more accurate method that you may be aware of to identify backed-out commits (aside from the Treeherder DB)?

Reviewer: I thought commits via hg had metadata:
https://hg.mozilla.org/mozilla-central/rev/99a8f2b2b00d85148c743f16db75d9abefb33513

If you want a short list of commits that are backed out, then Bugzilla comment parsing would do.

One value-add method would be to propose a change to the backout process so that there is queryable metadata: every time there is a backout, data is stored in Bugzilla/Treeherder/some database with the original commit, the backed-out commit, related links, the list of failing tasks, and the related error messages. Thinking more, developers relanding would have to go through a process to document what they were relanding and confirm the interdiff :)

Reviewer: my original comment here was assuming you were doing a massive hg log to look at commit history and parse out "backed out" commit messages to build a list.

(A sketch of the hg-metadata approach follows this function.)

    return None
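
# Sketch of the reviewer's suggestion above: hg.mozilla.org exposes backout
# metadata per changeset via its json-rev endpoint (the same API libmozdata's
# Revision wraps). Treating the "backedoutby" field as the authoritative
# signal is an assumption of this sketch, not code from the patch.
def _example_backed_out_by(node, repo="mozilla-central"):
    resp = requests.get(f"https://hg.mozilla.org/{repo}/json-rev/{node}")
    resp.raise_for_status()
    # "backedoutby" is an empty string when the changeset was never backed out.
    return resp.json().get("backedoutby") or None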


def find_error_lines(index_client, queue_client, commit_node):
    # 1. list the tasks indexed for this autoland revision
    tasks = index_client.listTasks(f"gecko.v2.autoland.revision.{commit_node}.firefox")

    if not tasks["tasks"]:
        return []

    # 2. get the task ID from one of the tasks (any of them will do)
    first_task_id = tasks["tasks"][0]["taskId"]

    # 3. get the task group ID from the task ID
    first_task = queue_client.task(first_task_id)
    task_group_id = first_task["taskGroupId"]

    # 4. extract the build task IDs from the task group ID
    url = f"https://firefoxci.taskcluster-artifacts.net/{task_group_id}/0/public/label-to-taskid.json"
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()

    build_tasks = set()

    for label, task_id in data.items():
        if label.startswith("build"):
            build_tasks.add(task_id)

    # 5. get failed tasks
    failed_tasks = set()

    for task in queue_client.listTaskGroup(task_group_id)["tasks"]:
        if task["status"]["state"] == "failed":
            failed_tasks.add(task["status"]["taskId"])

    # 6. find the intersection between build tasks and failed tasks
    failed_build_tasks = list(build_tasks & failed_tasks)
Reviewer: if you use the Treeherder DB (search for fixed_by_commit) instead of Bugzilla to find backouts, you get most of this stuff for free. (A sketch of that approach follows this function.)

    # 7. get the URL for each log, load it, and extract the ERROR lines
    error_lines = []

    for failed_build_task in failed_build_tasks:
        artifact = queue_client.getArtifact(
            taskId=failed_build_task, runId="0", name="public/logs/live.log"
        )
        url = artifact["url"]

        response = requests.get(url)
        error_lines.extend(
            [line for line in response.text.split("\n") if "ERROR - " in line]
        )

    return error_lines
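
# Sketch of the reviewer's Treeherder alternative: the public REST API at
# treeherder.mozilla.org can return the failing jobs for a push directly,
# including per-platform results. The endpoint paths and filter parameters
# here are assumptions based on that public API, not code from this patch.
def _example_treeherder_failed_jobs(revision, repo="autoland"):
    base = f"https://treeherder.mozilla.org/api/project/{repo}"

    push = requests.get(f"{base}/push/", params={"revision": revision}).json()
    push_id = push["results"][0]["id"]

    jobs = requests.get(
        f"{base}/jobs/", params={"push_id": push_id, "result": "busted"}
    ).json()
    # Note: the jobs payload may pair "job_property_names" with row lists
    # rather than returning dicts; callers may need to zip them together.
    return jobs["results"]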


def main():
    # 0.
    download_databases()

    # 1.
    bugs = get_backed_out_build_failure_bugs()
    bug_ids = list(bugs.keys())

    # 2.
    bug_commits = map_bugs_to_commit(bug_ids)

    # 3.
    hg_client = Revision()
    backed_out_revisions = find_bugs(hg_client, bug_ids, bug_commits)

    # 4.
    revisions_to_commits = defaultdict(list)

    for commit in repository.get_commits():
        revision_id = repository.get_revision_id(commit)

        if revision_id in backed_out_revisions:
            revisions_to_commits[revision_id].append(commit["node"])

    # 5. and 6.
    client_id = os.getenv("TC_CLIENT_ID")

    index = taskcluster.Index(
        {
            "rootUrl": "https://firefox-ci-tc.services.mozilla.com",
            "credentials": {"clientId": client_id},
        }
    )

    queue = taskcluster.Queue(
        {
            "rootUrl": "https://firefox-ci-tc.services.mozilla.com",
        }
    )

    with open("revisions.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)

        writer.writerow(
            ["Revision ID", "Initial Commit", "Fix Commit", "Interdiff", "Error Lines"]
        )

        for revision_id, commits in revisions_to_commits.items():
            if len(commits) < 2:
                continue

            for commit in commits:
                error_lines = find_error_lines(index, queue, commit)

                if error_lines:
                    break

            commit_diff = repository.get_diff(
                repo_path="hg_dir", original_hash=commits[0], fix_hash=commits[-1]
            )

            commit_diff_encoded = commit_diff.decode("utf-8", errors="replace")

            # Use commits[-1] so the "Fix Commit" column matches the hash used
            # for the interdiff when a revision has more than two commits.
            writer.writerow(
                [revision_id, commits[0], commits[-1], commit_diff_encoded, error_lines]
            )


if __name__ == "__main__":
    main()
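
# Usage sketch (an assumption, not documented in the patch): run from the repo
# root with a Taskcluster client ID exported, since the script reads
# TC_CLIENT_ID from the environment:
#
#     TC_CLIENT_ID=<your-client-id> python scripts/build_failure_data_collection.py
#
# The results are written to revisions.csv in the current directory.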

# 0. Download databases
# 1. Identify bugs in Bugzilla that have a backout due to build failures X
# 2. Map only these bugs' commits to the bug ID in a dict
# 3. Find the revision from the bug
# 4. Map the revision to the commits
# 5. Get the interdiff
# 6. Find error lines in the interdiff

Reviewer (on step 1): I normally get my regression (backed-out) data from the Treeherder database. The main reason why is that we can get all the tasks that failed, which could help determine whether this is failing ALL builds, or a certain platform or build failure. Not required, but the additional information is good to know.