Show actual Job URL on test report #6837

Merged (2 commits) on Aug 8, 2022

3 changes: 2 additions & 1 deletion .gitignore
@@ -33,7 +33,8 @@ tags
.mypy_cache/

reports/
test_report.*
test_report*.html
test_report*.db
test_short_report.html

# Test failures will dump the cluster state in here
148 changes: 101 additions & 47 deletions continuous_integration/scripts/test_report.py
@@ -9,7 +9,7 @@
import sys
import zipfile
from collections.abc import Iterator
from typing import Any
from typing import Any, Iterable, cast

import altair
import altair_saver
@@ -102,6 +102,39 @@ def maybe_get_next_page_path(response: requests.Response) -> str | None:
return next_page_path


def get_jobs(workflow):
with shelve.open("test_report_jobs") as cache:
url = workflow["jobs_url"]
try:
jobs = cache[url]
except KeyError:
params = {"per_page": 100}
r = get_from_github(workflow["jobs_url"], params)
jobs = r.json()["jobs"]
while next_page := maybe_get_next_page_path(r):
r = get_from_github(next_page, params=params)
jobs.extend(r.json()["jobs"])
cache[url] = jobs

df_jobs = pandas.DataFrame.from_records(jobs)
extracted = df_jobs.name.str.extract(
r"\(([\w\-]+), (\d\.\d+),\s([\w|\s]+)\)"
).dropna()
df_jobs["OS"] = extracted[0]
df_jobs["python_version"] = extracted[1]
df_jobs["partition"] = extracted[2]
# We later need to join on this. The job ID is not part of the workflow schema, so this composite name is the only key we can join on.
df_jobs["suite_name"] = (
df_jobs["OS"]
+ "-"
+ df_jobs["python_version"]
+ "-"
+ df_jobs["partition"].str.replace(" ", "")
)

return df_jobs
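
The get_jobs helper above builds the composite suite_name that the rest of the script uses to match artifacts to jobs. Below is a minimal, standalone sketch of that construction; the job names and URLs are made-up examples shaped like "test (ubuntu-latest, 3.9, not ci1)", which is an assumption about how the CI jobs are named.

import pandas

# Hypothetical job records; real ones come from the GitHub Actions jobs API.
jobs = pandas.DataFrame.from_records(
    [
        {"name": "test (ubuntu-latest, 3.9, not ci1)", "html_url": "https://github.com/org/repo/runs/1"},
        {"name": "test (windows-latest, 3.10, ci1)", "html_url": "https://github.com/org/repo/runs/2"},
    ]
)
extracted = jobs["name"].str.extract(r"\(([\w\-]+), (\d\.\d+),\s([\w|\s]+)\)").dropna()
jobs["suite_name"] = extracted[0] + "-" + extracted[1] + "-" + extracted[2].str.replace(" ", "")
print(jobs["suite_name"].tolist())  # ['ubuntu-latest-3.9-notci1', 'windows-latest-3.10-ci1']

Stripping the space from the partition is what lines the result up with the artifact names that download_and_parse_artifacts later asserts against.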


def get_workflow_listing(repo: str, branch: str, event: str, days: int) -> list[dict]:
"""
Get a list of workflow runs from GitHub actions.
@@ -153,26 +186,32 @@ def download_and_parse_artifact(url: str) -> junitparser.JUnitXml | None:
"""
Download the artifact at the url and parse it.
"""
with shelve.open("test_report") as cache:
try:
xml_raw = cache[url]
except KeyError:
r = get_from_github(url, params={})
f = zipfile.ZipFile(io.BytesIO(r.content))
cache[url] = xml_raw = f.read(f.filelist[0].filename)
try:
r = get_from_github(url, params={})
f = zipfile.ZipFile(io.BytesIO(r.content))
run = junitparser.JUnitXml.fromstring(f.read(f.filelist[0].filename))
return run
return junitparser.JUnitXml.fromstring(xml_raw)
except Exception:
print(f"Failed to download/parse {url}")
# Downloaded artifacts also include things like the schedule, which is plain JSON rather than JUnit XML
return None
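
download_and_parse_artifact above now caches the raw XML bytes in a shelve database keyed by artifact URL, so repeated runs skip the download. The following is a minimal sketch of that caching pattern only; fetch_artifact_bytes is a hypothetical stand-in for the real download-and-unzip step, not part of the PR.

import shelve


def fetch_artifact_bytes(url: str) -> bytes:
    # Hypothetical stand-in for downloading the artifact zip and reading its first member.
    raise NotImplementedError


def cached_artifact_bytes(url: str, cache_path: str = "test_report") -> bytes:
    with shelve.open(cache_path) as cache:
        try:
            return cache[url]  # cache hit: no network round trip
        except KeyError:
            cache[url] = data = fetch_artifact_bytes(url)
            return data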


def dataframe_from_jxml(run: list) -> pandas.DataFrame:
def dataframe_from_jxml(run: Iterable) -> pandas.DataFrame:
"""
Turn a parsed JXML into a pandas dataframe
"""
fname = []
tname = []
status = []
message = []
sname = []
for suite in run:
for test in suite:
sname.append(suite.name)
fname.append(test.classname)
tname.append(test.name)
s = "✓"
@@ -195,7 +234,13 @@ def dataframe_from_jxml(run: list) -> pandas.DataFrame:
status.append(s)
message.append(html.escape(m))
df = pandas.DataFrame(
{"file": fname, "test": tname, "status": status, "message": message}
{
"file": fname,
"test": tname,
"status": status,
"message": message,
"suite_name": sname,
}
)

# There are sometimes duplicate tests in the report for some unknown reason.
@@ -212,7 +257,7 @@ def dedup(group):
else:
return group

return df.groupby(["file", "test"]).agg(dedup)
return df.groupby(["file", "test"], as_index=False).agg(dedup)
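
dataframe_from_jxml above now records suite.name for every test, which is what later lets the report look up the matching job. A small sketch of where that value comes from when junitparser walks a parsed report; the XML snippet is an invented example of pytest's JUnit output, not taken from a real run.

import junitparser

xml_raw = b"""<testsuites>
  <testsuite name="ubuntu-latest-3.9-notci1" tests="1">
    <testcase classname="distributed.tests.test_utils" name="test_ok" time="0.01"/>
  </testsuite>
</testsuites>"""

run = junitparser.JUnitXml.fromstring(xml_raw)
for suite in run:
    for test in suite:
        print(suite.name, test.classname, test.name)
# ubuntu-latest-3.9-notci1 distributed.tests.test_utils test_ok

The switch to groupby(..., as_index=False) keeps "file" and "test" as ordinary columns rather than index levels, which the column-based selection and groupby in main() below depend on.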


def download_and_parse_artifacts(
@@ -258,40 +303,39 @@ def download_and_parse_artifacts(
ndownloaded = 0
print(f"Downloading and parsing {nartifacts} artifacts...")

cache: shelve.Shelf[pandas.DataFrame | None]
# FIXME https://github.com/python/typeshed/pull/8190
with shelve.open("test_report") as cache: # type: ignore[assignment]
for w in workflows:
w["dfs"] = []
for a in w["artifacts"]:
url = a["archive_download_url"]
df: pandas.DataFrame | None
try:
df = cache[url]
except KeyError:
xml = download_and_parse_artifact(url)
if xml:
df = dataframe_from_jxml(xml)
# Note: we assign a column with the workflow timestamp rather
# than the artifact timestamp so that artifacts triggered under
# the same workflow can be aligned according to the same trigger
# time.
df = df.assign(
name=a["name"],
suite=suite_from_name(a["name"]),
date=w["created_at"],
url=w["html_url"],
)
else:
df = None
cache[url] = df

if df is not None:
yield df

ndownloaded += 1
if ndownloaded and not ndownloaded % 20:
print(f"{ndownloaded}... ", end="")
for w in workflows:
jobs_df = get_jobs(w)
w["dfs"] = []
for a in w["artifacts"]:
url = a["archive_download_url"]
df: pandas.DataFrame | None
xml = download_and_parse_artifact(url)
if xml is None:
continue
df = dataframe_from_jxml(cast(Iterable, xml))
# Note: we assign a column with the workflow timestamp rather
# than the artifact timestamp so that artifacts triggered under
# the same workflow can be aligned according to the same trigger
# time.
html_url = jobs_df[jobs_df["suite_name"] == a["name"]].html_url.unique()
assert (
len(html_url) == 1
), f"Artifact suit name {a['name']} did not match any jobs dataframe {jobs_df['suite_name'].unique()}"
html_url = html_url[0]
assert html_url is not None
df2 = df.assign(
name=a["name"],
suite=suite_from_name(a["name"]),
date=w["created_at"],
html_url=html_url,
)

if df2 is not None:
yield df2

ndownloaded += 1
if ndownloaded and not ndownloaded % 20:
print(f"{ndownloaded}... ", end="")


def main(argv: list[str] | None = None) -> None:
@@ -313,10 +357,20 @@ def main(argv: list[str] | None = None) -> None:
max_workflows=args.max_workflows,
)
)

total = pandas.concat(dfs, axis=0)
# Reduce the size of the DF since the entire thing is encoded in the vega spec
required_columns = [
"test",
"date",
"suite",
"file",
"html_url",
"status",
"message",
]
total = total[required_columns]
grouped = (
total.groupby(total.index)
total.groupby([total.file, total.test])
.filter(lambda g: (g.status == "x").sum() >= args.nfails)
.reset_index()
.assign(test=lambda df: df.file + "." + df.test)
Expand Down Expand Up @@ -356,15 +410,15 @@ def main(argv: list[str] | None = None) -> None:
.encode(
x=altair.X("date:O", scale=altair.Scale(domain=sorted(list(times)))),
y=altair.Y("suite:N", title=None),
href=altair.Href("url:N"),
href=altair.Href("html_url:N"),
color=altair.Color(
"status:N",
scale=altair.Scale(
domain=list(COLORS.keys()),
range=list(COLORS.values()),
),
),
tooltip=["suite:N", "date:O", "status:N", "message:N", "url:N"],
tooltip=["suite:N", "date:O", "status:N", "message:N", "html_url:N"],
)
.properties(title=name)
| altair.Chart(df_agg.assign(_="_"))
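
The chart change above swaps the workflow-level url for the per-job html_url and feeds it to altair's Href channel, which is what makes each cell in the rendered report link to the actual job. Below is a minimal, standalone sketch of that encoding with invented data; it is not the full report chart.

import altair
import pandas

df = pandas.DataFrame(
    {
        "date": ["2022-08-01", "2022-08-02"],
        "suite": ["ubuntu-latest-3.9-notci1"] * 2,
        "status": ["✓", "x"],
        "html_url": [
            "https://github.com/org/repo/actions/runs/1/jobs/10",
            "https://github.com/org/repo/actions/runs/2/jobs/20",
        ],
    }
)

chart = (
    altair.Chart(df)
    .mark_rect()
    .encode(
        x="date:O",
        y="suite:N",
        color="status:N",
        href=altair.Href("html_url:N"),  # clicking a mark opens this URL
        tooltip=["status:N", "html_url:N"],
    )
)
chart.save("href_sketch.html")  # open in a browser; each rectangle is a hyperlink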