diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 81b6095712..5195d69cd2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -57,16 +57,19 @@ jobs:
           if [[ "$COMMIT" == *"test-upstream"* || ${{ github.event_name }} == "schedule" ]]
           then
             export TEST_UPSTREAM="true"
+            export AB_BASELINE="coiled-upstream-py3.9 coiled-0.1.0-py3.9"
           else
             export TEST_UPSTREAM="false"
+            export AB_BASELINE="coiled-latest-py3.9 coiled-0.1.0-py3.9"
           fi
 
           # Put TEST_UPSTREAM into $GITHUB_ENV so it can be used in subsequent workflow steps
           echo $TEST_UPSTREAM
           echo TEST_UPSTREAM=$TEST_UPSTREAM >> $GITHUB_ENV
 
-          # Put TEST_UPSTREAM into a file so it can be downloaded in subsequent workflow jobs
+          # Put env variables into files so they can be downloaded in subsequent workflow jobs
           echo $TEST_UPSTREAM > test_upstream.txt
+          echo $AB_BASELINE > ab_baseline.txt
 
       - name: Build Coiled Software Environment
         env:
@@ -104,6 +107,7 @@ jobs:
             latest.yaml
             software_name.txt
             test_upstream.txt
+            ab_baseline.txt
 
   runtime:
     name: Runtime - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }}
@@ -437,7 +441,7 @@ jobs:
   static-site:
     needs: process-results
     # Always generate the site, as this can be skipped even if an indirect dependency fails (like a test run)
-    if: always() && github.ref == 'refs/heads/main' && github.repository == 'coiled/coiled-runtime'
+    if: always()
     name: Build static dashboards
     runs-on: ubuntu-latest
     steps:
@@ -457,12 +461,24 @@ jobs:
           python-version: "3.9"
           environment-file: ci/environment-dashboard.yml
 
+      - name: Download software environment assets
+        uses: actions/download-artifact@v3
+        with:
+          name: software-environment-py3.9
+
       - name: Generate dashboards
         run: |
-          python dashboard.py
+          python dashboard.py -d benchmark.db -o static -b $(cat ab_baseline.txt)
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: static-dashboard
+          path: static
 
       - name: Deploy 🚀
         uses: JamesIves/github-pages-deploy-action@4.1.7
+        if: github.ref == 'refs/heads/main' && github.repository == 'coiled/coiled-runtime'
         with:
           branch: gh-pages
           folder: static
diff --git a/.gitignore b/.gitignore
index c3d511b0c9..272b49eb25 100644
--- a/.gitignore
+++ b/.gitignore
@@ -120,6 +120,9 @@ venv.bak/
 # Rope project settings
 .ropeproject
 
+# PyCharm project settings
+.idea
+
 # mkdocs documentation
 /site
 
diff --git a/ci/environment-dashboard.yml b/ci/environment-dashboard.yml
index bf298d61e6..4ebdbd3ca1 100644
--- a/ci/environment-dashboard.yml
+++ b/ci/environment-dashboard.yml
@@ -11,6 +11,7 @@ dependencies:
   - dask
   - dask-ml
   - distributed
+  - xarray
   - xgboost
   - pandas
   - tabulate
diff --git a/dashboard.py b/dashboard.py
index 42b03d91ff..78b0669248 100644
--- a/dashboard.py
+++ b/dashboard.py
@@ -1,12 +1,11 @@
 from __future__ import annotations
 
-import collections
+import argparse
 import glob
 import importlib
 import inspect
-import os
 import pathlib
-import sys
+from typing import Literal, NamedTuple
 
 import altair
 import pandas
@@ -17,62 +16,181 @@
 panel.extension("vega")
 
 
-def get_test_source():
-    """
-    Crawl the tests directory and try to grab code for each test on a best-effort
-    basis. This relies on the tests being importable from this script, so the
-    environment should be similar enough that that is possible.
+class ChartSpec(NamedTuple):
+    field_name: str
+    field_desc: str
+    unit: str
+    scale: float
+
+
+SPECS = [
+    ChartSpec("duration", "Wall Clock", "(s)", 1),
+    ChartSpec("average_memory", "Average Memory", "(GiB)", 2**30),
+    ChartSpec("peak_memory", "Peak Memory", "(GiB)", 2**30),
+]
+
+
+source: dict[str, str] = {}
+
+
+def load_test_source() -> None:
+    """Crawl the tests directory and try to grab code for each test. This relies on the
+    tests being importable from this script.
     """
-    source: dict[str, str] = {}
-    files = glob.glob("tests/**/test_*.py", recursive=True)
-    for f in files:
+    for fname in glob.iglob("tests/**/test_*.py", recursive=True):
         try:
-            # Fragile!
-            mod = importlib.import_module(f.replace("/", ".")[: -len(".py")])
-            tests = [a for a in dir(mod) if a.startswith("test_")]
-            for test in tests:
-                if fn := getattr(mod, test, None):
-                    if not callable(fn):
-                        continue
-                    source[f[len("tests/") :] + "::" + test] = inspect.getsource(fn)
-        except BaseException:  # Some pytest exceptions inherit directly from BaseException
-            pass
-    return source
+            mod = importlib.import_module(fname.replace("/", ".")[: -len(".py")])
+        # Some pytest exceptions inherit directly from BaseException
+        except BaseException as e:
+            print(f"Could not import {fname}: {e.__class__.__name__}: {e}")
+            continue
+        tests = [a for a in dir(mod) if a.startswith("test_")]
+        for test in tests:
+            if (func := getattr(mod, test, None)) and callable(func):
+                # FIXME missing decorators, namely @pytest.mark.parametrize
+                source[fname[len("tests/") :] + "::" + test] = inspect.getsource(func)
+
+    print(f"Discovered {len(source)} tests")
 
 
-source = get_test_source()
+def align_to_baseline(df: pandas.DataFrame, baseline: str) -> pandas.DataFrame | None:
+    """Add columns
+    - duration_baseline
+    - average_memory_baseline
+    - peak_memory_baseline
+    - duration_delta (A/B - 1)
+    - average_memory_delta (A/B - 1)
+    - peak_memory_delta (A/B - 1)
 
-def make_timeseries(originalname, df, spec) -> altair.Chart | None:
+    Baseline values are from the matching rows given the same test name and the baseline
+    runtime. Note that this means that df is expected to have exactly 1 test in the
+    baseline runtime for each test in every other runtime.
     """
-    Make a single timeseries altair chart for a given test.
+    df_baseline = df[df["runtime"] == baseline]
 
-    originalname: str
-        The name of the test without any fixture or other modifications.
+    if df_baseline.empty:
+        # Typically a misspelling. However, this can legitimately happen in CI if all
+        # three jobs of the baseline runtime failed early.
+        print(
+            f"Baseline runtime {baseline!r} not found; valid choices are:",
+            ", ".join(df["runtime"].unique()),
+        )
+        return None
 
-    df: pandas.DataFrame
-        A dataframe with the test data in it.
+    baseline_names = df_baseline["fullname"].unique()
+    all_names = df["fullname"].unique()
 
-    spec: ChartSpec
-        Data for how to render the timeseries
-    """
-    df = df.dropna(subset=[spec.field, "start"])
+    assert len(baseline_names) == df_baseline.shape[0]
+    if len(baseline_names) < len(all_names):
+        # This will happen in CI if one or two out of three jobs of the baseline failed.
+        # Note that df contains the latest run only. It means that tests on all runtimes
+        # (including historical ones) should be from the coiled-runtime git tip, so
+        # adding or removing tests should not cause a mismatch.
+        print(
+            f"Baseline runtime {baseline!r} is missing some tests:",
+            ", ".join(set(all_names) - set(baseline_names)),
+        )
+        return None
+
+    columns = [spec.field_name for spec in SPECS]
+    df_baseline = (
+        df_baseline.set_index("fullname")
+        .loc[df["fullname"], columns]
+        .rename(columns={k: k + "_baseline" for k in columns})
+    )
+    df_baseline.index = df.index
+    df = pandas.concat([df, df_baseline], axis=1)
+    for column in columns:
+        df[column + "_delta"] = (df[column] / df[column + "_baseline"] - 1) * 100
+    return df
+
+
+def make_barchart(
+    df: pandas.DataFrame,
+    spec: ChartSpec,
+    title: str,
+    baseline: str | None,
+) -> altair.Chart | None:
+    """Make a single Altair barchart for a given test or runtime"""
+    df = df.dropna(subset=[spec.field_name, "start"])
+    if not len(df):
+        # Some tests do not have average_memory or peak_memory measures, only runtime
+        return None
+
+    fields = [
+        spec.field_name,
+        "fullname",
+        "fullname_no_category",
+        "dask_version",
+        "distributed_version",
+        "runtime",
+    ]
+
+    height = df.shape[0] * 20 + 50
+    tooltip = [
+        altair.Tooltip("fullname:N", title="Test"),
+        altair.Tooltip("dask_version:N", title="Dask"),
+        altair.Tooltip("distributed_version:N", title="Distributed"),
+        altair.Tooltip(f"{spec.field_name}:Q", title=f"{spec.field_desc} {spec.unit}"),
+    ]
+
+    by_test = len(df["fullname"].unique()) == 1
+    if by_test:
+        df = df.sort_values("runtime", key=runtime_sort_key_pd)
+        y = altair.Y("runtime", title="Runtime", sort=None)
+    else:
+        y = altair.Y("fullname_no_category", title="Test name")
+
+    if baseline:
+        fields += [
+            f"{spec.field_name}_delta",
+            f"{spec.field_name}_baseline",
+        ]
+        x = altair.X(
+            f"{spec.field_name}_delta",
+            title=f"{spec.field_desc} (delta % from {baseline})",
+        )
+        tooltip += [
+            altair.Tooltip(
+                f"{spec.field_name}_baseline:Q", title=f"{baseline} {spec.unit}"
+            ),
+            altair.Tooltip(f"{spec.field_name}_delta:Q", title="Delta %"),
+        ]
+    else:
+        x = altair.X(spec.field_name, title=f"{spec.field_desc} {spec.unit}")
+
+    return (
+        altair.Chart(df[fields], width=800, height=height)
+        .mark_bar()
+        .encode(x=x, y=y, tooltip=tooltip)
+        .properties(title=title)
+        .configure(autosize="fit")
+    )
+
+
+def make_timeseries(
+    df: pandas.DataFrame, spec: ChartSpec, title: str
+) -> altair.Chart | None:
+    """Make a single Altair timeseries chart for a given test"""
+    df = df.dropna(subset=[spec.field_name, "start"])
+    if not len(df):
+        # Some tests do not have average_memory or peak_memory measures, only runtime
+        return None
+
     df = df.fillna({"ci_run_url": "https://github.com/coiled/coiled-runtime"})
-    path = df.path.iloc[0]
     kwargs = {}
     # Reduce the size of the altair spec
     df = df[
         [
-            spec.field,
+            spec.field_name,
             "start",
             "ci_run_url",
             "name",
             "call_outcome",
             "coiled_runtime_version",
             "dask_version",
+            "distributed_version",
         ]
     ]
     if len(df.name.unique()) > 1:
@@ -88,7 +206,7 @@ def make_timeseries(originalname, df, spec) -> altair.Chart | None:
         .mark_line(point=altair.OverlayMarkDef(size=64))
         .encode(
             x=altair.X("start:T"),
-            y=altair.Y(f"{spec.field}:Q", title=spec.label),
+            y=altair.Y(f"{spec.field_name}:Q", title=f"{spec.field_desc} {spec.unit}"),
             href=altair.Href("ci_run_url:N"),
             tooltip=[
                 altair.Tooltip("name:N", title="Test Name"),
@@ -96,66 +214,268 @@ def make_timeseries(originalname, df, spec) -> altair.Chart | None:
                 altair.Tooltip("call_outcome:N", title="Test Outcome"),
                 altair.Tooltip("coiled_runtime_version:N", title="Coiled Runtime"),
altair.Tooltip("dask_version:N", title="Dask"), - altair.Tooltip(f"{spec.field}:Q", title=spec.label), + altair.Tooltip("distributed_version:N", title="Distributed"), + altair.Tooltip( + f"{spec.field_name}:Q", title=f"{spec.field_desc} {spec.unit}" + ), altair.Tooltip("ci_run_url:N", title="CI Run URL"), ], **kwargs, ) - .properties(title=f"{path}::{originalname}") + .properties(title=title) .configure(autosize="fit") ) -def make_test_report(group_keys, df): - """ - Make a tab panel for a single test. - - originalname: str - The name of the test without any fixture or other modifications. - - df: pandas.DataFrame - A dataframe with the test data in it. - """ - path, originalname = group_keys - - ChartSpec = collections.namedtuple("ChartSpec", ["field", "scale", "label"]) - specs = [ - ChartSpec("duration", 1, "Wall Clock (s)"), - ChartSpec("average_memory", 1024**3, "Average Memory (GiB)"), - ChartSpec("peak_memory", 1024**3, "Peak Memory (GiB)"), - ] +def make_test_report( + df: pandas.DataFrame, + kind: Literal["barchart" | "timeseries"], + title: str, + sourcename: str | None = None, + baseline: str | None = None, +) -> panel.Tabs: + """Make a tab panel for a single test""" tabs = [] - for s in specs: - chart = make_timeseries(originalname, df, s) + for spec in SPECS: + if kind == "timeseries": + assert not baseline + chart = make_timeseries(df, spec, title) + else: + chart = make_barchart(df, spec, title, baseline) if not chart: continue - tabs.append((s.label, chart)) + tabs.append((spec.field_desc, chart)) + + if kind == "timeseries": + height = 384 + else: + height = df.shape[0] * 20 + 50 - sourcename = path + "::" + originalname if sourcename in source: code = panel.pane.Markdown( f"```python\n{source[sourcename]}\n```", width=800, - height=384, + height=height, style={"overflow": "auto"}, ) tabs.append(("Source", code)) + elif sourcename is not None: + print("Source code not found for", sourcename) + return panel.Tabs(*tabs, margin=12, width=800) -if __name__ == "__main__": - DB_NAME = ( - sys.argv[1] if len(sys.argv) > 1 else os.environ.get("DB_NAME", "benchmark.db") +def make_timeseries_html_report( + df: pandas.DataFrame, output_dir: pathlib.Path, runtime: str +) -> None: + """Generate HTML report for one runtime (e.g. coiled-upstream-py3.9), showing + evolution of measures (wall clock, average memory, peak memory) over historical CI + runs. + + Create one tab for each test category (e.g. benchmarks, runtime, stability), + one graph for each test, + and one graph tab for each measure (wall clock, average memory, peak memory). + """ + out_fname = str(output_dir.joinpath(runtime + ".html")) + print(f"Generating {out_fname}") + categories = sorted(df[df.runtime == runtime].category.unique()) + tabs = [] + for category in categories: + df_by_test = ( + df[(df.runtime == runtime) & (df.category == category)] + .sort_values("sourcename") + .groupby("sourcename") + ) + panes = [ + make_test_report( + df_by_test.get_group(sourcename), + kind="timeseries", + title=sourcename, + sourcename=sourcename, + ) + for sourcename in df_by_test.groups + ] + flex = panel.FlexBox(*panes, align_items="start", justify_content="start") + tabs.append((category.title(), flex)) + doc = panel.Tabs(*tabs, margin=12) + + doc.save(out_fname, title=runtime, resources=INLINE) + + +def make_ab_html_report( + df: pandas.DataFrame, + output_dir: pathlib.Path, + by_test: bool, + baseline: str | None, +) -> None: + """Generate HTML report for the latest CI run, comparing all runtimes (e.g. 
+    coiled-upstream-py3.9) against a baseline runtime.
+
+    Create one tab for each test category (e.g. benchmarks, runtime, stability),
+    one graph for each runtime and one bar for each test
+    OR one graph for each test and one bar for each runtime,
+    and one graph tab for each measure (wall clock, average memory, peak memory).
+
+    If a baseline runtime is defined, all measures are expressed relative to the
+    baseline; otherwise they're expressed in absolute terms.
+    """
+    out_fname = str(
+        output_dir.joinpath(
+            "AB_by_"
+            + ("test" if by_test else "runtime")
+            + (f"_vs_{baseline}" if baseline else "")
+            + ".html"
+        )
+    )
+    print(f"Generating {out_fname}")
+
+    categories = sorted(df.category.unique())
+    tabs = []
+    for category in categories:
+        if by_test:
+            df_by_test = (
+                df[df.category == category]
+                .sort_values(["sourcename", "fullname"])
+                .groupby(["sourcename", "fullname"])
+            )
+            panes = [
+                make_test_report(
+                    df_by_test.get_group((sourcename, fullname)),
+                    kind="barchart",
+                    title=fullname,
+                    sourcename=sourcename,
+                    baseline=baseline,
+                )
+                for sourcename, fullname in df_by_test.groups
+            ]
+        else:
+            df_by_runtime = (
+                df[df.category == category]
+                .sort_values("runtime", key=runtime_sort_key_pd)
+                .groupby("runtime")
+            )
+            panes = [
+                make_test_report(
+                    df_by_runtime.get_group(runtime),
+                    kind="barchart",
+                    title=runtime,
+                    baseline=baseline,
+                )
+                for runtime in sorted(df_by_runtime.groups, key=runtime_sort_key)
+                if runtime != baseline
+            ]
+        flex = panel.FlexBox(*panes, align_items="start", justify_content="start")
+        tabs.append((category.title(), flex))
+    doc = panel.Tabs(*tabs, margin=12)
+
+    doc.save(
+        out_fname,
+        title="A/B by "
+        + ("test" if by_test else "runtime")
+        + (f" vs. {baseline}" if baseline else ""),
+        resources=INLINE,
+    )
+
+
+def make_index_html_report(
+    output_dir: pathlib.Path, runtimes: list[str], baselines: list[str]
+) -> None:
+    """Generate index.html"""
+    index_txt = """# Coiled Runtime Benchmarks\n"""
+    index_txt += "### Historical timeseries\n"
+    for runtime in runtimes:
+        index_txt += f"- [{runtime}](./{runtime}.html)\n"
+    index_txt += "\n\n### A/B tests\n"
+    index_txt += "- [by test](./AB_by_test.html)\n"
+    index_txt += "- [by runtime](./AB_by_runtime.html)\n"
+    for baseline in baselines:
+        index_txt += (
+            f"- [by runtime vs. {baseline}](./AB_by_runtime_vs_{baseline}.html)\n"
+        )
+
+    index = panel.pane.Markdown(index_txt, width=800)
+    out_fname = str(output_dir.joinpath("index.html"))
+    print(f"Generating {out_fname}")
+    index.save(
+        out_fname,
+        title="Coiled Runtime Benchmarks",
+        resources=INLINE,
+    )
+
+
+def runtime_sort_key(runtime: str) -> tuple:
+    """Runtimes are in the format coiled-<coiled-runtime version>-py<python version>,
+    e.g. coiled-latest-py3.8
+
+    Sort them by coiled-runtime and python version, both descending.
+ """ + t = runtime.split("-") + assert len(t) == 3 + assert t[0] == "coiled" + # upstream > latest > 0.1.0 > 0.0.4 + if t[1] == "upstream": + coiled_version = [-2] + elif t[1] == "latest": + coiled_version = [-1] + else: + coiled_version = [0] + [-int(v) for v in t[1].split(".")] + + assert t[2][:2] == "py" + py_version = [-int(v) for v in t[2][2:].split(".")] + return coiled_version, py_version + + +def runtime_sort_key_pd(s: pandas.Series) -> pandas.Series: + return pandas.Series([runtime_sort_key(v) for v in s], index=s.index) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate a static HTML report comparing metrics from the runs" + ) + parser.add_argument( + "--db-file", + "-d", + help="Path to SQLite database file containing the metrics", + ) + parser.add_argument( + "--output-dir", + "-o", + help="Output directory", + default="build/html", + ) + parser.add_argument( + "--baseline", + "-b", + nargs="+", + default=[], + help="Baseline runtime(s) for A/B comparison", + ) + parser.add_argument( + "--pickle", + action="store_true", + help="Dump raw dataframe to pickle file", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + output_dir = pathlib.Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + load_test_source() + + # Load SQLite database into a pandas DataFrame + engine = sqlalchemy.create_engine(f"sqlite:///{args.db_file}") df = pandas.read_sql( - "select * from test_run where platform = 'linux' and call_outcome in ('passed', 'failed')", + "select * from test_run where platform = 'linux' " + "and call_outcome in ('passed', 'failed')", engine, ) df = df.assign( + start=pandas.to_datetime(df.start), + end=pandas.to_datetime(df.end), runtime=( "coiled-" + df.coiled_runtime_version @@ -163,35 +483,45 @@ def make_test_report(group_keys, df): + df.python_version.str.split(".", n=2).str[:2].str.join(".") ), category=df.path.str.split("/", n=1).str[0], + sourcename=df.path.str.cat(df.originalname, "::"), + fullname=df.path.str.cat(df.name, "::"), + fullname_no_category=df.path.str.partition("/")[2].str.cat(df.name, "::"), ) + for spec in SPECS: + df[spec.field_name] /= spec.scale + df = df.set_index("id") + + if args.pickle: + out_fname = str(output_dir.joinpath("records.pickle")) + print(f"Generating {out_fname}") + df.to_pickle(out_fname) - runtimes = list(df.runtime.unique()) + # Generate HTML pages + runtimes = sorted(df.runtime.unique(), key=runtime_sort_key) for runtime in runtimes: - print(f"Generating dashboard for {runtime}") - categories = df[df.runtime == runtime].category.unique() - tabs = [] - for category in categories: - by_test = ( - df[(df.runtime == runtime) & (df.category == category)] - .sort_values(["path", "originalname"]) - .groupby(["path", "originalname"]) - ) - panes = [ - make_test_report(test_name, by_test.get_group(test_name)) - for test_name in by_test.groups - ] - flex = panel.FlexBox(*panes, align_items="start", justify_content="start") - tabs.append((category.title(), flex)) - doc = panel.Tabs(*tabs, margin=12) + make_timeseries_html_report(df, output_dir, runtime) - doc.save( - str(static.joinpath(runtime + ".html")), title=runtime, resources=INLINE - ) - index = """# Coiled Runtime Benchmarks\n\n""" - index += "\n\n".join([f"[{r}](./{r}.html)" for r in reversed(sorted(runtimes))]) - index = panel.pane.Markdown(index, width=800) - index.save( - str(static.joinpath("index.html")), - title="Coiled Runtime Benchmarks", - resources=INLINE, - ) + # Select 
only the latest run for each runtime. This may pick up historical runs (up + # to 6h old) if they have not been rerun in the current pull/PR. + # TODO This is fragile. Keep the latest and historical databases separate, or record + # the coiled-runtime git hash and use it to filter? + max_end = df.sort_values("end").groupby(["runtime", "category"]).tail(1) + max_end = max_end[max_end["end"] > max_end["end"].max() - pandas.Timedelta("6h")] + session_ids = max_end["session_id"].unique() + latest_run = df[df["session_id"].isin(session_ids)] + + make_ab_html_report(latest_run, output_dir, by_test=True, baseline=None) + make_ab_html_report(latest_run, output_dir, by_test=False, baseline=None) + baselines = [] + for baseline in args.baseline: + df_baseline = align_to_baseline(latest_run, baseline) + if df_baseline is None: + continue + baselines.append(baseline) + make_ab_html_report(df_baseline, output_dir, by_test=False, baseline=baseline) + + make_index_html_report(output_dir, runtimes, baselines) + + +if __name__ == "__main__": + main()
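A minimal sketch of how the reworked dashboard.py is expected to be driven locally, assuming the ci/environment-dashboard.yml environment is installed, a benchmark.db produced by a benchmark run sits next to dashboard.py, and the runtime names below are purely illustrative:

# Sketch only: mirrors the CI step above.
#   python dashboard.py -d benchmark.db -o build/html -b coiled-latest-py3.9
from dashboard import runtime_sort_key

runtimes = [
    "coiled-0.0.4-py3.8",
    "coiled-0.1.0-py3.9",
    "coiled-latest-py3.9",
    "coiled-upstream-py3.9",
]
# upstream sorts ahead of latest, then released versions, newest Python first
print(sorted(runtimes, key=runtime_sort_key))
# ['coiled-upstream-py3.9', 'coiled-latest-py3.9', 'coiled-0.1.0-py3.9', 'coiled-0.0.4-py3.8']

# Each *_delta column added by align_to_baseline() is a percentage relative to the baseline:
duration, duration_baseline = 12.0, 10.0
print((duration / duration_baseline - 1) * 100)  # 20.0 -> 20% slower than the baseline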