diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 81b6095712..5195d69cd2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -57,16 +57,19 @@ jobs:
           if [[ "$COMMIT" == *"test-upstream"* || ${{ github.event_name }} == "schedule" ]]
           then
             export TEST_UPSTREAM="true"
+            export AB_BASELINE="coiled-upstream-py3.9 coiled-0.1.0-py3.9"
           else
             export TEST_UPSTREAM="false"
+            export AB_BASELINE="coiled-latest-py3.9 coiled-0.1.0-py3.9"
           fi
 
           # Put TEST_UPSTREAM into $GITHUB_ENV so it can be used in subsequent workflow steps
           echo $TEST_UPSTREAM
           echo TEST_UPSTREAM=$TEST_UPSTREAM >> $GITHUB_ENV
 
-          # Put TEST_UPSTREAM into a file so it can be downloaded in subsequent workflow jobs
+          # Put env variables into files so they can be downloaded in subsequent workflow jobs
           echo $TEST_UPSTREAM > test_upstream.txt
+          echo $AB_BASELINE > ab_baseline.txt
 
       - name: Build Coiled Software Environment
         env:
@@ -104,6 +107,7 @@ jobs:
             latest.yaml
             software_name.txt
             test_upstream.txt
+            ab_baseline.txt
 
   runtime:
     name: Runtime - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }}
@@ -437,7 +441,7 @@ jobs:
   static-site:
     needs: process-results
     # Always generate the site, as this can be skipped even if an indirect dependency fails (like a test run)
-    if: always() && github.ref == 'refs/heads/main' && github.repository == 'coiled/coiled-runtime'
+    if: always()
     name: Build static dashboards
     runs-on: ubuntu-latest
     steps:
@@ -457,12 +461,24 @@ jobs:
           python-version: "3.9"
           environment-file: ci/environment-dashboard.yml
 
+      - name: Download software environment assets
+        uses: actions/download-artifact@v3
+        with:
+          name: software-environment-py3.9
+
       - name: Generate dashboards
         run: |
-          python dashboard.py
+          python dashboard.py -d benchmark.db -o static -b $(cat ab_baseline.txt)
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: static-dashboard
+          path: static
 
       - name: Deploy 🚀
         uses: JamesIves/github-pages-deploy-action@4.1.7
+        if: github.ref == 'refs/heads/main' && github.repository == 'coiled/coiled-runtime'
         with:
           branch: gh-pages
           folder: static
diff --git a/.gitignore b/.gitignore
index c3d511b0c9..272b49eb25 100644
--- a/.gitignore
+++ b/.gitignore
@@ -120,6 +120,9 @@ venv.bak/
 # Rope project settings
 .ropeproject
 
+# PyCharm project settings
+.idea
+
 # mkdocs documentation
 /site
 
diff --git a/ci/environment-dashboard.yml b/ci/environment-dashboard.yml
index bf298d61e6..4ebdbd3ca1 100644
--- a/ci/environment-dashboard.yml
+++ b/ci/environment-dashboard.yml
@@ -11,6 +11,7 @@ dependencies:
   - dask
   - dask-ml
   - distributed
+  - xarray
   - xgboost
   - pandas
   - tabulate
diff --git a/dashboard.py b/dashboard.py
index 42b03d91ff..78b0669248 100644
--- a/dashboard.py
+++ b/dashboard.py
@@ -1,12 +1,11 @@
 from __future__ import annotations
 
-import collections
+import argparse
 import glob
 import importlib
 import inspect
-import os
 import pathlib
-import sys
+from typing import Literal, NamedTuple
 
 import altair
 import pandas
@@ -17,62 +16,181 @@
 panel.extension("vega")
 
 
-def get_test_source():
-    """
-    Crawl the tests directory and try to grab code for each test on a best-effort
-    basis. This relies on the tests being importable from this script, so the
-    environment should be similar enough that that is possible.
+class ChartSpec(NamedTuple):
+    field_name: str
+    field_desc: str
+    unit: str
+    scale: float
+
+
+SPECS = [
+    ChartSpec("duration", "Wall Clock", "(s)", 1),
+    ChartSpec("average_memory", "Average Memory", "(GiB)", 2**30),
+    ChartSpec("peak_memory", "Peak Memory", "(GiB)", 2**30),
+]
+
+
+source: dict[str, str] = {}
+
+
+def load_test_source() -> None:
+    """Crawl the tests directory and try to grab code for each test. This relies on the
+    tests being importable from this script.
     """
-    source: dict[str, str] = {}
-    files = glob.glob("tests/**/test_*.py", recursive=True)
-    for f in files:
+    for fname in glob.iglob("tests/**/test_*.py", recursive=True):
         try:
-            # Fragile!
-            mod = importlib.import_module(f.replace("/", ".")[: -len(".py")])
-            tests = [a for a in dir(mod) if a.startswith("test_")]
-            for test in tests:
-                if fn := getattr(mod, test, None):
-                    if not callable(fn):
-                        continue
-                    source[f[len("tests/") :] + "::" + test] = inspect.getsource(fn)
-        except BaseException:  # Some pytest exceptions inherit directly from BaseException
-            pass
-    return source
+            mod = importlib.import_module(fname.replace("/", ".")[: -len(".py")])
+        # Some pytest exceptions inherit directly from BaseException
+        except BaseException as e:
+            print(f"Could not import {fname}: {e.__class__.__name__}: {e}")
+            continue
+        tests = [a for a in dir(mod) if a.startswith("test_")]
+        for test in tests:
+            if (func := getattr(mod, test, None)) and callable(func):
+                # FIXME missing decorators, namely @pytest.mark.parametrize
+                source[fname[len("tests/") :] + "::" + test] = inspect.getsource(func)
+
+    print(f"Discovered {len(source)} tests")
 
 
-source = get_test_source()
+def align_to_baseline(df: pandas.DataFrame, baseline: str) -> pandas.DataFrame | None:
+    """Add columns
+    - duration_baseline
+    - average_memory_baseline
+    - peak_memory_baseline
+    - duration_delta (A/B - 1)
+    - average_memory_delta (A/B - 1)
+    - peak_memory_delta (A/B - 1)
 
-def make_timeseries(originalname, df, spec) -> altair.Chart | None:
+    Baseline values are from the matching rows given the same test name and the baseline
+    runtime. Note that this means that df is expected to have exactly 1 test in the
+    baseline runtime for each test in every other runtime.
     """
-    Make a single timeseries altair chart for a given test.
+    df_baseline = df[df["runtime"] == baseline]
 
-    originalname: str
-        The name of the test without any fixture or other modifications.
+    if df_baseline.empty:
+        # Typically a misspelling. However, this can legitimately happen in CI if all
+        # three jobs of the baseline runtime failed early.
+        print(
+            f"Baseline runtime {baseline!r} not found; valid choices are:",
+            ", ".join(df["runtime"].unique()),
+        )
+        return None
 
-    df: pandas.DataFrame
-        A dataframe with the test data in it.
+    baseline_names = df_baseline["fullname"].unique()
+    all_names = df["fullname"].unique()
 
-    spec: ChartSpec
-        Data for how to render the timeseries
-    """
-    df = df.dropna(subset=[spec.field, "start"])
+    assert len(baseline_names) == df_baseline.shape[0]
+    if len(baseline_names) < len(all_names):
+        # This will happen in CI if one or two out of three jobs of the baseline failed.
+        # Note that df contains the latest run only. It means that tests on all runtimes
+        # (including historical ones) should be from the coiled-runtime git tip, so
+        # adding or removing tests should not cause a mismatch.
+        print(
+            f"Baseline runtime {baseline!r} is missing some tests:",
+            ", ".join(set(all_names) - set(baseline_names)),
+        )
+        return None
+
+    columns = [spec.field_name for spec in SPECS]
+    df_baseline = (
+        df_baseline.set_index("fullname")
+        .loc[df["fullname"], columns]
+        .rename(columns={k: k + "_baseline" for k in columns})
+    )
+    df_baseline.index = df.index
+    df = pandas.concat([df, df_baseline], axis=1)
+    for column in columns:
+        df[column + "_delta"] = (df[column] / df[column + "_baseline"] - 1) * 100
+    return df
+
+
+def make_barchart(
+    df: pandas.DataFrame,
+    spec: ChartSpec,
+    title: str,
+    baseline: str | None,
+) -> altair.Chart | None:
+    """Make a single Altair barchart for a given test or runtime"""
+    df = df.dropna(subset=[spec.field_name, "start"])
+    if not len(df):
+        # Some tests do not have average_memory or peak_memory measures, only runtime
+        return None
+
+    fields = [
+        spec.field_name,
+        "fullname",
+        "fullname_no_category",
+        "dask_version",
+        "distributed_version",
+        "runtime",
+    ]
+
+    height = df.shape[0] * 20 + 50
+    tooltip = [
+        altair.Tooltip("fullname:N", title="Test"),
+        altair.Tooltip("dask_version:N", title="Dask"),
+        altair.Tooltip("distributed_version:N", title="Distributed"),
+        altair.Tooltip(f"{spec.field_name}:Q", title=f"{spec.field_desc} {spec.unit}"),
+    ]
+
+    by_test = len(df["fullname"].unique()) == 1
+    if by_test:
+        df = df.sort_values("runtime", key=runtime_sort_key_pd)
+        y = altair.Y("runtime", title="Runtime", sort=None)
+    else:
+        y = altair.Y("fullname_no_category", title="Test name")
+
+    if baseline:
+        fields += [
+            f"{spec.field_name}_delta",
+            f"{spec.field_name}_baseline",
+        ]
+        x = altair.X(
+            f"{spec.field_name}_delta",
+            title=f"{spec.field_desc} (delta % from {baseline})",
+        )
+        tooltip += [
+            altair.Tooltip(
+                f"{spec.field_name}_baseline:Q", title=f"{baseline} {spec.unit}"
+            ),
+            altair.Tooltip(f"{spec.field_name}_delta:Q", title="Delta %"),
+        ]
+    else:
+        x = altair.X(spec.field_name, title=f"{spec.field_desc} {spec.unit}")
+
+    return (
+        altair.Chart(df[fields], width=800, height=height)
+        .mark_bar()
+        .encode(x=x, y=y, tooltip=tooltip)
+        .properties(title=title)
+        .configure(autosize="fit")
+    )
+
+
+def make_timeseries(
+    df: pandas.DataFrame, spec: ChartSpec, title: str
+) -> altair.Chart | None:
+    """Make a single Altair timeseries chart for a given test"""
+    df = df.dropna(subset=[spec.field_name, "start"])
+    if not len(df):
+        # Some tests do not have average_memory or peak_memory measures, only runtime
+        return None
+
     df = df.fillna({"ci_run_url": "https://github.com/coiled/coiled-runtime"})
-    path = df.path.iloc[0]
     kwargs = {}
     # Reduce the size of the altair spec
     df = df[
         [
-            spec.field,
+            spec.field_name,
             "start",
             "ci_run_url",
             "name",
             "call_outcome",
             "coiled_runtime_version",
             "dask_version",
+            "distributed_version",
         ]
     ]
     if len(df.name.unique()) > 1:
@@ -88,7 +206,7 @@ def make_timeseries(originalname, df, spec) -> altair.Chart | None:
         .mark_line(point=altair.OverlayMarkDef(size=64))
         .encode(
             x=altair.X("start:T"),
-            y=altair.Y(f"{spec.field}:Q", title=spec.label),
+            y=altair.Y(f"{spec.field_name}:Q", title=f"{spec.field_desc} {spec.unit}"),
             href=altair.Href("ci_run_url:N"),
             tooltip=[
                 altair.Tooltip("name:N", title="Test Name"),
@@ -96,66 +214,268 @@ def make_timeseries(originalname, df, spec) -> altair.Chart | None:
                 altair.Tooltip("call_outcome:N", title="Test Outcome"),
                 altair.Tooltip("coiled_runtime_version:N", title="Coiled Runtime"),
altair.Tooltip("dask_version:N", title="Dask"), - altair.Tooltip(f"{spec.field}:Q", title=spec.label), + altair.Tooltip("distributed_version:N", title="Distributed"), + altair.Tooltip( + f"{spec.field_name}:Q", title=f"{spec.field_desc} {spec.unit}" + ), altair.Tooltip("ci_run_url:N", title="CI Run URL"), ], **kwargs, ) - .properties(title=f"{path}::{originalname}") + .properties(title=title) .configure(autosize="fit") ) -def make_test_report(group_keys, df): - """ - Make a tab panel for a single test. - - originalname: str - The name of the test without any fixture or other modifications. - - df: pandas.DataFrame - A dataframe with the test data in it. - """ - path, originalname = group_keys - - ChartSpec = collections.namedtuple("ChartSpec", ["field", "scale", "label"]) - specs = [ - ChartSpec("duration", 1, "Wall Clock (s)"), - ChartSpec("average_memory", 1024**3, "Average Memory (GiB)"), - ChartSpec("peak_memory", 1024**3, "Peak Memory (GiB)"), - ] +def make_test_report( + df: pandas.DataFrame, + kind: Literal["barchart" | "timeseries"], + title: str, + sourcename: str | None = None, + baseline: str | None = None, +) -> panel.Tabs: + """Make a tab panel for a single test""" tabs = [] - for s in specs: - chart = make_timeseries(originalname, df, s) + for spec in SPECS: + if kind == "timeseries": + assert not baseline + chart = make_timeseries(df, spec, title) + else: + chart = make_barchart(df, spec, title, baseline) if not chart: continue - tabs.append((s.label, chart)) + tabs.append((spec.field_desc, chart)) + + if kind == "timeseries": + height = 384 + else: + height = df.shape[0] * 20 + 50 - sourcename = path + "::" + originalname if sourcename in source: code = panel.pane.Markdown( f"```python\n{source[sourcename]}\n```", width=800, - height=384, + height=height, style={"overflow": "auto"}, ) tabs.append(("Source", code)) + elif sourcename is not None: + print("Source code not found for", sourcename) + return panel.Tabs(*tabs, margin=12, width=800) -if __name__ == "__main__": - DB_NAME = ( - sys.argv[1] if len(sys.argv) > 1 else os.environ.get("DB_NAME", "benchmark.db") +def make_timeseries_html_report( + df: pandas.DataFrame, output_dir: pathlib.Path, runtime: str +) -> None: + """Generate HTML report for one runtime (e.g. coiled-upstream-py3.9), showing + evolution of measures (wall clock, average memory, peak memory) over historical CI + runs. + + Create one tab for each test category (e.g. benchmarks, runtime, stability), + one graph for each test, + and one graph tab for each measure (wall clock, average memory, peak memory). + """ + out_fname = str(output_dir.joinpath(runtime + ".html")) + print(f"Generating {out_fname}") + categories = sorted(df[df.runtime == runtime].category.unique()) + tabs = [] + for category in categories: + df_by_test = ( + df[(df.runtime == runtime) & (df.category == category)] + .sort_values("sourcename") + .groupby("sourcename") + ) + panes = [ + make_test_report( + df_by_test.get_group(sourcename), + kind="timeseries", + title=sourcename, + sourcename=sourcename, + ) + for sourcename in df_by_test.groups + ] + flex = panel.FlexBox(*panes, align_items="start", justify_content="start") + tabs.append((category.title(), flex)) + doc = panel.Tabs(*tabs, margin=12) + + doc.save(out_fname, title=runtime, resources=INLINE) + + +def make_ab_html_report( + df: pandas.DataFrame, + output_dir: pathlib.Path, + by_test: bool, + baseline: str | None, +) -> None: + """Generate HTML report for the latest CI run, comparing all runtimes (e.g. 
+    coiled-upstream-py3.9) against a baseline runtime.
+
+    Create one tab for each test category (e.g. benchmarks, runtime, stability),
+    one graph for each runtime and one bar for each test
+    OR one graph for each test and one bar for each runtime,
+    and one graph tab for each measure (wall clock, average memory, peak memory).
+
+    If a baseline runtime is defined, all measures are expressed relative to the
+    baseline; otherwise they're expressed in absolute terms.
+    """
+    out_fname = str(
+        output_dir.joinpath(
+            "AB_by_"
+            + ("test" if by_test else "runtime")
+            + (f"_vs_{baseline}" if baseline else "")
+            + ".html"
+        )
+    )
+    print(f"Generating {out_fname}")
+
+    categories = sorted(df.category.unique())
+    tabs = []
+    for category in categories:
+        if by_test:
+            df_by_test = (
+                df[df.category == category]
+                .sort_values(["sourcename", "fullname"])
+                .groupby(["sourcename", "fullname"])
+            )
+            panes = [
+                make_test_report(
+                    df_by_test.get_group((sourcename, fullname)),
+                    kind="barchart",
+                    title=fullname,
+                    sourcename=sourcename,
+                    baseline=baseline,
+                )
+                for sourcename, fullname in df_by_test.groups
+            ]
+        else:
+            df_by_runtime = (
+                df[df.category == category]
+                .sort_values("runtime", key=runtime_sort_key_pd)
+                .groupby("runtime")
+            )
+            panes = [
+                make_test_report(
+                    df_by_runtime.get_group(runtime),
+                    kind="barchart",
+                    title=runtime,
+                    baseline=baseline,
+                )
+                for runtime in sorted(df_by_runtime.groups, key=runtime_sort_key)
+                if runtime != baseline
+            ]
+        flex = panel.FlexBox(*panes, align_items="start", justify_content="start")
+        tabs.append((category.title(), flex))
+    doc = panel.Tabs(*tabs, margin=12)
+
+    doc.save(
+        out_fname,
+        title="A/B by "
+        + ("test" if by_test else "runtime")
+        + (f" vs. {baseline}" if baseline else ""),
+        resources=INLINE,
+    )
+
+
+def make_index_html_report(
+    output_dir: pathlib.Path, runtimes: list[str], baselines: list[str]
+) -> None:
+    """Generate index.html"""
+    index_txt = """# Coiled Runtime Benchmarks\n"""
+    index_txt += "### Historical timeseries\n"
+    for runtime in runtimes:
+        index_txt += f"- [{runtime}](./{runtime}.html)\n"
+    index_txt += "\n\n### A/B tests\n"
+    index_txt += "- [by test](./AB_by_test.html)\n"
+    index_txt += "- [by runtime](./AB_by_runtime.html)\n"
+    for baseline in baselines:
+        index_txt += (
+            f"- [by runtime vs. {baseline}](./AB_by_runtime_vs_{baseline}.html)\n"
+        )
+
+    index = panel.pane.Markdown(index_txt, width=800)
+    out_fname = str(output_dir.joinpath("index.html"))
+    print(f"Generating {out_fname}")
+    index.save(
+        out_fname,
+        title="Coiled Runtime Benchmarks",
+        resources=INLINE,
+    )
+
+
+def runtime_sort_key(runtime: str) -> tuple:
+    """Runtimes are in the format coiled-<coiled-runtime version>-py<python version>,
+    e.g. coiled-latest-py3.8
+
+    Sort them by coiled-runtime and python version, both descending.
+ """ + t = runtime.split("-") + assert len(t) == 3 + assert t[0] == "coiled" + # upstream > latest > 0.1.0 > 0.0.4 + if t[1] == "upstream": + coiled_version = [-2] + elif t[1] == "latest": + coiled_version = [-1] + else: + coiled_version = [0] + [-int(v) for v in t[1].split(".")] + + assert t[2][:2] == "py" + py_version = [-int(v) for v in t[2][2:].split(".")] + return coiled_version, py_version + + +def runtime_sort_key_pd(s: pandas.Series) -> pandas.Series: + return pandas.Series([runtime_sort_key(v) for v in s], index=s.index) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate a static HTML report comparing metrics from the runs" + ) + parser.add_argument( + "--db-file", + "-d", + help="Path to SQLite database file containing the metrics", + ) + parser.add_argument( + "--output-dir", + "-o", + help="Output directory", + default="build/html", + ) + parser.add_argument( + "--baseline", + "-b", + nargs="+", + default=[], + help="Baseline runtime(s) for A/B comparison", + ) + parser.add_argument( + "--pickle", + action="store_true", + help="Dump raw dataframe to pickle file", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + output_dir = pathlib.Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + load_test_source() + + # Load SQLite database into a pandas DataFrame + engine = sqlalchemy.create_engine(f"sqlite:///{args.db_file}") df = pandas.read_sql( - "select * from test_run where platform = 'linux' and call_outcome in ('passed', 'failed')", + "select * from test_run where platform = 'linux' " + "and call_outcome in ('passed', 'failed')", engine, ) df = df.assign( + start=pandas.to_datetime(df.start), + end=pandas.to_datetime(df.end), runtime=( "coiled-" + df.coiled_runtime_version @@ -163,35 +483,45 @@ def make_test_report(group_keys, df): + df.python_version.str.split(".", n=2).str[:2].str.join(".") ), category=df.path.str.split("/", n=1).str[0], + sourcename=df.path.str.cat(df.originalname, "::"), + fullname=df.path.str.cat(df.name, "::"), + fullname_no_category=df.path.str.partition("/")[2].str.cat(df.name, "::"), ) + for spec in SPECS: + df[spec.field_name] /= spec.scale + df = df.set_index("id") + + if args.pickle: + out_fname = str(output_dir.joinpath("records.pickle")) + print(f"Generating {out_fname}") + df.to_pickle(out_fname) - runtimes = list(df.runtime.unique()) + # Generate HTML pages + runtimes = sorted(df.runtime.unique(), key=runtime_sort_key) for runtime in runtimes: - print(f"Generating dashboard for {runtime}") - categories = df[df.runtime == runtime].category.unique() - tabs = [] - for category in categories: - by_test = ( - df[(df.runtime == runtime) & (df.category == category)] - .sort_values(["path", "originalname"]) - .groupby(["path", "originalname"]) - ) - panes = [ - make_test_report(test_name, by_test.get_group(test_name)) - for test_name in by_test.groups - ] - flex = panel.FlexBox(*panes, align_items="start", justify_content="start") - tabs.append((category.title(), flex)) - doc = panel.Tabs(*tabs, margin=12) + make_timeseries_html_report(df, output_dir, runtime) - doc.save( - str(static.joinpath(runtime + ".html")), title=runtime, resources=INLINE - ) - index = """# Coiled Runtime Benchmarks\n\n""" - index += "\n\n".join([f"[{r}](./{r}.html)" for r in reversed(sorted(runtimes))]) - index = panel.pane.Markdown(index, width=800) - index.save( - str(static.joinpath("index.html")), - title="Coiled Runtime Benchmarks", - resources=INLINE, - ) + # Select 
only the latest run for each runtime. This may pick up historical runs (up + # to 6h old) if they have not been rerun in the current pull/PR. + # TODO This is fragile. Keep the latest and historical databases separate, or record + # the coiled-runtime git hash and use it to filter? + max_end = df.sort_values("end").groupby(["runtime", "category"]).tail(1) + max_end = max_end[max_end["end"] > max_end["end"].max() - pandas.Timedelta("6h")] + session_ids = max_end["session_id"].unique() + latest_run = df[df["session_id"].isin(session_ids)] + + make_ab_html_report(latest_run, output_dir, by_test=True, baseline=None) + make_ab_html_report(latest_run, output_dir, by_test=False, baseline=None) + baselines = [] + for baseline in args.baseline: + df_baseline = align_to_baseline(latest_run, baseline) + if df_baseline is None: + continue + baselines.append(baseline) + make_ab_html_report(df_baseline, output_dir, by_test=False, baseline=baseline) + + make_index_html_report(output_dir, runtimes, baselines) + + +if __name__ == "__main__": + main()
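A minimal sketch of how the reworked dashboard.py is expected to be driven locally, assuming the ci/environment-dashboard.yml environment is installed, a benchmark.db produced by a benchmark run sits next to dashboard.py, and the runtime names below are purely illustrative:

# Sketch only: mirrors the CI step above.
#   python dashboard.py -d benchmark.db -o build/html -b coiled-latest-py3.9
from dashboard import runtime_sort_key

runtimes = [
    "coiled-0.0.4-py3.8",
    "coiled-0.1.0-py3.9",
    "coiled-latest-py3.9",
    "coiled-upstream-py3.9",
]
# upstream sorts ahead of latest, then released versions, newest Python first
print(sorted(runtimes, key=runtime_sort_key))
# ['coiled-upstream-py3.9', 'coiled-latest-py3.9', 'coiled-0.1.0-py3.9', 'coiled-0.0.4-py3.8']

# Each *_delta column added by align_to_baseline() is a percentage relative to the baseline:
duration, duration_baseline = 12.0, 10.0
print((duration / duration_baseline - 1) * 100)  # 20.0 -> 20% slower than the baseline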