ci(benchmarks): improve benchmark reporting
This change improves the report from benchmark runs by adding t-tests
for each scenario. This allows the benchmarks to provide a summary at
the end listing the scenarios that are likely to show a performance
difference between the latest released version and the build from the
PR the benchmarks run for.

This also updates the versions that are being benchmarked to include the
latest releases.
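
For illustration, a minimal sketch of the per-scenario check this change introduces (compare Outcome.__eq__ in the diff below). Only ttest_ind with equal_var=False and the 0.05 threshold come from the actual change; the variable names and sample values here are made up:

from scipy.stats import ttest_ind

# Made-up measurements of one metric (e.g. sample rate) from repeated runs.
released = [40.1, 39.8, 40.5, 40.2, 39.9]   # latest released version
candidate = [42.3, 42.0, 42.6, 41.9, 42.4]  # build from the PR under test

# Welch's t-test: does not assume the two runs have equal variance.
statistic, p_value = ttest_ind(released, candidate, equal_var=False)

if p_value < 0.05:
    print("Scenario flagged: likely performance difference between versions")
else:
    print("No statistically significant difference")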
P403n1x87 committed Feb 27, 2023
1 parent 01a247a commit a1b01d9
Showing 4 changed files with 200 additions and 32 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/benchmarks.yml
@@ -37,12 +37,9 @@ jobs:
ulimit -c unlimited
source .venv/bin/activate
python scripts/benchmark.py | tee benchmarks.txt
python scripts/benchmark.py --format markdown | tee comment.txt
deactivate
# Make it a code comment
sed -e $'1i\\\n~~~' -e $'$a\\\n~~~' benchmarks.txt > comment.txt
- name: Post results on PR
uses: marocchino/sticky-pull-request-comment@v2
with:
224 changes: 197 additions & 27 deletions scripts/benchmark.py
@@ -1,14 +1,18 @@
# Run as python3 scripts/benchmark.py from the repository root directory.
# Ensure dependencies from requirements-bm.txt are installed.

import abc
import re
import sys
from textwrap import wrap
import typing as t
from argparse import ArgumentParser
from itertools import product
from math import floor, log
from pathlib import Path

from scipy.stats import ttest_ind

sys.path.insert(0, str(Path(__file__).parent.parent))

import tarfile
@@ -17,7 +21,7 @@
from urllib.error import HTTPError
from urllib.request import urlopen

VERSIONS = ("3.2.0", "3.3.0", "3.4.1", "dev")
VERSIONS = ("3.5.0", "dev") # ("3.4.1", "3.5.0", "dev")
SCENARIOS = [
*[
(
@@ -62,6 +66,15 @@
]


# The metrics we evaluate and whether they are to be maximised or minimised.
METRICS = [
("Sample Rate", +1),
("Saturation", -1),
("Error Rate", -1),
("Sampling Speed", -1),
]


def get_stats(output: str) -> t.Optional[dict]:
try:
meta = metadata(output)
@@ -120,6 +133,7 @@ def download_release(version: str, dest: Path, variant_name: str = "austin") ->

class Outcome:
def __init__(self, data: list[float]) -> None:
self.data = data
self.mean = sum(data) / len(data)
self.stdev = (
sum(((v - self.mean) ** 2 for v in data)) / (len(data) - 1)
@@ -140,29 +154,165 @@ def __repr__(self):
def __len__(self):
return len(repr(self))

def __eq__(self, other: "Outcome") -> bool:
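# NB: despite the operator, "equal" here means the two samples *differ*
# significantly (Welch's t-test, p < 0.05); summarize() relies on this.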
t, p = ttest_ind(self.data, other.data, equal_var=False)
return p < 0.05

def render(table):
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh
Results = t.Tuple[str, t.Dict[str, Outcome]]

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
class Renderer(abc.ABC):
@abc.abstractmethod
def render_header(self, title: str, level: int = 1) -> str:
...

@abc.abstractmethod
def render_paragraph(self, text: str) -> str:
...

@abc.abstractmethod
def render_table(self, table) -> str:
...

@abc.abstractmethod
def render_scenario(
self, title, results: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
...

@abc.abstractmethod
def render_summary(
self, summary: t.List[t.Tuple[str, t.List[t.Tuple[str, bool, int]]]]
) -> str:
...

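# Default implementations built on the abstract primitives above; these
# definitions replace the abstract declarations of the same name.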
def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
self.render_header(title, level=2)
self.render_table(table)
print()

print("=" * div_len)
def render_summary(self, summary):
self.render_header("Benchmark Summary", level=2)
self.render_paragraph(f"Comparison of {VERSIONS[-1]} against {VERSIONS[-2]}.")

if not summary:
self.render_paragraph(
"No significant difference in performance between versions."
)
return

self.render_paragraph(
"The following scenarios show a statistically significant difference "
"in performance between the two version."
)

self.render_table(
[
(
title,
{
m: {1: "better", -1: "worse"}[s] if c else "same"
for m, c, s in tests
},
)
for title, tests in summary
]
)


class TerminalRenderer(Renderer):
def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
print()

print("=" * div_len)

def render_header(self, title: str, level: int = 1) -> str:
print(title)
print({1: "=", 2: "-", 3: "~"}.get(level, "-") * len(title))
print()

def render_paragraph(self, text: str) -> str:
for _ in wrap(text):
print(_)
print()


class MarkdownRenderer(Renderer):
def render_header(self, title: str, level: int = 1) -> str:
print(f"{'#' * level} {title}")
print()

def render_paragraph(self, text: str) -> str:
for _ in wrap(text):
print(_)
print()

def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("| |" + "|".join(f" {col} " for col in cols) + "|")
print("| --- |" + "|".join(f":{'-' * len(col)}:" for col in cols) + "|")

for v, row in table:
print(
f"| {v} |"
+ "|".join(
f" {str(row[col]):^{cw}} " for col, cw in zip(cols, col_widths)
)
+ "|"
)

def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
print("<details>")
print(f"<summary><strong>{title}</strong></summary>")
print()
super().render_scenario(title, table)
print("</details>")


def summarize(results: t.List[t.Tuple[str, t.List[Results]]]):
summary = []
for title, table in results:
(_, a), (_, b) = table[-2:]
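# For each metric: (name, significantly different?, +1 if the PR build
# improves on the latest release for that metric, -1 if it regresses).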
tests = [
(
m,
a[m] == b[m],
int((b[m].mean - a[m].mean) * s / (abs(b[m].mean - a[m].mean) or 1)),
)
for m, s in METRICS
]
if any(c for _, c, _ in tests):
summary.append((title, tests))
return summary


def main():
@@ -181,26 +331,41 @@ def main():
help="Number of times to run each scenario",
)

argp.add_argument(
"-f",
"--format",
type=str,
choices=["terminal", "markdown"],
default="terminal",
help="The output format",
)

opts = argp.parse_args()

print(
renderer = {"terminal": TerminalRenderer, "markdown": MarkdownRenderer}[
opts.format
]()

renderer.render_header("Austin Benchmarks")
renderer.render_paragraph(
f"Running Austin benchmarks with Python {'.'.join(str(_) for _ in sys.version_info[:3])}",
end="\n\n",
)

results: t.List[t.Tuple[str, t.List[Results]]] = []

for variant, title, args in SCENARIOS:
if opts.k is not None and not opts.k.match(title):
continue

print(title)

table = []
table: t.List[Results] = []
for version in VERSIONS:
print(f"> Running with Austin {version} ... ", end="\r")
print(f"> Running with Austin {version} ... ", end="\r", file=sys.stderr)
try:
austin = download_release(version, Path("/tmp"), variant_name=variant)
except RuntimeError:
print(f"WARNING: Could not download {variant} {version}")
print(
f"WARNING: Could not download {variant} {version}", file=sys.stderr
)
continue

stats = [
@@ -218,8 +383,13 @@ def main():
)
)

render(table)
print()
renderer.render_scenario(title, table)

results.append((title, table))

summary = summarize(results)

renderer.render_summary(summary)


if __name__ == "__main__":
1 change: 1 addition & 0 deletions scripts/requirements-bm.txt
@@ -1 +1,2 @@
austin-python~=1.4.1
scipy~=1.10.1
2 changes: 1 addition & 1 deletion test/utils.py
@@ -170,7 +170,7 @@ def __call__(
# or using the "where" option.
result.stdout = demojo(result.stdout)
else:
result.stdout = result.stdout.decode()
result.stdout = result.stdout.decode(errors="ignore")
result.stderr = result.stderr.decode()

return result
