diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 3e1c383e..5fdcc65d 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -37,12 +37,9 @@ jobs: ulimit -c unlimited source .venv/bin/activate - python scripts/benchmark.py | tee benchmarks.txt + python scripts/benchmark.py --format markdown | tee comment.txt deactivate - # Make it a code comment - sed -e $'1i\\\n~~~' -e $'$a\\\n~~~' benchmarks.txt > comment.txt - - name: Post results on PR uses: marocchino/sticky-pull-request-comment@v2 with: diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 196b91b4..5f7095a6 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -1,14 +1,18 @@ # Run as python3 scripts/benchmark.py from the repository root directory. # Ensure dependencies from requirements-bm.txt are installed. +import abc import re import sys +from textwrap import wrap import typing as t from argparse import ArgumentParser from itertools import product from math import floor, log from pathlib import Path +from scipy.stats import ttest_ind + sys.path.insert(0, str(Path(__file__).parent.parent)) import tarfile @@ -17,7 +21,7 @@ from urllib.error import HTTPError from urllib.request import urlopen -VERSIONS = ("3.2.0", "3.3.0", "3.4.1", "dev") +VERSIONS = ("3.4.1", "3.5.0", "dev") SCENARIOS = [ *[ ( @@ -62,6 +66,15 @@ ] +# The metrics we evaluate and whether they are to be maximised or minimised. +METRICS = [ + ("Sample Rate", +1), + ("Saturation", -1), + ("Error Rate", -1), + ("Sampling Speed", -1), +] + + def get_stats(output: str) -> t.Optional[dict]: try: meta = metadata(output) @@ -120,6 +133,7 @@ def download_release(version: str, dest: Path, variant_name: str = "austin") -> class Outcome: def __init__(self, data: list[float]) -> None: + self.data = data self.mean = sum(data) / len(data) self.stdev = ( sum(((v - self.mean) ** 2 for v in data)) / (len(data) - 1) @@ -140,29 +154,173 @@ def __repr__(self): def __len__(self): return len(repr(self)) + def __eq__(self, other: "Outcome") -> bool: + t, p = ttest_ind(self.data, other.data, equal_var=False) + return p < 0.05 -def render(table): - _, row = table[0] - cols = list(row.keys()) - max_vh = max(len(e[0]) for e in table) - col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols] - div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh +Results = t.Tuple[str, t.Dict[str, Outcome]] - print("=" * div_len) - print( - (" " * (max_vh + 2)) - + "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths)) - ) - print("-" * div_len) - for v, row in table: - print(f"{v:^{max_vh+2}}", end="") - for col, cw in zip(cols, col_widths): - print(f"{str(row[col]):^{cw+2}}", end="") +class Renderer(abc.ABC): + BETTER = "better" + WORSE = "worse" + SAME = "same" + + @abc.abstractmethod + def render_header(self, title: str, level: int = 1) -> str: + ... + + @abc.abstractmethod + def render_paragraph(self, text: str) -> str: + ... + + @abc.abstractmethod + def render_table(self, table) -> str: + ... + + @abc.abstractmethod + def render_scenario( + self, title, results: t.List[t.Tuple[str, t.List[Results]]] + ) -> str: + ... + + @abc.abstractmethod + def render_summary( + self, summary: t.List[t.Tuple[str, t.List[t.Tuple[str, bool, int]]]] + ) -> str: + ... 
+ + def render_scenario( + self, title, table: t.List[t.Tuple[str, t.List[Results]]] + ) -> str: + self.render_header(title, level=2) + self.render_table(table) + print() + + def render_summary(self, summary): + self.render_header("Benchmark Summary", level=2) + self.render_paragraph(f"Comparison of {VERSIONS[-1]} against {VERSIONS[-2]}.") + + if not summary: + self.render_paragraph( + "No significant difference in performance between versions." + ) + return + + self.render_paragraph( + "The following scenarios show a statistically significant difference " + "in performance between the two versions." + ) + + self.render_table( + [ + ( + title, + { + m: {1: self.BETTER, -1: self.WORSE}[s] if c else self.SAME + for m, c, s in tests + }, + ) + for title, tests in summary + ] + ) + + +class TerminalRenderer(Renderer): + def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str: + _, row = table[0] + cols = list(row.keys()) + max_vh = max(len(e[0]) for e in table) + + col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols] + div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh + + print("=" * div_len) + print( + (" " * (max_vh + 2)) + + "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths)) + ) + print("-" * div_len) + + for v, row in table: + print(f"{v:^{max_vh+2}}", end="") + for col, cw in zip(cols, col_widths): + print(f"{str(row[col]):^{cw+2}}", end="") + print() + + print("=" * div_len) + + def render_header(self, title: str, level: int = 1) -> str: + print(title) + print({1: "=", 2: "-", 3: "~"}.get(level, "-") * len(title)) + print() + + def render_paragraph(self, text: str) -> str: + for _ in wrap(text): + print(_) + print() + + +class MarkdownRenderer(Renderer): + BETTER = ":green_circle:" + WORSE = ":red_circle:" + SAME = ":yellow_circle:" + + def render_header(self, title: str, level: int = 1) -> str: + print(f"{'#' * level} {title}") + print() + + def render_paragraph(self, text: str) -> str: + print(text) + print() + + def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str: + _, row = table[0] + cols = list(row.keys()) + max_vh = max(len(e[0]) for e in table) + + col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols] + div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh + + print("| |" + "|".join(f" {col} " for col in cols) + "|") + print("| --- |" + "|".join(f":{'-' * len(col)}:" for col in cols) + "|") + + for v, row in table: + print( + f"| {v} |" + + "|".join( + f" {str(row[col]):^{cw}} " for col, cw in zip(cols, col_widths) + ) + + "|" + ) + + def render_scenario( + self, title, table: t.List[t.Tuple[str, t.List[Results]]] + ) -> str: + print("
") + print(f"{title}") + print() + super().render_scenario(title, table) + print("
") print() - print("=" * div_len) + +def summarize(results: t.List[t.Tuple[str, t.List[Results]]]): + summary = [] + for title, table in results: + (_, a), (_, b) = table[-2:] + tests = [ + ( + m, + a[m] == b[m], + int((b[m].mean - a[m].mean) * s / (abs(b[m].mean - a[m].mean) or 1)), + ) + for m, s in METRICS + ] + if any(c for _, c, _ in tests): + summary.append((title, tests)) + return summary def main(): @@ -181,26 +339,41 @@ def main(): help="Number of times to run each scenario", ) + argp.add_argument( + "-f", + "--format", + type=str, + choices=["terminal", "markdown"], + default="terminal", + help="The output format", + ) + opts = argp.parse_args() - print( + renderer = {"terminal": TerminalRenderer, "markdown": MarkdownRenderer}[ + opts.format + ]() + + renderer.render_header("Austin Benchmarks") + renderer.render_paragraph( f"Running Austin benchmarks with Python {'.'.join(str(_) for _ in sys.version_info[:3])}", - end="\n\n", ) + results: t.List[t.Tuple[str, t.List[Results]]] = [] + for variant, title, args in SCENARIOS: if opts.k is not None and not opts.k.match(title): continue - print(title) - - table = [] + table: t.List[Results] = [] for version in VERSIONS: - print(f"> Running with Austin {version} ... ", end="\r") + print(f"> Running with Austin {version} ... ", end="\r", file=sys.stderr) try: austin = download_release(version, Path("/tmp"), variant_name=variant) except RuntimeError: - print(f"WARNING: Could not download {variant} {version}") + print( + f"WARNING: Could not download {variant} {version}", file=sys.stderr + ) continue stats = [ @@ -218,8 +391,13 @@ def main(): ) ) - render(table) - print() + renderer.render_scenario(title, table) + + results.append((title, table)) + + summary = summarize(results) + + renderer.render_summary(summary) if __name__ == "__main__": diff --git a/scripts/requirements-bm.txt b/scripts/requirements-bm.txt index edc3cd39..064d02d2 100644 --- a/scripts/requirements-bm.txt +++ b/scripts/requirements-bm.txt @@ -1 +1,2 @@ austin-python~=1.4.1 +scipy~=1.10.1 diff --git a/test/utils.py b/test/utils.py index b1901bda..7ef9b697 100644 --- a/test/utils.py +++ b/test/utils.py @@ -170,7 +170,7 @@ def __call__( # or using the "where" option. result.stdout = demojo(result.stdout) else: - result.stdout = result.stdout.decode() + result.stdout = result.stdout.decode(errors="ignore") result.stderr = result.stderr.decode() return result