ci(benchmarks): improve benchmark reporting #174

Merged 1 commit on Feb 28, 2023
5 changes: 1 addition & 4 deletions .github/workflows/benchmarks.yml
@@ -37,12 +37,9 @@ jobs:
ulimit -c unlimited

source .venv/bin/activate
python scripts/benchmark.py | tee benchmarks.txt
python scripts/benchmark.py --format markdown | tee comment.txt
deactivate

# Make it a code comment
sed -e $'1i\\\n~~~' -e $'$a\\\n~~~' benchmarks.txt > comment.txt

- name: Post results on PR
uses: marocchino/sticky-pull-request-comment@v2
with:
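Note on the workflow change above: the plain-text output of benchmark.py used to be wrapped in ~~~ fences by sed so the PR comment rendered as a code block; with --format markdown the script now emits a ready-to-post Markdown comment itself. For illustration only, the removed sed step was roughly equivalent to this sketch (file names as in the workflow):

```python
# Sketch only: what the removed sed invocation did, i.e. wrap the plain-text
# benchmark report in ~~~ fences so GitHub renders it as a code block.
from pathlib import Path

report = Path("benchmarks.txt").read_text()
Path("comment.txt").write_text(f"~~~\n{report}~~~\n")
```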
232 changes: 205 additions & 27 deletions scripts/benchmark.py
@@ -1,14 +1,18 @@
# Run as python3 scripts/benchmark.py from the repository root directory.
# Ensure dependencies from requirements-bm.txt are installed.

import abc
import re
import sys
from textwrap import wrap
import typing as t
from argparse import ArgumentParser
from itertools import product
from math import floor, log
from pathlib import Path

from scipy.stats import ttest_ind

sys.path.insert(0, str(Path(__file__).parent.parent))

import tarfile
@@ -17,7 +21,7 @@
from urllib.error import HTTPError
from urllib.request import urlopen

VERSIONS = ("3.2.0", "3.3.0", "3.4.1", "dev")
VERSIONS = ("3.4.1", "3.5.0", "dev")
SCENARIOS = [
*[
(
@@ -62,6 +66,15 @@
]


# The metrics we evaluate and whether they are to be maximised or minimised.
METRICS = [
("Sample Rate", +1),
("Saturation", -1),
("Error Rate", -1),
("Sampling Speed", -1),
]


def get_stats(output: str) -> t.Optional[dict]:
try:
meta = metadata(output)
Expand Down Expand Up @@ -120,6 +133,7 @@ def download_release(version: str, dest: Path, variant_name: str = "austin") ->

class Outcome:
def __init__(self, data: list[float]) -> None:
self.data = data
self.mean = sum(data) / len(data)
self.stdev = (
sum(((v - self.mean) ** 2 for v in data)) / (len(data) - 1)
@@ -140,29 +154,173 @@ def __repr__(self):
def __len__(self):
return len(repr(self))

def __eq__(self, other: "Outcome") -> bool:
t, p = ttest_ind(self.data, other.data, equal_var=False)
return p < 0.05

def render(table):
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh
Results = t.Tuple[str, t.Dict[str, Outcome]]

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
class Renderer(abc.ABC):
BETTER = "better"
WORSE = "worse"
SAME = "same"

@abc.abstractmethod
def render_header(self, title: str, level: int = 1) -> str:
...

@abc.abstractmethod
def render_paragraph(self, text: str) -> str:
...

@abc.abstractmethod
def render_table(self, table) -> str:
...

@abc.abstractmethod
def render_scenario(
self, title, results: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
...

@abc.abstractmethod
def render_summary(
self, summary: t.List[t.Tuple[str, t.List[t.Tuple[str, bool, int]]]]
) -> str:
...

def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
self.render_header(title, level=2)
self.render_table(table)
print()

def render_summary(self, summary):
self.render_header("Benchmark Summary", level=2)
self.render_paragraph(f"Comparison of {VERSIONS[-1]} against {VERSIONS[-2]}.")

if not summary:
self.render_paragraph(
"No significant difference in performance between versions."
)
return

self.render_paragraph(
"The following scenarios show a statistically significant difference "
"in performance between the two versions."
)

self.render_table(
[
(
title,
{
m: {1: self.BETTER, -1: self.WORSE}[s] if c else self.SAME
for m, c, s in tests
},
)
for title, tests in summary
]
)


class TerminalRenderer(Renderer):
def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
print()

print("=" * div_len)

def render_header(self, title: str, level: int = 1) -> str:
print(title)
print({1: "=", 2: "-", 3: "~"}.get(level, "-") * len(title))
print()

def render_paragraph(self, text: str) -> str:
for _ in wrap(text):
print(_)
print()


class MarkdownRenderer(Renderer):
BETTER = ":green_circle:"
WORSE = ":red_circle:"
SAME = ":yellow_circle:"

def render_header(self, title: str, level: int = 1) -> str:
print(f"{'#' * level} {title}")
print()

def render_paragraph(self, text: str) -> str:
print(text)
print()

def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("| |" + "|".join(f" {col} " for col in cols) + "|")
print("| --- |" + "|".join(f":{'-' * len(col)}:" for col in cols) + "|")

for v, row in table:
print(
f"| {v} |"
+ "|".join(
f" {str(row[col]):^{cw}} " for col, cw in zip(cols, col_widths)
)
+ "|"
)

def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
print("<details>")
print(f"<summary><strong>{title}</strong></summary>")
print()
super().render_scenario(title, table)
print("</details>")
print()

print("=" * div_len)

def summarize(results: t.List[t.Tuple[str, t.List[Results]]]):
summary = []
for title, table in results:
(_, a), (_, b) = table[-2:]
tests = [
(
m,
a[m] == b[m],
int((b[m].mean - a[m].mean) * s / (abs(b[m].mean - a[m].mean) or 1)),
)
for m, s in METRICS
]
if any(c for _, c, _ in tests):
summary.append((title, tests))
return summary


def main():
@@ -181,26 +339,41 @@ def main():
help="Number of times to run each scenario",
)

argp.add_argument(
"-f",
"--format",
type=str,
choices=["terminal", "markdown"],
default="terminal",
help="The output format",
)

opts = argp.parse_args()

print(
renderer = {"terminal": TerminalRenderer, "markdown": MarkdownRenderer}[
opts.format
]()

renderer.render_header("Austin Benchmarks")
renderer.render_paragraph(
f"Running Austin benchmarks with Python {'.'.join(str(_) for _ in sys.version_info[:3])}",
end="\n\n",
)

results: t.List[t.Tuple[str, t.List[Results]]] = []

for variant, title, args in SCENARIOS:
if opts.k is not None and not opts.k.match(title):
continue

print(title)

table = []
table: t.List[Results] = []
for version in VERSIONS:
print(f"> Running with Austin {version} ... ", end="\r")
print(f"> Running with Austin {version} ... ", end="\r", file=sys.stderr)
try:
austin = download_release(version, Path("/tmp"), variant_name=variant)
except RuntimeError:
print(f"WARNING: Could not download {variant} {version}")
print(
f"WARNING: Could not download {variant} {version}", file=sys.stderr
)
continue

stats = [
Expand All @@ -218,8 +391,13 @@ def main():
)
)

render(table)
print()
renderer.render_scenario(title, table)

results.append((title, table))

summary = summarize(results)

renderer.render_summary(summary)


if __name__ == "__main__":
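How the new summary is derived, in brief: for each scenario, summarize() compares the last two versions metric by metric. Outcome.__eq__ runs Welch's t-test (ttest_ind with equal_var=False) and returns True when the difference is significant at p < 0.05, and the metric's sign in METRICS (+1 for maximised, -1 for minimised) turns the direction of the mean shift into better or worse. A minimal standalone sketch of that decision, using made-up sample data:

```python
# Minimal sketch of the per-metric comparison used by summarize().
# The sample values below are made up purely for illustration.
from scipy.stats import ttest_ind


def significantly_different(a: list, b: list, alpha: float = 0.05) -> bool:
    # Welch's t-test, as in Outcome.__eq__ (equal_var=False).
    _, p = ttest_ind(a, b, equal_var=False)
    return p < alpha


# Hypothetical "Sample Rate" samples: previous release vs dev.
prev = [99.0, 101.0, 100.0, 98.5, 100.5]
dev = [109.0, 111.0, 110.0, 108.5, 110.5]
sign = +1  # Sample Rate is maximised

if significantly_different(prev, dev):
    delta = (sum(dev) / len(dev)) - (sum(prev) / len(prev))
    print("better" if delta * sign > 0 else "worse")
else:
    print("same")
```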
1 change: 1 addition & 0 deletions scripts/requirements-bm.txt
@@ -1 +1,2 @@
austin-python~=1.4.1
scipy~=1.10.1
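scipy is added for the ttest_ind call used above. The ~= operator is pip's compatible-release specifier, so scipy~=1.10.1 allows any 1.10.x at or above 1.10.1 but excludes 1.11. A quick check with the packaging library (not a project dependency, used here only for illustration):

```python
# Illustration of the "~=" (compatible release) specifier used in
# requirements-bm.txt; assumes the "packaging" library is available.
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=1.10.1")
print("1.10.4" in spec)  # True: same minor series, >= 1.10.1
print("1.11.0" in spec)  # False: the next minor series is excluded
```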
2 changes: 1 addition & 1 deletion test/utils.py
@@ -170,7 +170,7 @@ def __call__(
# or using the "where" option.
result.stdout = demojo(result.stdout)
else:
result.stdout = result.stdout.decode()
result.stdout = result.stdout.decode(errors="ignore")
result.stderr = result.stderr.decode()

return result
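The test/utils.py change makes stdout decoding tolerant of invalid UTF-8 in the captured output: errors="ignore" drops undecodable bytes instead of raising. For example:

```python
# Illustration: strict decoding raises on invalid UTF-8; errors="ignore" drops it.
data = b"sampled \xff\xfe output"
try:
    data.decode()  # default errors="strict" raises UnicodeDecodeError
except UnicodeDecodeError:
    pass
print(data.decode(errors="ignore"))  # -> "sampled  output"
```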