ci(benchmarks): improve benchmark reporting
This change improves the report from benchmark runs by adding t-tests
for each scenario. This allows the benchmarks to provide a summary at
the end listing the scenarios that are likely to show a performance
difference between the latest released version and the build from the
PR the benchmarks run for.

This also updates the versions that are being benchmarked to include the
latest releases.
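
For illustration, a minimal sketch of the per-scenario check this change introduces (compare Outcome.__eq__ in the diff below). Only ttest_ind with equal_var=False and the 0.05 threshold come from the actual change; the variable names and sample values here are made up:

from scipy.stats import ttest_ind

# Made-up measurements of one metric (e.g. sample rate) from repeated runs.
released = [40.1, 39.8, 40.5, 40.2, 39.9]   # latest released version
candidate = [42.3, 42.0, 42.6, 41.9, 42.4]  # build from the PR under test

# Welch's t-test: does not assume the two runs have equal variance.
statistic, p_value = ttest_ind(released, candidate, equal_var=False)

if p_value < 0.05:
    print("Scenario flagged: likely performance difference between versions")
else:
    print("No statistically significant difference")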
P403n1x87 committed Feb 27, 2023
1 parent 01a247a commit a1b01d9
Showing 4 changed files with 200 additions and 32 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/benchmarks.yml
@@ -37,12 +37,9 @@ jobs:
ulimit -c unlimited
source .venv/bin/activate
python scripts/benchmark.py | tee benchmarks.txt
python scripts/benchmark.py --format markdown | tee comment.txt
deactivate
# Make it a code comment
sed -e $'1i\\\n~~~' -e $'$a\\\n~~~' benchmarks.txt > comment.txt
- name: Post results on PR
uses: marocchino/sticky-pull-request-comment@v2
with:
224 changes: 197 additions & 27 deletions scripts/benchmark.py
@@ -1,14 +1,18 @@
# Run as python3 scripts/benchmark.py from the repository root directory.
# Ensure dependencies from requirements-bm.txt are installed.

import abc
import re
import sys
from textwrap import wrap
import typing as t
from argparse import ArgumentParser
from itertools import product
from math import floor, log
from pathlib import Path

from scipy.stats import ttest_ind

sys.path.insert(0, str(Path(__file__).parent.parent))

import tarfile
@@ -17,7 +21,7 @@
from urllib.error import HTTPError
from urllib.request import urlopen

VERSIONS = ("3.2.0", "3.3.0", "3.4.1", "dev")
VERSIONS = ("3.5.0", "dev") # ("3.4.1", "3.5.0", "dev")
SCENARIOS = [
*[
(
@@ -62,6 +66,15 @@
]


# The metrics we evaluate and whether they are to be maximised or minimised.
METRICS = [
("Sample Rate", +1),
("Saturation", -1),
("Error Rate", -1),
("Sampling Speed", -1),
]


def get_stats(output: str) -> t.Optional[dict]:
try:
meta = metadata(output)
@@ -120,6 +133,7 @@ def download_release(version: str, dest: Path, variant_name: str = "austin") ->

class Outcome:
def __init__(self, data: list[float]) -> None:
self.data = data
self.mean = sum(data) / len(data)
self.stdev = (
sum(((v - self.mean) ** 2 for v in data)) / (len(data) - 1)
@@ -140,29 +154,165 @@ def __repr__(self):
def __len__(self):
return len(repr(self))

def __eq__(self, other: "Outcome") -> bool:
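# NB: despite the operator, "equal" here means the two samples *differ*
# significantly (Welch's t-test, p < 0.05); summarize() relies on this.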
t, p = ttest_ind(self.data, other.data, equal_var=False)
return p < 0.05

def render(table):
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh
Results = t.Tuple[str, t.Dict[str, Outcome]]

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
class Renderer(abc.ABC):
@abc.abstractmethod
def render_header(self, title: str, level: int = 1) -> str:
...

@abc.abstractmethod
def render_paragraph(self, text: str) -> str:
...

@abc.abstractmethod
def render_table(self, table) -> str:
...

@abc.abstractmethod
def render_scenario(
self, title, results: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
...

@abc.abstractmethod
def render_summary(
self, summary: t.List[t.Tuple[str, t.List[t.Tuple[str, bool, int]]]]
) -> str:
...

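# Default implementations built on the abstract primitives above; these
# definitions replace the abstract declarations of the same name.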
def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
self.render_header(title, level=2)
self.render_table(table)
print()

print("=" * div_len)
def render_summary(self, summary):
self.render_header("Benchmark Summary", level=2)
self.render_paragraph(f"Comparison of {VERSIONS[-1]} against {VERSIONS[-2]}.")

if not summary:
self.render_paragraph(
"No significant difference in performance between versions."
)
return

self.render_paragraph(
"The following scenarios show a statistically significant difference "
"in performance between the two version."
)

self.render_table(
[
(
title,
{
m: {1: "better", -1: "worse"}[s] if c else "same"
for m, c, s in tests
},
)
for title, tests in summary
]
)


class TerminalRenderer(Renderer):
def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
print()

print("=" * div_len)

def render_header(self, title: str, level: int = 1) -> str:
print(title)
print({1: "=", 2: "-", 3: "~"}.get(level, "-") * len(title))
print()

def render_paragraph(self, text: str) -> str:
for _ in wrap(text):
print(_)
print()


class MarkdownRenderer(Renderer):
def render_header(self, title: str, level: int = 1) -> str:
print(f"{'#' * level} {title}")
print()

def render_paragraph(self, text: str) -> str:
for _ in wrap(text):
print(_)
print()

def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("| |" + "|".join(f" {col} " for col in cols) + "|")
print("| --- |" + "|".join(f":{'-' * len(col)}:" for col in cols) + "|")

for v, row in table:
print(
f"| {v} |"
+ "|".join(
f" {str(row[col]):^{cw}} " for col, cw in zip(cols, col_widths)
)
+ "|"
)

def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
print("<details>")
print(f"<summary><strong>{title}</strong></summary>")
print()
super().render_scenario(title, table)
print("</details>")


def summarize(results: t.List[t.Tuple[str, t.List[Results]]]):
summary = []
for title, table in results:
(_, a), (_, b) = table[-2:]
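# For each metric: (name, significantly different?, +1 if the PR build
# improves on the latest release for that metric, -1 if it regresses).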
tests = [
(
m,
a[m] == b[m],
int((b[m].mean - a[m].mean) * s / (abs(b[m].mean - a[m].mean) or 1)),
)
for m, s in METRICS
]
if any(c for _, c, _ in tests):
summary.append((title, tests))
return summary


def main():
@@ -181,26 +331,41 @@ def main():
help="Number of times to run each scenario",
)

argp.add_argument(
"-f",
"--format",
type=str,
choices=["terminal", "markdown"],
default="terminal",
help="The output format",
)

opts = argp.parse_args()

print(
renderer = {"terminal": TerminalRenderer, "markdown": MarkdownRenderer}[
opts.format
]()

renderer.render_header("Austin Benchmarks")
renderer.render_paragraph(
f"Running Austin benchmarks with Python {'.'.join(str(_) for _ in sys.version_info[:3])}",
end="\n\n",
)

results: t.List[t.Tuple[str, t.List[Results]]] = []

for variant, title, args in SCENARIOS:
if opts.k is not None and not opts.k.match(title):
continue

print(title)

table = []
table: t.List[Results] = []
for version in VERSIONS:
print(f"> Running with Austin {version} ... ", end="\r")
print(f"> Running with Austin {version} ... ", end="\r", file=sys.stderr)
try:
austin = download_release(version, Path("/tmp"), variant_name=variant)
except RuntimeError:
print(f"WARNING: Could not download {variant} {version}")
print(
f"WARNING: Could not download {variant} {version}", file=sys.stderr
)
continue

stats = [
@@ -218,8 +383,13 @@ def main():
)
)

render(table)
print()
renderer.render_scenario(title, table)

results.append((title, table))

summary = summarize(results)

renderer.render_summary(summary)


if __name__ == "__main__":
1 change: 1 addition & 0 deletions scripts/requirements-bm.txt
@@ -1 +1,2 @@
austin-python~=1.4.1
scipy~=1.10.1
2 changes: 1 addition & 1 deletion test/utils.py
@@ -170,7 +170,7 @@ def __call__(
# or using the "where" option.
result.stdout = demojo(result.stdout)
else:
result.stdout = result.stdout.decode()
result.stdout = result.stdout.decode(errors="ignore")
result.stderr = result.stderr.decode()

return result
