ci(benchmarks): improve benchmark reporting #174

Merged 1 commit on Feb 28, 2023
5 changes: 1 addition & 4 deletions .github/workflows/benchmarks.yml
@@ -37,12 +37,9 @@ jobs:
ulimit -c unlimited

source .venv/bin/activate
python scripts/benchmark.py | tee benchmarks.txt
python scripts/benchmark.py --format markdown | tee comment.txt
deactivate

# Make it a code comment
sed -e $'1i\\\n~~~' -e $'$a\\\n~~~' benchmarks.txt > comment.txt

- name: Post results on PR
uses: marocchino/sticky-pull-request-comment@v2
with:
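Note on the workflow change above: the plain-text output of benchmark.py used to be wrapped in ~~~ fences by sed so the PR comment rendered as a code block; with --format markdown the script now emits a ready-to-post Markdown comment itself. For illustration only, the removed sed step was roughly equivalent to this sketch (file names as in the workflow):

```python
# Sketch only: what the removed sed invocation did, i.e. wrap the plain-text
# benchmark report in ~~~ fences so GitHub renders it as a code block.
from pathlib import Path

report = Path("benchmarks.txt").read_text()
Path("comment.txt").write_text(f"~~~\n{report}~~~\n")
```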
232 changes: 205 additions & 27 deletions scripts/benchmark.py
@@ -1,14 +1,18 @@
# Run as python3 scripts/benchmark.py from the repository root directory.
# Ensure dependencies from requirements-bm.txt are installed.

import abc
import re
import sys
from textwrap import wrap
import typing as t
from argparse import ArgumentParser
from itertools import product
from math import floor, log
from pathlib import Path

from scipy.stats import ttest_ind

sys.path.insert(0, str(Path(__file__).parent.parent))

import tarfile
@@ -17,7 +21,7 @@
from urllib.error import HTTPError
from urllib.request import urlopen

VERSIONS = ("3.2.0", "3.3.0", "3.4.1", "dev")
VERSIONS = ("3.4.1", "3.5.0", "dev")
SCENARIOS = [
*[
(
@@ -62,6 +66,15 @@
]


# The metrics we evaluate and whether they are to be maximised or minimised.
METRICS = [
("Sample Rate", +1),
("Saturation", -1),
("Error Rate", -1),
("Sampling Speed", -1),
]


def get_stats(output: str) -> t.Optional[dict]:
try:
meta = metadata(output)
Expand Down Expand Up @@ -120,6 +133,7 @@ def download_release(version: str, dest: Path, variant_name: str = "austin") ->

class Outcome:
def __init__(self, data: list[float]) -> None:
self.data = data
self.mean = sum(data) / len(data)
self.stdev = (
sum(((v - self.mean) ** 2 for v in data)) / (len(data) - 1)
@@ -140,29 +154,173 @@ def __repr__(self):
def __len__(self):
return len(repr(self))

def __eq__(self, other: "Outcome") -> bool:
t, p = ttest_ind(self.data, other.data, equal_var=False)
return p < 0.05

def render(table):
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh
Results = t.Tuple[str, t.Dict[str, Outcome]]

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
class Renderer(abc.ABC):
BETTER = "better"
WORSE = "worse"
SAME = "same"

@abc.abstractmethod
def render_header(self, title: str, level: int = 1) -> str:
...

@abc.abstractmethod
def render_paragraph(self, text: str) -> str:
...

@abc.abstractmethod
def render_table(self, table) -> str:
...

@abc.abstractmethod
def render_scenario(
self, title, results: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
...

@abc.abstractmethod
def render_summary(
self, summary: t.List[t.Tuple[str, t.List[t.Tuple[str, bool, int]]]]
) -> str:
...

def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
self.render_header(title, level=2)
self.render_table(table)
print()

def render_summary(self, summary):
self.render_header("Benchmark Summary", level=2)
self.render_paragraph(f"Comparison of {VERSIONS[-1]} against {VERSIONS[-2]}.")

if not summary:
self.render_paragraph(
"No significant difference in performance between versions."
)
return

self.render_paragraph(
"The following scenarios show a statistically significant difference "
"in performance between the two versions."
)

self.render_table(
[
(
title,
{
m: {1: self.BETTER, -1: self.WORSE}[s] if c else self.SAME
for m, c, s in tests
},
)
for title, tests in summary
]
)


class TerminalRenderer(Renderer):
def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("=" * div_len)
print(
(" " * (max_vh + 2))
+ "".join(f"{col:^{cw+2}}" for col, cw in zip(cols, col_widths))
)
print("-" * div_len)

for v, row in table:
print(f"{v:^{max_vh+2}}", end="")
for col, cw in zip(cols, col_widths):
print(f"{str(row[col]):^{cw+2}}", end="")
print()

print("=" * div_len)

def render_header(self, title: str, level: int = 1) -> str:
print(title)
print({1: "=", 2: "-", 3: "~"}.get(level, "-") * len(title))
print()

def render_paragraph(self, text: str) -> str:
for _ in wrap(text):
print(_)
print()


class MarkdownRenderer(Renderer):
BETTER = ":green_circle:"
WORSE = ":red_circle:"
SAME = ":yellow_circle:"

def render_header(self, title: str, level: int = 1) -> str:
print(f"{'#' * level} {title}")
print()

def render_paragraph(self, text: str) -> str:
print(text)
print()

def render_table(self, table: t.List[t.Tuple[str, t.List[Results]]]) -> str:
_, row = table[0]
cols = list(row.keys())
max_vh = max(len(e[0]) for e in table)

col_widths = [max(max(len(r[col]), len(col)) for _, r in table) for col in cols]
div_len = sum(col_widths) + (len(cols) + 1) * 2 + max_vh

print("| |" + "|".join(f" {col} " for col in cols) + "|")
print("| --- |" + "|".join(f":{'-' * len(col)}:" for col in cols) + "|")

for v, row in table:
print(
f"| {v} |"
+ "|".join(
f" {str(row[col]):^{cw}} " for col, cw in zip(cols, col_widths)
)
+ "|"
)

def render_scenario(
self, title, table: t.List[t.Tuple[str, t.List[Results]]]
) -> str:
print("<details>")
print(f"<summary><strong>{title}</strong></summary>")
print()
super().render_scenario(title, table)
print("</details>")
print()

print("=" * div_len)

def summarize(results: t.List[t.Tuple[str, t.List[Results]]]):
summary = []
for title, table in results:
(_, a), (_, b) = table[-2:]
tests = [
(
m,
a[m] == b[m],
int((b[m].mean - a[m].mean) * s / (abs(b[m].mean - a[m].mean) or 1)),
)
for m, s in METRICS
]
if any(c for _, c, _ in tests):
summary.append((title, tests))
return summary


def main():
@@ -181,26 +339,41 @@ def main():
help="Number of times to run each scenario",
)

argp.add_argument(
"-f",
"--format",
type=str,
choices=["terminal", "markdown"],
default="terminal",
help="The output format",
)

opts = argp.parse_args()

print(
renderer = {"terminal": TerminalRenderer, "markdown": MarkdownRenderer}[
opts.format
]()

renderer.render_header("Austin Benchmarks")
renderer.render_paragraph(
f"Running Austin benchmarks with Python {'.'.join(str(_) for _ in sys.version_info[:3])}",
end="\n\n",
)

results: t.List[t.Tuple[str, t.List[Results]]] = []

for variant, title, args in SCENARIOS:
if opts.k is not None and not opts.k.match(title):
continue

print(title)

table = []
table: t.List[Results] = []
for version in VERSIONS:
print(f"> Running with Austin {version} ... ", end="\r")
print(f"> Running with Austin {version} ... ", end="\r", file=sys.stderr)
try:
austin = download_release(version, Path("/tmp"), variant_name=variant)
except RuntimeError:
print(f"WARNING: Could not download {variant} {version}")
print(
f"WARNING: Could not download {variant} {version}", file=sys.stderr
)
continue

stats = [
Expand All @@ -218,8 +391,13 @@ def main():
)
)

render(table)
print()
renderer.render_scenario(title, table)

results.append((title, table))

summary = summarize(results)

renderer.render_summary(summary)


if __name__ == "__main__":
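How the new summary is derived, in brief: for each scenario, summarize() compares the last two versions metric by metric. Outcome.__eq__ runs Welch's t-test (ttest_ind with equal_var=False) and returns True when the difference is significant at p < 0.05, and the metric's sign in METRICS (+1 for maximised, -1 for minimised) turns the direction of the mean shift into better or worse. A minimal standalone sketch of that decision, using made-up sample data:

```python
# Minimal sketch of the per-metric comparison used by summarize().
# The sample values below are made up purely for illustration.
from scipy.stats import ttest_ind


def significantly_different(a: list, b: list, alpha: float = 0.05) -> bool:
    # Welch's t-test, as in Outcome.__eq__ (equal_var=False).
    _, p = ttest_ind(a, b, equal_var=False)
    return p < alpha


# Hypothetical "Sample Rate" samples: previous release vs dev.
prev = [99.0, 101.0, 100.0, 98.5, 100.5]
dev = [109.0, 111.0, 110.0, 108.5, 110.5]
sign = +1  # Sample Rate is maximised

if significantly_different(prev, dev):
    delta = (sum(dev) / len(dev)) - (sum(prev) / len(prev))
    print("better" if delta * sign > 0 else "worse")
else:
    print("same")
```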
1 change: 1 addition & 0 deletions scripts/requirements-bm.txt
@@ -1 +1,2 @@
austin-python~=1.4.1
scipy~=1.10.1
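scipy is added for the ttest_ind call used above. The ~= operator is pip's compatible-release specifier, so scipy~=1.10.1 allows any 1.10.x at or above 1.10.1 but excludes 1.11. A quick check with the packaging library (not a project dependency, used here only for illustration):

```python
# Illustration of the "~=" (compatible release) specifier used in
# requirements-bm.txt; assumes the "packaging" library is available.
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=1.10.1")
print("1.10.4" in spec)  # True: same minor series, >= 1.10.1
print("1.11.0" in spec)  # False: the next minor series is excluded
```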
2 changes: 1 addition & 1 deletion test/utils.py
@@ -170,7 +170,7 @@ def __call__(
# or using the "where" option.
result.stdout = demojo(result.stdout)
else:
result.stdout = result.stdout.decode()
result.stdout = result.stdout.decode(errors="ignore")
result.stderr = result.stderr.decode()

return result
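The test/utils.py change makes stdout decoding tolerant of invalid UTF-8 in the captured output: errors="ignore" drops undecodable bytes instead of raising. For example:

```python
# Illustration: strict decoding raises on invalid UTF-8; errors="ignore" drops it.
data = b"sampled \xff\xfe output"
try:
    data.decode()  # default errors="strict" raises UnicodeDecodeError
except UnicodeDecodeError:
    pass
print(data.decode(errors="ignore"))  # -> "sampled  output"
```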