Update benchmarks
- Update benchmarks generally
- Benchmark against Pydantic V2 instead of V1
- Add a few additional JSON and MessagePack libraries
- Document the versions of libraries used for each run
- Bump the Python version used from 3.9 to 3.11. This made several of
  the compared pure-Python libraries measurably faster. Yay for the
  faster CPython initiative.
jcrist committed Dec 13, 2023
1 parent dceeec3 commit f71d96f
Showing 19 changed files with 872 additions and 510 deletions.
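The "document the versions of libraries used" item is implemented in the scripts below as a `--versions` flag backed by `importlib.metadata`. A minimal sketch of that mechanism, with an illustrative list of distribution names (note that the `simdjson` import ships as the `pysimdjson` distribution, which the diff below accounts for):

```python
# Sketch: print the installed version of each benchmarked library.
# The distribution names here are illustrative, not the script's exact list.
import importlib.metadata

for dist in ("msgspec", "orjson", "ujson", "python-rapidjson", "pysimdjson"):
    try:
        print(f"- {dist}: {importlib.metadata.version(dist)}")
    except importlib.metadata.PackageNotFoundError:
        print(f"- {dist}: not installed")
```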
4 changes: 2 additions & 2 deletions README.md
@@ -41,15 +41,15 @@ support for [JSON](https://json.org), [MessagePack](https://msgpack.org),

- 🔍 **Zero-cost schema validation** using familiar Python type annotations. In
[benchmarks](https://jcristharif.com/msgspec/benchmarks.html) `msgspec`
decodes *and* validates JSON ~2x faster than
decodes *and* validates JSON faster than
[orjson](https://github.com/ijl/orjson) can decode it alone.

- **A speedy Struct type** for representing structured data. If you already
use [dataclasses](https://docs.python.org/3/library/dataclasses.html) or
[attrs](https://www.attrs.org),
[structs](https://jcristharif.com/msgspec/structs.html) should feel familiar.
However, they're
[10-100x faster](https://jcristharif.com/msgspec/benchmarks.html#benchmark-structs>)
[5-60x faster](https://jcristharif.com/msgspec/benchmarks.html#benchmark-structs>)
for common operations.

All of this is included in a
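As background for the claims in the bullets above, typed decoding in msgspec pairs a `Struct` definition with a decoder so parsing and validation happen in a single pass. A minimal sketch with made-up fields, not the benchmark's actual data model:

```python
import msgspec


class User(msgspec.Struct):
    name: str
    groups: list[str]
    email: str | None = None


# Decoding validates against the annotations while parsing.
raw = b'{"name": "alice", "groups": ["admin"]}'
user = msgspec.json.decode(raw, type=User)
assert user.email is None
```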
Empty file added benchmarks/__init__.py
Empty file.
232 changes: 140 additions & 92 deletions benchmarks/bench_encodings.py
@@ -1,149 +1,197 @@
from __future__ import annotations

import sys
import dataclasses
import json
import timeit
from typing import List, Union
import importlib.metadata
from typing import Any, Literal, Callable

import msgpack
import orjson
import ujson
from generate_data import make_filesystem_data
from .generate_data import make_filesystem_data

import msgspec


class File(msgspec.Struct, tag="file"):
class File(msgspec.Struct, kw_only=True, omit_defaults=True, tag="file"):
name: str
created_by: str
created_at: str
updated_at: str
updated_by: str | None = None
updated_at: str | None = None
nbytes: int
permissions: Literal["READ", "WRITE", "READ_WRITE"]


class Directory(msgspec.Struct, tag="directory"):
class Directory(msgspec.Struct, kw_only=True, omit_defaults=True, tag="directory"):
name: str
created_by: str
created_at: str
updated_at: str
contents: List[Union[File, Directory]]
updated_by: str | None = None
updated_at: str | None = None
contents: list[File | Directory]


def bench(dumps, loads, ndata, schema=None):
data = make_filesystem_data(ndata)
if schema:
data = msgspec.convert(data, schema)
timer = timeit.Timer("func(data)", globals={"func": dumps, "data": data})
n, t = timer.autorange()
dumps_time = t / n
@dataclasses.dataclass
class Benchmark:
label: str
version: str
encode: Callable
decode: Callable
schema: Any = None

data = dumps(data)
def run(self, data: bytes) -> dict:
if self.schema is not None:
data = msgspec.convert(data, self.schema)
timer = timeit.Timer("func(data)", globals={"func": self.encode, "data": data})
n, t = timer.autorange()
encode_time = t / n

timer = timeit.Timer("func(data)", globals={"func": loads, "data": data})
n, t = timer.autorange()
loads_time = t / n
return dumps_time, loads_time
data = self.encode(data)

timer = timeit.Timer("func(data)", globals={"func": self.decode, "data": data})
n, t = timer.autorange()
decode_time = t / n

def bench_msgspec_msgpack(n):
schema = File if n == 1 else Directory
enc = msgspec.msgpack.Encoder()
dec = msgspec.msgpack.Decoder(schema)
return bench(enc.encode, dec.decode, n, schema)


def bench_msgspec_json(n):
schema = File if n == 1 else Directory
enc = msgspec.json.Encoder()
dec = msgspec.json.Decoder(schema)
return bench(enc.encode, dec.decode, n, schema)


def bench_msgpack(n):
packer = msgpack.Packer()
loads = msgpack.loads
return bench(packer.pack, loads, n)
return {
"label": self.label,
"encode": encode_time,
"decode": decode_time,
}


def bench_ujson(n):
return bench(ujson.dumps, ujson.loads, n)
def json_benchmarks():
import orjson
import ujson
import rapidjson
import simdjson

simdjson_ver = importlib.metadata.version("pysimdjson")

def bench_orjson(n):
return bench(orjson.dumps, orjson.loads, n)
rj_dumps = rapidjson.Encoder()
rj_loads = rapidjson.Decoder()

def uj_dumps(obj):
return ujson.dumps(obj)

BENCHMARKS = [
("ujson", bench_ujson),
("orjson", bench_orjson),
("msgpack", bench_msgpack),
("msgspec msgpack", bench_msgspec_msgpack),
("msgspec json", bench_msgspec_json),
]

enc = msgspec.json.Encoder()
dec = msgspec.json.Decoder(Directory)
dec2 = msgspec.json.Decoder()

def run(n, quiet=False):
if quiet:
return [
Benchmark("msgspec structs", None, enc.encode, dec.decode, Directory),
Benchmark("msgspec", msgspec.__version__, enc.encode, dec2.decode),
Benchmark("json", None, json.dumps, json.loads),
Benchmark("orjson", orjson.__version__, orjson.dumps, orjson.loads),
Benchmark("ujson", ujson.__version__, uj_dumps, ujson.loads),
Benchmark("rapidjson", rapidjson.__version__, rj_dumps, rj_loads),
Benchmark("simdjson", simdjson_ver, simdjson.dumps, simdjson.loads),
]

def log(x):
pass

else:
log = print
def msgpack_benchmarks():
import msgpack
import ormsgpack

title = f"Benchmark - {n} object{'s' if n > 1 else ''}"
log(title)
enc = msgspec.msgpack.Encoder()
dec = msgspec.msgpack.Decoder(Directory)
dec2 = msgspec.msgpack.Decoder()

results = []
for name, func in BENCHMARKS:
log(name)
dumps_time, loads_time = func(n)
log(f" dumps: {dumps_time * 1e6:.2f} us")
log(f" loads: {loads_time * 1e6:.2f} us")
log(f" total: {(dumps_time + loads_time) * 1e6:.2f} us")
results.append((name, dumps_time, loads_time))
return results
return [
Benchmark("msgspec structs", None, enc.encode, dec.decode, Directory),
Benchmark("msgspec", msgspec.__version__, enc.encode, dec2.decode),
Benchmark("msgpack", msgpack.__version__, msgpack.dumps, msgpack.loads),
Benchmark(
"ormsgpack", ormsgpack.__version__, ormsgpack.packb, ormsgpack.unpackb
),
]


def main():
import argparse

bench_names = ["1", "1k"]

parser = argparse.ArgumentParser(
description="Benchmark different python serializers"
description="Benchmark different python serialization libraries"
)
parser.add_argument(
"--benchmark",
"-b",
action="append",
choices=["all", *bench_names],
default=[],
help="which benchmark(s) to run, defaults to 'all'",
"--versions",
action="store_true",
help="Output library version info, and exit immediately",
)
parser.add_argument(
"--json",
action="store_true",
help="whether to output the results as json",
"-n",
type=int,
help="The number of objects in the generated data, defaults to 1000",
default=1000,
)
parser.add_argument(
"--no-gc",
"-p",
"--protocol",
choices=["json", "msgpack"],
default="json",
help="The protocol to benchmark, defaults to JSON",
)
parser.add_argument(
"--json",
action="store_true",
help="whether to disable the gc during benchmarking",
help="whether to output the results as json",
)
args = parser.parse_args()

if "all" in args.benchmark or not args.benchmark:
to_run = bench_names
else:
to_run = sorted(set(args.benchmark))
benchmarks = json_benchmarks() if args.protocol == "json" else msgpack_benchmarks()

if args.versions:
for bench in benchmarks:
if bench.version is not None:
print(f"- {bench.label}: {bench.version}")
sys.exit(0)

results = {}
for bench in to_run:
n = 1000 if bench.startswith("1k") else 1
results[bench] = run(n, quiet=args.json)
data = make_filesystem_data(args.n)

results = [benchmark.run(data) for benchmark in benchmarks]

if args.json:
print(json.dumps(results))
for line in results:
print(json.dumps(line))
else:
# Compose the results table
results.sort(key=lambda row: row["encode"] + row["decode"])
best_et = results[0]["encode"]
best_dt = results[0]["decode"]
best_tt = best_et + best_dt

columns = (
"",
"encode (μs)",
"vs.",
"decode (μs)",
"vs.",
"total (μs)",
"vs.",
)
rows = [
(
r["label"],
f"{1_000_000 * r['encode']:.1f}",
f"{r['encode'] / best_et:.1f}",
f"{1_000_000 * r['decode']:.1f}",
f"{r['decode'] / best_dt:.1f}",
f"{1_000_000 * (r['encode'] + r['decode']):.1f}",
f"{(r['encode'] + r['decode']) / best_tt:.1f}",
)
for r in results
]
widths = tuple(
max(max(map(len, x)), len(c)) for x, c in zip(zip(*rows), columns)
)
row_template = ("|" + (" %%-%ds |" * len(columns))) % widths
header = row_template % tuple(columns)
bar_underline = "+%s+" % "+".join("=" * (w + 2) for w in widths)
bar = "+%s+" % "+".join("-" * (w + 2) for w in widths)
parts = [bar, header, bar_underline]
for r in rows:
parts.append(row_template % r)
parts.append(bar)
print("\n".join(parts))


if __name__ == "__main__":
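The per-call times in the new `Benchmark.run` come from `timeit.Timer.autorange`, which picks a repetition count automatically and returns a `(loops, total_seconds)` pair, so the per-call cost is the ratio. A standalone illustration of the same pattern, using the stdlib `json` module as a stand-in encoder:

```python
import json
import timeit

payload = {"name": "example.txt", "nbytes": 1024}

timer = timeit.Timer("func(data)", globals={"func": json.dumps, "data": payload})
n, total = timer.autorange()  # n calls took `total` seconds overall
print(f"encode: {total / n * 1e6:.2f} us per call")
```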
41 changes: 34 additions & 7 deletions benchmarks/bench_memory.py → benchmarks/bench_large_json.py
@@ -42,6 +42,11 @@
decode = orjson.loads
"""

RAPIDJSON = """
import rapidjson
decode = rapidjson.loads
"""

SIMDJSON = """
import simdjson
decode = simdjson.loads
@@ -81,15 +86,37 @@ class RepoData(msgspec.Struct, gc=False):


def main():
import argparse

parser = argparse.ArgumentParser(
description="Benchmark decoding a large JSON message using various JSON libraries"
)
parser.add_argument(
"--versions",
action="store_true",
help="Output library version info, and exit immediately",
)
args = parser.parse_args()

benchmarks = [
("json", JSON),
("ujson", UJSON),
("orjson", ORJSON),
("simdjson", SIMDJSON),
("msgspec", MSGSPEC),
("msgspec structs", MSGSPEC_STRUCTS),
("json", None, JSON),
("ujson", "ujson", UJSON),
("orjson", "orjson", ORJSON),
("rapidjson", "python-rapidjson", RAPIDJSON),
("simdjson", "pysimdjson", SIMDJSON),
("msgspec", "msgspec", MSGSPEC),
("msgspec structs", None, MSGSPEC_STRUCTS),
]

if args.versions:
import importlib.metadata

for _, lib, _ in benchmarks:
if lib is not None:
version = importlib.metadata.version(lib)
print(f"- {lib}: {version}")
sys.exit(0)

with tempfile.NamedTemporaryFile() as f:
# Download the repodata.json
resp = requests.get(
@@ -102,7 +129,7 @@ def main():
results = {}
import ast

for lib, setup in benchmarks:
for lib, _, setup in benchmarks:
script = TEMPLATE.format(path=f.name, setup=setup)
# We execute each script in a subprocess to isolate their memory usage
output = subprocess.check_output([sys.executable, "-c", script])
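The elided parts of this script render each library's setup into a small program and run it with `subprocess` so memory usage is measured in a fresh interpreter. A rough sketch of that isolation pattern, using `tracemalloc` purely as a stand-in for whatever measurement the actual `TEMPLATE` performs:

```python
# Sketch of per-library subprocess isolation. The real TEMPLATE and its
# measurement logic are elided in the diff above; tracemalloc here is an
# illustrative stand-in, not the script's actual approach.
import subprocess
import sys

CHILD = """
import json, tracemalloc
tracemalloc.start()
with open({path!r}, "rb") as f:
    obj = json.loads(f.read())
_, peak = tracemalloc.get_traced_memory()
print(peak)
"""


def measure_peak(path: str) -> int:
    script = CHILD.format(path=path)
    out = subprocess.check_output([sys.executable, "-c", script])
    return int(out.decode().strip())
```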
(The diffs for the remaining changed files are not shown.)