From cf42a0e14bd936258096df5dd5ab71987bb54e65 Mon Sep 17 00:00:00 2001 From: Craig de Stigter Date: Fri, 29 Nov 2024 17:17:00 +1300 Subject: [PATCH] Use msgspec for JSON output instead of orjson Slightly faster, and fewer dependencies --- CHANGELOG.md | 5 +++-- kart/json_diff_writers.py | 18 +++++++++--------- kart/output_util.py | 20 +++++++------------- requirements/requirements.in | 1 - requirements/requirements.txt | 2 -- 5 files changed, 19 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 846cfdc8..21ca7e06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,13 +7,14 @@ _When adding new entries to the changelog, please include issue/PR numbers where ## Unreleased +- diff: Use [orjson](https://github.com/ijl/orjson?tab=readme-ov-file#orjson) for faster JSON-Lines output. [#1019](https://github.com/koordinates/kart/pull/1019) +- Much faster access to tabular/vector datasets (about 75% more features processed per second) by switching to [msgspec](https://jcristharif.com/msgspec/) - [#1025](https://github.com/koordinates/kart/pull/1025) +- diff: Faster JSON-Lines output (also using msgspec) - Linux builds now require glibc 2.28+ [#1027](https://github.com/koordinates/kart/pull/1027) - This means minimum distro versions are: - Debian 10+ - Ubuntu 18.10+ - Fedora 29+ - CentOS/RHEL 8+ -- Much faster access to tabular/vector datasets (about 75% more features processed per second) by switching to [msgspec](https://jcristharif.com/msgspec/) - [#1025](https://github.com/koordinates/kart/pull/1025) -- diff: Faster JSON-Lines output (also using msgspec) - Upgrade to PDAL 2.7 [#1005](https://github.com/koordinates/kart/pull/1005) - Adds a `--drop-empty-geometry-features` option to `kart export`. [#1007](https://github.com/koordinates/kart/pull/1007) - Adds diagnostic output to Kart when `KART_DIAGNOSTICS=1` environment variable is set. 
[#1013](https://github.com/koordinates/kart/pull/1013) diff --git a/kart/json_diff_writers.py b/kart/json_diff_writers.py index b492069d..577395e0 100644 --- a/kart/json_diff_writers.py +++ b/kart/json_diff_writers.py @@ -1,4 +1,3 @@ -import orjson import logging import threading from datetime import datetime, timedelta, timezone @@ -22,8 +21,9 @@ from kart.output_util import ( dump_json_output, resolve_output_path, - orjson_encode_default, + msgspec_json_encoder, ) + from kart.tabular.feature_output import feature_as_geojson, feature_as_json from kart.timestamps import datetime_to_iso8601_utc, timedelta_to_iso8601_tz @@ -239,19 +239,19 @@ def _check_output_path(cls, repo, output_path): def __init__(self, *args, diff_estimate_accuracy=None, delta_filter=None, **kwargs): super().__init__(*args, **kwargs) self.fp = resolve_output_path(self.output_path) - self.separators = (",", ":") if self.json_style == "extracompact" else None + self._diff_estimate_accuracy = diff_estimate_accuracy self.delta_filter = delta_filter self._output_lock = threading.RLock() + # https://jcristharif.com/msgspec/perf-tips.html#reusing-an-output-buffer + self._output_buffer = bytearray() def dump(self, obj): - output: bytes = orjson.dumps( - obj, - default=orjson_encode_default, - option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_NON_STR_KEYS, - ) + # https://jcristharif.com/msgspec/perf-tips.html#line-delimited-json + msgspec_json_encoder.encode_into(obj, self._output_buffer) + self._output_buffer.extend(b"\n") with self._output_lock: - self.fp.buffer.write(output) + self.fp.buffer.write(self._output_buffer) def write_header(self): self.dump( diff --git a/kart/output_util.py b/kart/output_util.py index b46b2f22..006031f8 100644 --- a/kart/output_util.py +++ b/kart/output_util.py @@ -8,7 +8,7 @@ import types from pathlib import Path -import orjson +import msgspec.json import pygments from pygments.lexers import JsonLexer @@ -16,15 +16,6 @@ _terminal_formatter = None -# note: `json` and `orjson` 
libraries aren't quite interchangeable. -# * orjson is much faster, so we use it where we can -# * orjson doesn't support custom separators -# * orjson doesn't support iterencode(), so can't stream unbounded iterators to stdout :( -ORJSON_OPTIONS = { - "compact": 0, # orjson doesn't support custom separators, so extracompact and compact look identical - "extracompact": 0, - "pretty": orjson.OPT_INDENT_2, -} JSON_PARAMS = { "compact": {}, "extracompact": {"separators": (",", ":")}, @@ -47,9 +38,9 @@ def __iter__(self): return itertools.chain(self._head, *self[:1]) -def orjson_encode_default(obj): +def msgspec_json_encode_default(obj): """ - Hook to extend the default serialisation of `orjson.dumps()` + Hook to extend the default serialisation of `msgspec.json.dumps()` """ if isinstance(obj, tuple): return list(obj) @@ -57,7 +48,10 @@ def orjson_encode_default(obj): if hasattr(obj, "__json__"): return obj.__json__() - raise TypeError + raise NotImplementedError + + +msgspec_json_encoder = msgspec.json.Encoder(enc_hook=msgspec_json_encode_default) class ExtendedJsonEncoder(json.JSONEncoder): diff --git a/requirements/requirements.in b/requirements/requirements.in index d39df898..348fba13 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -3,7 +3,6 @@ certifi click~=8.1.7 docutils<0.18 msgspec~=0.18.6 -orjson Pygments pymysql rst2txt diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 52536fe8..6d4684a6 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -38,8 +38,6 @@ jsonschema==4.17.3 # via -r requirements.in msgspec==0.18.6 # via -r requirements.in -orjson==3.10.11 - # via -r requirements.in #psycopg2==2.9.9 # via -r requirements/vendor-wheels.txt pycparser==2.21