diff --git a/conda_index/json2jlap.py b/conda_index/json2jlap.py
new file mode 100644
index 0000000..6eb4503
--- /dev/null
+++ b/conda_index/json2jlap.py
@@ -0,0 +1,207 @@
+"""
+json-to-jlap tool: "*/repodata.json" -> "*/repodata.jlap".
+
+Read */repodata.jlap
+
+Diff */repodata.json with */.cache/repodata.json.last
+
+Write */repodata.jlap
+
+Copy */repodata.json to */.cache/repodata.json.last
+
+Same for current_repodata.json / current_repodata.jlap
+
+If the output jlap is larger than a set size, remove older diffs.
+"""
+
+from __future__ import annotations
+
+import itertools
+import json
+import logging
+import shutil
+from hashlib import blake2b
+from io import IOBase
+from pathlib import Path
+
+import click
+import jsonpatch
+from conda.gateways.repodata.jlap.core import DEFAULT_IV, DIGEST_SIZE, JLAP
+
+log = logging.getLogger(__name__)
+
+# attempt to control individual patch size (will fall back to re-downloading
+# repodata.json) without serializing to bytes just to measure
+PATCH_STEPS_LIMIT = 8192
+
+
+def hfunc(data: bytes):
+    return blake2b(data, digest_size=DIGEST_SIZE)
+
+
+class HashReader:
+    """
+    Hash a file while it is being read.
+    """
+
+    def __init__(self, fp: IOBase):
+        self.fp = fp
+        self.hash = blake2b(digest_size=DIGEST_SIZE)
+
+    def read(self, size=None):
+        data = self.fp.read(size)
+        self.hash.update(data)
+        return data
+
+
+def hash_and_load(path):
+    with path.open("rb") as fp:
+        h = HashReader(fp)
+        obj = json.load(h)
+    return obj, h.hash.digest()
+
+
+def json2jlap_one(cache: Path, repodata: Path, trim_high=0, trim_low=0):
+    """
+    Update the jlap patchset for a single json file.
+
+    If trim_high and trim_low are given, the file will be shortened to less
+    than trim_low bytes once it exceeds trim_high bytes.
+    """
+    previous_repodata = cache / (repodata.name + ".last")
+
+    jlapfile = (repodata.parent / repodata.name).with_suffix(".jlap")
+    if jlapfile.exists():
+        patchfile = JLAP.from_path(jlapfile)
+        # omit final metadata, checksum lines
+        patches = patchfile[:-2]
+    else:
+        patches = JLAP.from_lines(
+            [DEFAULT_IV.hex().encode("utf-8")], iv=DEFAULT_IV, verify=False
+        )
+
+    repodata_stat = repodata.stat()
+    if previous_repodata.exists():
+        previous_repodata_stat = previous_repodata.stat()
+
+    if previous_repodata.exists() and (
+        repodata_stat.st_mtime_ns > previous_repodata_stat.st_mtime_ns
+        or repodata_stat.st_size != previous_repodata_stat.st_size
+    ):
+        current, current_digest = hash_and_load(repodata)
+        previous, previous_digest = hash_and_load(previous_repodata)
+
+        jpatch = jsonpatch.make_patch(previous, current)
+
+        # a byte-size limit would mean serializing the patch just to measure
+        # it; limit the number of steps instead
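+        # An oversized patch costs clients more than a fresh download; when
+        # we skip it, the metadata line below still advertises the new
+        # "latest" digest, so clients fall back to fetching the full file.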
+        if previous_digest == current_digest:
+            log.warning("Skip identical %s", repodata)
+        elif len(jpatch.patch) > PATCH_STEPS_LIMIT:
+            log.warning(
+                "Skip large %d-step patch for %s", len(jpatch.patch), repodata
+            )
+        else:
+            patches.add(
+                json.dumps(
+                    {
+                        "to": current_digest.hex(),
+                        "from": previous_digest.hex(),
+                        "patch": jpatch.patch,
+                    },
+                    sort_keys=True,
+                    separators=(",", ":"),
+                )
+            )
+
+        # metadata
+        patches.add(
+            json.dumps(
+                {"url": repodata.name, "latest": current_digest.hex()},
+                sort_keys=True,
+                separators=(",", ":"),
+            )
+        )
+
+        patches.terminate()
+
+        if trim_high > trim_low and patches[-1][0] > trim_high:
+            patches = trim(patches, trim_low)
+
+        patches.write(jlapfile)
+
+    if not previous_repodata.exists() or (
+        repodata_stat.st_mtime_ns > previous_repodata_stat.st_mtime_ns
+        or repodata_stat.st_size != previous_repodata_stat.st_size
+    ):
+        shutil.copyfile(repodata, previous_repodata)
+
+
+def trim(jlap: JLAP, target_size: int) -> JLAP:
+    """
+    Remove leading lines from jlap until it is below target_size, adding a
+    new first line with the correct iv. If target_size is impractically
+    small, return only the last payload line, in other words the metadata
+    footer plus leading and trailing checksums.
+
+    Input jlap must have at least 3 lines.
+    """
+    end_position = jlap[-1][0]
+
+    if end_position <= target_size:
+        return jlap
+
+    limit_position = end_position - target_size
+
+    trimmed = JLAP([element for element in jlap if element[0] >= limit_position])
+
+    # avoid writing JLAP with just leading, trailing checksums when
+    # target_size is too small
+    if len(trimmed) < 3:
+        trimmed = JLAP(jlap[-3:])
+
+    # replace the first line with the iv for the second line;
+    # breaks if trimmed is empty
+    trimmed[0] = (0, trimmed[0][2], trimmed[0][2])
+
+    return trimmed
+
+
+@click.command()
+@click.option("--cache", required=True, help="Cache directory.")
+@click.option("--repodata", required=True, help="Repodata directory.")
+@click.option(
+    "--trim-low",
+    required=False,
+    default=2**20 * 3,
+    show_default=True,
+    help="Maximum size after trim.",
+)
+@click.option(
+    "--trim-high",
+    required=False,
+    default=0,
+    show_default=True,
+    help="Trim if larger than size; 0 to disable.",
+)
+def json2jlap(cache, repodata, trim_high, trim_low):
+    cache = Path(cache).expanduser()
+    repodata = Path(repodata).expanduser()
+    repodatas = itertools.chain(
+        repodata.glob("*/repodata.json"), repodata.glob("*/current_repodata.json")
+    )
+    for repodata_json in repodatas:
+        # require conda-index's .cache folder
+        cachedir = Path(cache, repodata_json.parent.name, ".cache")
+        if not cachedir.is_dir():
+            continue
+        json2jlap_one(cachedir, repodata_json, trim_high, trim_low)
+
+
+def go():
+    logging.basicConfig(
+        format="%(message)s",
+        datefmt="%Y-%m-%dT%H:%M:%S",
+        level=logging.INFO,
+    )
+    json2jlap()
+
+
+if __name__ == "__main__":
+    go()
diff --git a/news/125-jlap b/news/125-jlap
new file mode 100644
index 0000000..bb288c3
--- /dev/null
+++ b/news/125-jlap
@@ -0,0 +1,21 @@
+### Enhancements
+
+* Add experimental `python -m conda_index.json2jlap` script to run after
+  indexing, to create `repodata.jlap` patch sets for incremental repodata
+  downloads. (#125)
+
+### Bug fixes
+
+*
+
+### Deprecations
+
+*
+
+### Docs
+
+*
+
+### Other
+
+*
diff --git a/tests/test_json2jlap.py b/tests/test_json2jlap.py
new file mode 100644
index 0000000..3c35884
--- /dev/null
+++ b/tests/test_json2jlap.py
@@ -0,0 +1,86 @@
+"""
+Test the companion json2jlap script, which compares repodata.json with
+.cache/repodata.json.last and generates the companion patchset repodata.jlap.
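+The patches are standard JSON Patch (RFC 6902) documents, as produced by the
+jsonpatch library.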
+The script itself is generic over any JSON file.
+"""
+
+import json
+
+import pytest
+from conda.gateways.repodata.jlap.core import DEFAULT_IV, JLAP
+
+from conda_index.json2jlap import json2jlap_one
+
+
+def test_json2jlap(tmp_path):
+    """
+    Test basic operation of the patch generator.
+    """
+    cache_dir = tmp_path / "subdir" / ".cache"
+    repodata = tmp_path / "subdir" / "repodata.json"
+    jlap_path = repodata.with_suffix(".jlap")
+    cache_dir.mkdir(parents=True)
+    for n in range(4):
+        # change size to avoid testing filesystem timestamp resolution
+        repodata.write_text(json.dumps({"n": "n" * n}))
+
+        json2jlap_one(cache_dir, repodata)
+
+    assert jlap_path.exists()
+    jlap = JLAP.from_path(jlap_path)
+    lines = len(jlap)
+    for i, (_, b, _) in enumerate(jlap):
+        if i == 0:
+            assert b == DEFAULT_IV.hex()
+        elif i == lines - 1:
+            assert len(b) == 64
+            assert int(b, 16)  # should succeed
+        else:
+            json.loads(b)  # should succeed
+
+
+@pytest.mark.parametrize("trim_high,trim_low", [[1500, 100], [8192, 1024]])
+def test_json2jlap_trim(tmp_path, trim_high, trim_low):
+    """
+    Test that we can correctly trim jlap files when they become too large, so
+    that the patchset stays more convenient than re-downloading the complete
+    file.
+
+    Test against unreasonably small sizes to make sure we don't produce
+    degenerate output, and against sizes that preserve at least a few lines
+    of patches.
+
+    In practice we've chosen low and high values of 3MB / 10MB.
+    """
+    cache_dir = tmp_path / "subdir" / ".cache"
+    repodata = tmp_path / "subdir" / "repodata.json"
+    jlap_path = repodata.with_suffix(".jlap")
+    cache_dir.mkdir(parents=True)
+
+    text = "spam" * 32
+    grows = {}
+    jlap_sizes = []
+    for n in range(64):
+        grows[f"n{n}"] = text
+        repodata.write_text(json.dumps(grows))
+
+        # this will cause it to be trimmed with checksums only, no footer
+        json2jlap_one(cache_dir, repodata, trim_high=trim_high, trim_low=trim_low)
+
+        try:
+            jlap_sizes.append(jlap_path.stat().st_size)
+        except FileNotFoundError:
+            assert n == 0
+
+    assert jlap_path.exists()
+    jlap = JLAP.from_path(jlap_path)
+    lines = len(jlap)
+    for i, (_, b, _) in enumerate(jlap):
+        if i in (0, lines - 1):
+            # a trimmed jlap no longer starts with 0's; instead it starts with
+            # an intermediate hash of the longer jlap that we now only see
+            # part of
+            assert b != DEFAULT_IV.hex()
+            assert len(b) == 64
+            assert int(b, 16)  # should succeed
+        else:
+            json.loads(b)  # should succeed
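+
+
+# Typical invocation, run after indexing a channel (paths are hypothetical);
+# json2jlap_one only processes subdirs that already contain conda-index's
+# .cache folder:
+#
+#   python -m conda_index.json2jlap --cache ./channel --repodata ./channel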