From 27663553aea64f03b875d5f379b6664d4c4f6102 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 8 Feb 2024 16:17:16 +0000 Subject: [PATCH] Update documentation --- _sources/spec.rst.txt | 66 +++++++++++++++++++++++++++++++++++++++++++ index.html | 1 + searchindex.js | 2 +- spec.html | 53 ++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 1 deletion(-) diff --git a/_sources/spec.rst.txt b/_sources/spec.rst.txt index 39a87c53..e9e51ffa 100644 --- a/_sources/spec.rst.txt +++ b/_sources/spec.rst.txt @@ -138,3 +138,69 @@ from the given URL, at an offset of 1000. + +Parquet references +------------------ + +Since JSON is rather verbose, it is easy with enough chunks to make a references file +that is too big: slow to load and heavy on memory. Although the former can be +alleviated by compression (I recommend Zstd), the latter cannot. This can +become particularly apparent during the combine phase when loading many reference sets. + +The class `fsspec.implementations.reference.LazyReferenceMapper`_ provides an +alternative *implementation*, and its on-disk layout is effectively a new reference +spec, which we describe here. The class itself has a dict mapper interface, just +like the rendered references from JSON files, except that it assumes that it is +working on a zarr dataset. This is because the references are split into files, and +an array's shape/chunk information is used to figure out which reference file +to load. + +.. _fsspec.implementations.reference.LazyReferenceMapper: https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=lazyreference#fsspec.implementations.reference.LazyReferenceMapper + +The following code + +.. code-block:: python + + lz = fsspec.implementations.reference.LazyReferenceMapper.create("ref.parquet") + z = zarr.open_group(lz, mode="w") + d = z.create_dataset("name", shape=(1,)) + d[:] = 1 + g2 = z.create_group("deep") + d = g2.create_dataset("name", shape=(1,)) + d[:] = 1 + +produces files + +.. 
code-block:: text + + ref.parquet/deep/name/refs.0.parq + ref.parquet/name/refs.0.parq + ref.parquet/.zmetadata + +Here, .zmetadata is all of the metadata of all subgroups/arrays (similar to +zarr "consolidated metadata"), with two top-level fields: "metadata" (dict[str, str], all of the +zarr metadata key/values) and "record_size", an integer set during ``.create()``. + +Each parquet file contains the references for the path in which it is located. +For example, key "name/0" will be the zeroth reference in "./name/refs.0.parq". If +there are multiple dimensions, normal C indexing is used to find the Nth reference, +and there are up to "record_size" references (default 10000) in the first file; +references 10000 to 19999 would be in "./name/refs.1.parq". Each file is (for now) +padded to record_size, but they compress really well. + +Each row of the parquet data contains fields + +.. code-block:: + + path: optional str/categorical, remote location URL + offset: int, start location of block + size: int, number of bytes in block + raw: optional bytes, binary data + +If ``raw`` is populated, this is the data of the key. If ``path`` is +populated but ``size`` is 0, it is the whole file indicated. Otherwise, +it is a byte block in the indicated file. If both ``raw`` and ``path`` +are NULL, the key does not exist. + +We reserve the possibility to store small array data in .zmetadata instead +of creating a small/mostly empty parquet file for each. diff --git a/index.html b/index.html index 09907596..7055548a 100644 --- a/index.html +++ b/index.html @@ -161,6 +161,7 @@

IntroductionReferences specification
  • Beyond Python
  • diff --git a/searchindex.js b/searchindex.js index 4f51b173..9a2a5b5a 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["advanced", "beyond", "cases", "contributing", "detail", "index", "nonzarr", "reference", "spec", "test_example", "tutorial"], "filenames": ["advanced.rst", "beyond.rst", "cases.rst", "contributing.rst", "detail.rst", "index.rst", "nonzarr.rst", "reference.rst", "spec.rst", "test_example.rst", "tutorial.rst"], "titles": ["Advanced Topics", "Beyond Python", "Case studies", "Contributing to kerchunk", "Detailed description", "kerchunk", "Non-zarr uses", "API Reference", "References specification", "Quick Start", "Tutorial"], "terms": {"scan": [0, 4, 6, 9], "combin": [0, 3, 5, 9], "dataset": [0, 1, 2, 3, 4, 5, 6, 7, 9], "can": [0, 1, 3, 4, 5, 6, 7, 9, 10], "computation": 0, "intens": [0, 4], "mai": [0, 3, 6, 7, 8], "requir": [0, 1, 2, 3, 7, 8, 10], "lot": [0, 6], "bandwidth": 0, "some": [0, 1, 4, 5, 6, 7, 9], "data": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10], "format": [0, 1, 2, 4, 5, 6, 8], "where": [0, 3, 4, 5, 6, 7, 8, 9, 10], "target": [0, 1, 7], "contain": [0, 3, 7, 10], "mani": [0, 2, 3, 4, 5, 7], "input": [0, 7, 8, 10], "make": [0, 3, 4, 6], "sens": 0, "parallelis": 0, "job": 0, "mayb": [0, 1, 7], "distribut": [0, 4, 6, 7], "workload": [0, 4], "cluster": [0, 4, 7], "get": [0, 4, 6, 7], "addit": [0, 5, 9], "cpu": [0, 4, 10], "network": [0, 5], "perform": [0, 6, 7], "The": [0, 1, 2, 3, 4, 5, 6, 7, 8, 10], "simplest": [0, 10], "case": [0, 4, 5, 6, 7, 9, 10], "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "process": [0, 4, 5, 6, 7, 10], "individu": [0, 7, 9, 10], "let": 0, "": [0, 3, 6, 7, 8, 10], "sai": 0, "you": [0, 1, 3, 4, 5, 7, 9], "have": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10], "list": [0, 2, 7, 10], "need": [0, 1, 2, 3, 4, 5, 6, 7, 9], "encapsul": 0, "each": [0, 1, 2, 4, 7, 9, 10], "singl": [0, 2, 4, 5, 7], "function": [0, 3, 7, 10], "In": [0, 3, 4, 5, 6, 9, 10], "thi": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10], "mode": [0, 
10], "typic": [0, 10], "save": [0, 6, 7, 9, 10], "output": [0, 5, 7], "although": [0, 7], "return": [0, 3, 7, 8, 10], "them": [0, 2, 3, 4, 5, 6, 7, 9, 10], "ok": [0, 7], "too": [0, 5], "especi": 0, "mean": [0, 4, 5, 6, 7, 9], "immedi": 0, "here": [0, 2, 3, 6, 8, 10], "an": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10], "exampl": [0, 1, 2, 3, 4, 5, 7, 8, 10], "hdf5": [0, 2, 4, 5, 7, 9], "caller": 0, "should": [0, 1, 3, 6, 7, 8, 9, 10], "sure": [0, 3], "option": [0, 7, 8], "ani": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10], "paramet": [0, 7], "transform": 0, "ar": [0, 2, 3, 4, 5, 6, 7, 8, 10], "place": [0, 4, 7, 10], "import": [0, 3, 9, 10], "ujson": [0, 7, 10], "fsspec": [0, 2, 3, 5, 7, 8, 9, 10], "def": [0, 10], "url": [0, 1, 4, 7, 8, 9, 10], "outputfil": 0, "storage_options_in": [0, 7], "storage_options_out": [0, 7], "kerchunk": [0, 2, 4, 7, 9], "hdf": [0, 4, 5, 7, 9, 10], "singlehdf5tozarr": [0, 7, 9, 10], "ref": [0, 7, 8, 10], "translat": [0, 5, 7, 9, 10], "open": [0, 3, 7, 9, 10], "wt": 0, "f": [0, 7, 8, 10], "dump": [0, 10], "task": 0, "delai": 0, "u": [0, 2, 4, 8, 9, 10], "o": [0, 10], "zip": [0, 7], "infilenam": 0, "outfilenam": 0, "comput": [0, 5, 10], "itself": 0, "slow": 0, "memori": [0, 2, 5, 7, 9, 10], "hungri": 0, "refer": [0, 1, 2, 3, 4, 5, 9, 10], "set": [0, 1, 7, 8, 9], "batch": [0, 7], "which": [0, 3, 4, 6, 7, 8, 9, 10], "reduc": [0, 7], "redund": 0, "between": [0, 3, 10], "result": [0, 7, 9, 10], "techniqu": [0, 4], "known": [0, 6, 7, 10], "do": [0, 4, 7, 9], "hand": 0, "seen": 0, "we": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10], "also": [0, 2, 5, 7, 8, 10], "provid": [0, 5, 7, 8, 9, 10], "auto_dask": [0, 7], "conveni": [0, 10], "one": [0, 3, 4, 5, 6, 7, 8, 9, 10], "stop": [0, 7, 8, 10], "call": [0, 7], "those": [0, 4], "final": [0, 7, 10], "take": [0, 3, 4, 5, 8, 10], "number": [0, 4, 5, 7, 10], "dict": [0, 7, 9, 10], "argument": [0, 7, 9, 10], "user": [0, 3, 5, 7, 9, 10], "consult": 0, "docstr": 0, "specif": [0, 1, 3, 4, 5, 7, 10], "class": [0, 7], "decod": [0, 5, 7, 8], 
"multizarrtozarr": [0, 7, 9, 10], "note": [0, 2, 7, 8, 9], "preprocess": [0, 7], "befor": [0, 3, 7, 10], "stage": [0, 7], "postprocess": [0, 7], "onli": [0, 2, 3, 4, 5, 6, 7, 10], "after": [0, 3, 4, 7], "It": [0, 1, 3, 5, 7, 8, 10], "often": [0, 5, 6, 7], "wrap": [0, 7], "multipl": [0, 4, 5, 7, 9], "tar": [0, 5, 7], "If": [0, 3, 4, 6, 7], "support": [0, 4, 5, 10], "thei": [0, 3, 4, 6, 7, 10], "directli": [0, 4, 7, 10], "someth": 0, "like": [0, 1, 4, 5, 6, 7, 10], "netcdf3": [0, 7, 10], "netcdf3tozarr": [0, 7], "myfil": 0, "nc": [0, 8, 10], "inline_threshold": [0, 7, 9, 10], "0": [0, 5, 7, 9, 10], "out": [0, 6, 7, 9, 10], "member": [0, 7], "local": [0, 3, 5, 6, 7, 10], "turn": [0, 7], "off": [0, 3], "inlin": [0, 7, 10], "done": [0, 6, 7, 9], "later": 0, "util": [0, 5], "do_inlin": [0, 7], "come": [0, 8], "At": 0, "point": [0, 3, 7, 9], "gener": [0, 2, 7, 9, 10], "problemat": 0, "load": [0, 1, 5, 7, 10], "so": [0, 2, 3, 4, 5, 6, 7, 9, 10], "rang": [0, 5, 7], "origin": [0, 3, 4, 5, 7, 10], "instead": [0, 7, 10], "back": 0, "nomin": 0, "form": [0, 8, 10], "readi": [0, 3], "autom": 0, "step": [0, 3, 7, 8, 9], "futur": [0, 8], "out2": 0, "dereference_arch": [0, 7], "100": [0, 7, 8, 9, 10], "consolid": [0, 5, 7, 9, 10], "now": [0, 3, 5, 10], "all": [0, 2, 4, 5, 6, 7, 9, 10], "For": [0, 3, 4, 5, 6, 7, 8, 10], "uncompress": [0, 7], "access": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10], "wai": [0, 5, 7, 10], "json": [0, 1, 4, 5, 7, 8], "veri": [0, 5, 6], "becaus": [0, 6, 7, 10], "human": 0, "readabl": 0, "ubiquit": 0, "howev": [0, 6], "most": 0, "effici": [0, 4, 5, 6, 7], "term": [0, 6], "size": [0, 1, 2, 4, 7, 8, 10], "pars": [0, 6], "speed": 0, "python": [0, 4, 5, 7, 8, 10], "particular": [0, 1, 4, 6, 10], "ad": [0, 3], "downsid": 0, "repeat": 0, "string": [0, 6, 7, 8, 10], "becom": [0, 7], "separ": [0, 2, 5, 7], "instanc": [0, 4, 6, 7, 8, 10], "greatli": 0, "inflat": 0, "footprint": [0, 7], "time": [0, 2, 3, 4, 7, 9, 10], "To": [0, 1, 3, 5, 7, 10], "overcom": 0, "problem": [0, 5], 
"keep": [0, 3, 7, 9], "down": 0, "end": [0, 3, 6], "convert": [0, 7], "store": [0, 1, 3, 4, 5, 6, 7, 8, 10], "implement": [0, 1, 7], "referencefilesystem": [0, 4, 8], "altern": [0, 9, 10], "new": [0, 3, 5, 7, 8, 10], "design": [0, 4, 5], "work": [0, 2, 3, 5, 7], "principl": 0, "benefit": 0, "path": [0, 3, 4, 7, 8, 9, 10], "much": [0, 5, 6], "more": [0, 1, 3, 4, 5, 6, 7, 8, 10], "compact": 0, "2x": 0, "smaller": [0, 7, 10], "than": [0, 4, 7, 8], "compress": [0, 4, 5, 6, 10], "10x": 0, "correspondingli": 0, "faster": [0, 10], "instanti": 0, "filesystem": [0, 5, 7, 9, 10], "sinc": [0, 6, 9], "taken": [0, 4, 7, 10], "byte": [0, 4, 5, 7, 8], "e": [0, 3, 4, 5, 7, 8], "g": [0, 4, 5, 7], "int": [0, 7, 8], "28": 0, "arrai": [0, 4, 5, 6, 7, 8, 10], "4": [0, 4, 8, 10], "8": [0, 3, 10], "lazi": [0, 7, 10], "partit": [0, 6], "kei": [0, 1, 7, 8, 10], "variabl": [0, 2, 4, 7, 8], "actual": [0, 6, 7], "dictionari": [0, 7, 10], "encod": [0, 1, 4, 5, 7, 10], "per": [0, 4, 7], "uniqu": [0, 7, 10], "onc": [0, 3, 4, 6, 10], "df": [0, 7], "refs_to_datafram": [0, 7], "exist": [0, 5, 7], "care": 0, "read": [0, 4, 5, 6, 7, 9], "recommend": [0, 7, 9, 10], "understand": [0, 5], "oper": [0, 4, 10], "therefor": 0, "larg": [0, 3, 4, 5, 6, 7], "merg": [0, 3, 7], "abl": [0, 3, 4, 6], "via": [0, 1, 5, 6], "A": [0, 9, 10], "concret": 0, "workflow": [0, 7, 10], "follow": [0, 5, 9, 10], "execut": [0, 7], "first": [0, 3, 4, 6, 7, 9, 10], "three": [0, 8], "go": [0, 5], "avail": [0, 2, 7, 10], "from": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10], "lazyreferencemapp": [0, 7], "tempfil": 0, "temporarydirectori": 0, "xarrai": [0, 3, 9, 10], "xr": [0, 9, 10], "location_of_data": 0, "creat": [0, 2, 4, 5, 7, 9, 10], "pass": [0, 4, 7, 8, 10], "makedir": 0, "parq": 0, "record_s": [0, 7], "1000": [0, 7, 8], "root": [0, 7], "single_ref_set": 0, "_": [0, 3, 8, 10], "out_dict": 0, "remote_protocol": [0, 7, 9, 10], "s3": [0, 2, 5, 8, 9, 10], "concat_dim": [0, 7, 9, 10], "remote_opt": [0, 7, 9, 10], "anon": [0, 9, 10], "true": [0, 
4, 7, 9, 10], "flush": 0, "target_protocol": [0, 7], "d": [0, 7, 9, 10], "open_dataset": [0, 9, 10], "get_mapp": [0, 10], "engin": [0, 9, 10], "zarr": [0, 1, 5, 7, 8, 9, 10], "backend_kwarg": [0, 9, 10], "fals": [0, 7, 9, 10], "ha": [0, 3, 6, 7, 10], "metadata": [0, 2, 4, 5, 6, 7, 8], "coordin": [0, 5, 7, 10], "main": [0, 2, 3, 7], "correspond": 0, "been": [0, 7, 8, 10], "touch": 0, "even": [0, 3, 6, 9, 10], "500mb": 0, "As": 0, "demand": 0, "cach": [0, 10], "worker": [0, 7], "file": [1, 2, 3, 4, 5, 6, 8], "current": [1, 3, 5, 7, 10], "readili": [1, 4], "interpret": [1, 3, 7, 8], "languag": [1, 4], "document": [1, 2, 5, 6, 10], "detail": [1, 5], "content": [1, 6, 7, 8], "either": [1, 3, 10], "binari": [1, 5, 6, 7, 8, 10], "offset": [1, 6, 8], "long": [1, 3, 6], "being": [1, 3, 7], "us": [1, 2, 3, 4, 5, 7, 8], "type": [1, 3, 4, 5, 7, 10], "block": [1, 4, 5, 6, 7, 10], "part": [1, 3, 4, 6], "well": 1, "whichev": 1, "codec": [1, 4, 5], "noth": 1, "plain": 1, "common": [1, 7], "compressor": [1, 6, 8], "gzip": 1, "might": [1, 4, 8, 9], "would": [1, 7, 9, 10], "write": [1, 4, 7, 9, 10], "code": [1, 2, 5, 9, 10], "expos": [1, 3], "storag": [1, 4, 5, 6, 7, 8, 10], "object": [1, 5, 7, 8, 10], "One": 1, "j": 1, "appli": [1, 5, 7, 10], "multi": [1, 5], "scale": 1, "tiff": [1, 5, 7, 10], "microscopi": 1, "found": [1, 3, 7, 9], "http": [1, 2, 3, 4, 5, 8, 9, 10], "observablehq": 1, "com": [1, 2, 3, 9, 10], "manzt": 1, "om": 1, "filesystemrefer": 1, "complet": [2, 4, 5], "reproduc": 2, "made": [2, 4, 7], "link": [2, 9], "possibli": 2, "notebook": [2, 5], "benchmark": 2, "page": [2, 3, 5], "progress": 2, "repo": [2, 3, 7], "intak": [2, 9, 10], "catalogu": 2, "clean": 2, "up": [2, 4], "standard": [2, 7], "nativ": 2, "geotiff": [2, 7], "effect": [2, 3, 5, 6], "400tb": 2, "1": [2, 5, 7, 10], "earthbigdata": 2, "websit": 2, "west": [2, 10], "2": [2, 7, 8, 10], "amazonaw": 2, "discuss": 2, "github": [2, 3, 9, 10], "issu": [2, 3, 10], "78": 2, "script": [2, 10], "cgohlk": 2, "tifffil": 
[2, 7], "blob": 2, "v2021": 2, "10": [2, 7, 10], "py": [2, 3], "nbviewer": 2, "org": [2, 3], "ipynb": 2, "fit": [2, 4, 5, 7, 10], "400gb": 2, "wavelength": 2, "filter": [2, 7], "present": [2, 4, 6, 9], "date": [2, 3], "ob": 2, "nearest": 2, "preced": 2, "94a": 2, "imag": [2, 4], "other": [2, 4, 5, 6, 7, 8, 10], "maintain": 2, "axi": [2, 7], "sdo": 2, "netcdf4": [2, 4, 5, 9, 10], "80tb": 2, "dask": [2, 4, 5, 7, 10], "tee": 2, "reduct": 2, "were": [2, 4, 6, 7], "aggreg": [2, 4, 5, 7], "gist": 2, "rsignel": 2, "usg": 2, "ef435a53ac530a2843ce7e1d59f96e22": 2, "02da7d9257b4b26d84d053be1af2ceeb": 2, "66tb": 2, "On": 2, "disk": [2, 4], "16tb": 2, "podaac": 2, "jpl": 2, "nasa": 2, "gov": 2, "l4": 2, "glob": [2, 10], "v4": 2, "cgentemann": 2, "cloud_scienc": 2, "master": [2, 3], "zarr_meta": 2, "cloud_mur_v41_benchmark": 2, "sea": 2, "surfac": 2, "temperatur": 2, "includ": [2, 5, 6, 7, 8, 10], "see": [2, 3, 4, 7, 9], "how": [2, 3, 4, 5, 7, 9], "establish": 2, "earthdata": 2, "credenti": [2, 3, 10], "necessari": [2, 3, 10], "grib2": [2, 5, 7, 10], "5gb": 2, "11": 2, "subset": 2, "rapidrefresh": 2, "noaa": [2, 9], "peterm790": 2, "92eb1df3d58ba41d3411f8a840be2452": 2, "high": 2, "resolut": 2, "rapid": 2, "refresh": 2, "real": [2, 7], "3": [2, 3, 8, 10], "km": 2, "hourli": 2, "updat": [2, 3, 7], "cloud": [2, 4, 5, 9], "resolv": [2, 3], "convect": 2, "allow": [2, 4, 5, 6, 7, 10], "atmospher": 2, "extract": [2, 4, 5, 7, 10], "section": [2, 8], "match": [2, 8, 10], "heightaboveground": [2, 7], "base": [3, 9], "guid": 3, "panda": [3, 7], "xbatcher": 3, "activ": 3, "break": 3, "introduc": 3, "its": [3, 8, 10], "state": 3, "possibl": [3, 4, 5, 6, 8, 10], "experienc": 3, "build": 3, "indic": [3, 6, 8], "todai": 3, "continu": 3, "usual": [3, 7], "don": 3, "t": 3, "instal": [3, 9, 10], "project": [3, 4], "submit": 3, "repositori": 3, "git": [3, 10], "version": [3, 5, 7], "control": 3, "track": 3, "instruct": 3, "setup": 3, "your": 3, "next": [3, 9], "navig": 3, "click": 3, "button": 3, 
"top": [3, 7], "right": 3, "own": 3, "want": [3, 4, 7], "clone": 3, "onto": 3, "machin": [3, 4, 10], "termin": [3, 6], "command": [3, 9], "prompt": 3, "yourusernam": 3, "cd": 3, "remot": [3, 7, 9, 10], "add": [3, 7, 8], "upstream": 3, "directori": [3, 4, 6, 7], "connect": 3, "sourc": [3, 5, 7, 10], "ensur": 3, "same": [3, 4, 6, 7, 10], "everyon": 3, "els": 3, "virtual": [3, 5, 6, 9, 10], "isol": 3, "depend": [3, 10], "anaconda": 3, "miniconda": 3, "conda": 3, "solv": [3, 5], "try": 3, "mamba": 3, "mirror": 3, "written": [3, 6, 7, 9], "c": [3, 5, 7, 8], "ll": 3, "env": 3, "name": [3, 4, 6, 7, 10], "ci": 3, "py3": 3, "yml": [3, 10], "correct": [3, 4], "home": 3, "pip": [3, 10], "wa": [3, 4, 5, 9], "success": 3, "start": [3, 5, 6, 7, 8], "__version__": 3, "view": [3, 5], "info": 3, "deactiv": 3, "full": [3, 7], "doc": 3, "manag": [3, 10], "lint": 3, "style": [3, 7], "whenev": 3, "edit": [3, 10], "re": [3, 7, 10], "reflect": 3, "product": [3, 7, 8], "shini": 3, "checkout": 3, "abov": [3, 7, 9, 10], "simplifi": 3, "b": [3, 8, 10], "clear": 3, "what": [3, 7, 9], "bring": 3, "switch": 3, "retriev": 3, "fetch": [3, 5, 7], "latest": 3, "lead": [3, 4, 6], "conflict": 3, "must": [3, 7, 8], "pull": [3, 6], "uncommit": 3, "stash": 3, "prior": 3, "reappli": 3, "pytest": 3, "framework": [3, 4], "ideal": 3, "coverag": 3, "improv": 3, "appreci": 3, "within": [3, 4, 5, 7, 8], "restructur": 3, "text": [3, 6, 8], "rst": 3, "similar": 3, "markdown": 3, "These": [3, 7, 10], "built": 3, "html": 3, "sphinx": 3, "www": 3, "en": 3, "packag": [3, 9, 10], "m": [3, 10], "r": [3, 10], "txt": 3, "feel": 3, "good": [3, 6], "about": [3, 4, 5], "statu": 3, "messag": [3, 7], "when": [3, 6, 7, 10], "appear": [3, 7], "publicli": 3, "push": 3, "default": [3, 7, 10], "given": [3, 4, 6, 7, 8], "v": 3, "banner": 3, "demonstr": [4, 10], "librari": [4, 5, 7, 9], "parallel": [4, 5, 6, 9], "friendli": [4, 5], "motiv": 4, "attempt": [4, 7], "alreadi": [4, 7, 10], "direct": [4, 5, 6], "whether": [4, 6, 7], 
"over": [4, 5], "nevertheless": 4, "abil": [4, 6], "arrang": [4, 5], "chunk": [4, 5, 7, 8, 9, 10], "concurr": [4, 5], "index": [4, 5, 6, 7, 10], "power": [4, 5], "action": [4, 7], "thread": 4, "independ": 4, "overheard": 4, "system": [4, 5, 6, 7, 10], "account": 4, "speedup": 4, "equal": [4, 7, 8], "core": 4, "leverag": 4, "contrast": 4, "mutipl": 4, "wait": 4, "extern": 4, "latenc": [4, 5, 10], "compar": 4, "domin": [4, 8], "total": [4, 7, 10], "request": [4, 5, 7], "launch": 4, "pai": 4, "overhead": 4, "cost": [4, 5, 6], "togeth": [4, 7], "both": [4, 6, 7, 10], "simultan": 4, "numer": 4, "modern": [4, 6], "int32": 4, "valu": [4, 6, 7, 8, 10], "n": [4, 8, 10], "element": [4, 8], "nx4": 4, "remain": [4, 7], "dimens": [4, 5, 7, 8, 10], "cannot": 4, "represent": 4, "perhap": 4, "through": [4, 7, 9, 10], "doe": [4, 5, 9], "almost": 4, "area": 4, "describ": [4, 5], "relat": 4, "anoth": [4, 10], "attribut": [4, 5, 7, 8, 10], "associ": [4, 8], "backend": [4, 5], "interfac": 4, "could": [4, 7, 8, 9, 10], "ingest": 4, "anywher": [4, 9], "idea": 4, "arbitrari": [4, 5, 8, 10], "map": [4, 7, 10], "pathnam": 4, "static": 4, "whole": [4, 6, 7, 8], "differ": [4, 5, 7, 10], "thu": [4, 10], "assign": [4, 6, 10], "posit": 4, "tree": [4, 7], "two": [4, 6, 7, 10], "small": 4, "special": [4, 7, 10], "zarrai": [4, 8], "zattr": [4, 8], "give": [4, 7], "inform": [4, 5, 7, 9, 10], "shape": [4, 7], "themselv": 4, "expect": [4, 7, 8], "live": [4, 7, 9], "var": [4, 7, 10], "x": [4, 8], "y": [4, 10], "across": [4, 5, 7], "construct": [4, 6, 9], "exact": [4, 7], "filenam": [4, 7], "unifi": 5, "repres": [5, 8], "varieti": [5, 10], "netcdf": [5, 7], "tradit": 5, "flexibl": 5, "potenti": 5, "situ": 5, "without": [5, 6, 10], "copi": [5, 6, 7], "gatewai": 5, "massiv": 5, "while": 5, "still": [5, 6, 10], "insist": 5, "legaci": 5, "archiv": [5, 6, 7], "why": 5, "thing": 5, "serverless": 5, "architectur": 5, "plu": 5, "physic": 5, "gc": 5, "abf": 5, "alibaba": 5, "dropbox": 5, "gdrive": 5, "protocol": 
[5, 7, 8], "ftp": 5, "ssh": 5, "smb": 5, "variou": 5, "heterogen": 5, "driver": [5, 7], "h5py": [5, 9], "asynchron": 5, "amort": 5, "lock": 5, "logic": [5, 6, 7], "million": 5, "subselect": 5, "scientif": 5, "datatyp": 5, "amount": 5, "observ": 5, "simul": 5, "handl": 5, "download": [5, 6], "era": 5, "answer": 5, "move": [5, 6], "collect": 5, "premis": 5, "hard": [5, 6], "ineffici": 5, "seamlessli": 5, "essenti": 5, "involv": [5, 9], "buffer": [5, 6], "disc": 5, "descript": [5, 6, 7, 8], "wide": 5, "primari": [5, 7], "purpos": 5, "find": [5, 10], "pleas": 5, "Or": 5, "consid": [5, 8, 10], "pydata": 5, "talk": 5, "quick": 5, "tutori": 5, "reader": [5, 7], "studi": 5, "sentinel": 5, "global": [5, 10], "coher": 5, "solar": 5, "dynam": 5, "observatori": 5, "nation": 5, "water": 5, "model": 5, "mur": 5, "sst": 5, "hrrr": 5, "beyond": 5, "non": [5, 7, 8], "zstd": [5, 10], "csv": 5, "parquet": [5, 7], "orc": 5, "feather": 5, "api": [5, 6, 10], "contribut": 5, "bug": 5, "report": 5, "featur": [5, 6, 8], "chang": [5, 7, 10], "advanc": 5, "topic": 5, "modul": [5, 10], "search": [5, 6, 10], "major": 6, "focus": 6, "applic": 6, "structur": [6, 7, 8], "piec": 6, "yet": [6, 7], "develop": [6, 10], "concaten": [6, 7, 10], "space": 6, "transfer": 6, "upon": 6, "happen": [6, 7], "unpack": 6, "simpl": [6, 7, 10], "enough": 6, "wise": 6, "best": 6, "world": 6, "achiev": [6, 10], "sore": 6, "subsequ": 6, "row": 6, "newlin": 6, "charact": [6, 10], "random": 6, "record": [6, 7], "quot": 6, "know": [6, 10], "inde": 6, "misidentif": 6, "failur": 6, "just": [6, 7, 10], "determin": 6, "safe": 6, "embed": [6, 7], "tabular": [6, 7], "sometim": 6, "low": 6, "cardin": 6, "easier": 6, "exclud": 6, "unneed": 6, "month": [6, 10], "001": 6, "column": [6, 7], "wish": [6, 7], "smart": 6, "around": 6, "raw": [6, 7], "serialis": [6, 7], "arrow": 6, "pyarrow": 6, "h5f": [7, 10], "binaryio": 7, "str": [7, 8], "none": 7, "spec": [7, 8], "500": 7, "storage_opt": [7, 9, 10], "error": 7, "warn": 7, 
"vlen_encod": 7, "emb": 7, "group": [7, 10], "duck": 7, "adher": 7, "uri": 7, "produc": 7, "readm": 7, "zero": [7, 8], "neg": 7, "disabl": 7, "pdb": 7, "ignor": 7, "rais": 7, "null": [7, 8], "leav": 7, "vlen": 7, "16byte": 7, "garbag": 7, "id": [7, 10], "unaffect": 7, "empti": 7, "tabl": 7, "few": 7, "suppli": 7, "fill": [7, 10], "customis": 7, "method": [7, 10], "entri": [7, 8], "No": 7, "scan_grib": 7, "skip": 7, "locat": [7, 10], "common_var": 7, "depr": 7, "keyword": 7, "cf": [7, 10], "typeoflevel": 7, "level": 7, "process_fil": 7, "extens": 7, "primary_attr_to_group": 7, "ordin": 7, "integ": 7, "bool": 7, "tiff_to_zarr": 7, "urlpath": 7, "target_opt": [7, 9, 10], "writer": 7, "max_chunk_s": 7, "kwarg": 7, "scipi": 7, "behav": 7, "valid": [7, 10], "netcdf2": 7, "test": 7, "__init__": 7, "below": [7, 9, 10], "big": [7, 10], "trigger": 7, "subchunk": 7, "never": 7, "000byte": 7, "6000": 7, "split": [7, 9, 10], "biggest": 7, "tbc": 7, "arg": [7, 10], "superclass": 7, "io": 7, "netcdf_fil": 7, "gribcodec": 7, "dtype": [7, 8, 10], "grib": 7, "stream": 7, "eccod": 7, "asciitablecodec": 7, "indtyp": 7, "outdtyp": 7, "ascii": [7, 8], "field": 7, "fillstringscodec": 7, "id_map": 7, "fix": 7, "length": [7, 8], "valul": 7, "opaqu": 7, "16": [7, 10], "vararrcodec": 7, "dt_in": 7, "dt_out": 7, "nrow": 7, "bintabl": 7, "recordarraymemb": 7, "compon": 7, "complex": [7, 10], "desir": 7, "subarrai": 7, "overal": 7, "parsabl": 7, "np": [7, 10], "indict": 7, "coo_map": 7, "coo_dtyp": [7, 10], "identical_dim": [7, 10], "assicu": 7, "expand": [7, 8], "selector": 7, "varnam": 7, "look": 7, "constant": 7, "compil": [7, 10], "regex": [7, 10], "pattern": [7, 10], "exactli": 7, "begin": 7, "attr": [7, 10], "vattr": [7, 10], "far": 7, "appropri": 7, "cftime": 7, "datetim": [7, 10], "automat": 7, "unless": 7, "specifi": [7, 10], "m8": [7, 10], "convers": 7, "beforehand": 7, "signatur": 7, "fn": [7, 10], "counter": 7, "probe": 7, "coerc": 7, "otherwis": 7, "numpi": 7, "vari": [7, 10], 
"callabl": 7, "act": 7, "drop": [7, 10], "append": [7, 9], "rather": [7, 10], "scratch": 7, "assum": 7, "classmethod": 7, "original_ref": 7, "There": [7, 10], "usag": [7, 10], "amend": 7, "extend": 7, "creation": 7, "merge_var": [7, 10], "ident": [7, 10], "openfil": 7, "open_fil": 7, "concatenate_arrai": 7, "key_seper": 7, "check_arrai": 7, "along": [7, 9, 10], "concatent": 7, "recombin": 7, "single_zarr": 7, "hierarchi": 7, "recreat": 7, "check": 7, "inconsist": 7, "except": 7, "certain": 7, "compat": [7, 8], "single_driv": 7, "single_kwarg": 7, "mzz_kwarg": 7, "n_batch": 7, "output_opt": 7, "run": [7, 9, 10], "client": 7, "mzz": [7, 9, 10], "multpl": 7, "thereof": 7, "ouput": 7, "preprocessor": 7, "remov": 7, "rename_target": 7, "renam": [7, 10], "predict": 7, "templat": [7, 8], "easili": 7, "overrid": 7, "rewrit": 7, "everi": 7, "old": 7, "alter": 7, "rename_target_fil": 7, "url_in": 7, "url_out": 7, "renate_target": 7, "overwrit": 7, "generate_coord": 7, "tag": 7, "probabl": 7, "tupl": 7, "order": 7, "factor": 7, "largest": 7, "deep": 7, "divisor": 7, "modifi": 7, "threshold": [7, 10], "replac": [7, 9, 10], "short": [7, 10], "base64": [7, 8], "larger": [7, 10], "inline_arrai": 7, "fewer": 7, "constitu": 7, "prevent": 7, "dot": 7, "irrespect": 7, "fo": [7, 9, 10], "100000": 7, "categorical_threshold": 7, "mimic": 7, "normal": 7, "datafram": 7, "writabl": 7, "deriv": 7, "extra": 7, "10000": [7, 8], "bigger": 7, "categor": 7, "ratio": 7, "greater": 7, "kind": 8, "prototyp": 8, "key0": 8, "key1": 8, "target_url": 8, "item": [8, 10], "propos": 8, "zgroup": 8, "zarr_format": 8, "convent": [8, 10], "ugrid": 8, "9": [8, 10], "_array_dimens": 8, "node": 8, "9228245": 8, "f8": 8, "bucket": 8, "294094376": 8, "73825960": 8, "gen": 8, "ought": 8, "previou": 8, "definit": 8, "enhanc": 8, "template_nam": 8, "jinja": 8, "variable_nam": 8, "OR": 8, "key_nam": 8, "render": 8, "jinja2": 8, "equival": 8, "liter": [8, 10], "annot": 8, "server": 8, "domain": 8, "gen_kei": 8, "5": 
[8, 10], "key2": 8, "key3": 8, "cartesian": 8, "evalu": 8, "gen_key0": 8, "path_0": 8, "gen_key1": 8, "path_1": 8, "2000": [8, 10], "gen_key2": 8, "path_2": 8, "3000": 8, "gen_key3": 8, "path_3": 8, "4000": 8, "gen_key4": 8, "path_4": 8, "5000": 8, "emsembl": 9, "defin": [9, 10], "urllist": 9, "simpli": 9, "redo": 9, "p": 9, "nwm": 9, "retro": 9, "v2": 9, "pd": [9, 10], "full_phys": 9, "2017": 9, "201704010000": 9, "chrtout_domain1": 9, "comp": 9, "201704010100": 9, "201704010200": 9, "201704010300": 9, "201704010400": 9, "201704010500": 9, "201704010600": 9, "201704010700": 9, "201704010800": 9, "201704010900": 9, "default_fill_cach": [9, 10], "default_cache_typ": [9, 10], "inf": 9, "h5chunk": [9, 10], "ensembl": [9, 10], "again": 9, "crucial": 9, "restrict": 9, "person": 9, "initi": [9, 10], "analysi": 9, "veloc": 9, "invoc": 9, "littl": [9, 10], "declar": 9, "catalog": [9, 10], "line": 9, "mapper": 9, "previous": [9, 10], "earth": [9, 10], "scienc": 9, "partner": 9, "2022": [9, 10], "winter": 9, "meet": 9, "free": [9, 10], "environ": 9, "lsterzing": 9, "esip": [9, 10], "intend": 10, "displai": 10, "pair": 10, "era5": 10, "replic": 10, "public": 10, "aw": 10, "phase": 10, "frequent": 10, "manual": 10, "upload": 10, "destin": 10, "flist": 10, "2020": 10, "air_pressure_at_mean_sea_level": 10, "sea_surface_temperatur": 10, "fs2": 10, "pathlib": 10, "rb": 10, "avoid": 10, "lower": 10, "gen_json": 10, "file_url": 10, "infil": 10, "300": 10, "adjust": 10, "higher": 10, "outf": 10, "wb": 10, "dramat": 10, "30": 10, "sy": 10, "74": 10, "35": 10, "wall": 10, "14min": 10, "44": 10, "anonym": 10, "01_air_pressure_at_mean_sea_level": 10, "print": 10, "time0": 10, "744": 10, "lat": 10, "721": 10, "lon": 10, "1440": 10, "float32": 10, "90": 10, "89": 10, "75": 10, "nan": 10, "25": 10, "359": 10, "datetime64": 10, "01": 10, "202": 10, "chunksiz": 10, "24": 10, "meta": 10, "ndarrai": 10, "institut": 10, "ecmwf": 10, "reanalysi": 10, "titl": 10, "forecast": 10, "162": 10, "17": 
10, "180": 10, "235": 10, "concat": 10, "json_list": 10, "_air_pressure_at_mean_sea_level": 10, "air_pressure_at_mean_sea_level_combin": 10, "reveal": 10, "span": 10, "backend_arg": 10, "42": 10, "128": 10, "\u00b5": 10, "40": 10, "new_dim": 10, "new_dimens": 10, "ex": 10, "_air": 10, "popul": 10, "custom": 10, "regular": 10, "express": 10, "cgl_toc_yyyymmddhhmm_x21y05_s3a_v1": 10, "fn_to_tim": 10, "subst": 10, "12": 10, "strptime": 10, "h": 10, "sort": 10, "iglob": 10, "3360": 10, "float64": 10, "34": 10, "99": 10, "39": 10, "11t07": 10, "00": 10, "53": 10, "ac_process_flag": 10, "168": 10, "oa02_toc": 10, "vza_olci": 10, "6": 10, "archive_facil": 10, "vito": 10, "copyright": 10, "copernicu": 10, "servic": 10, "similarli": 10, "02": 10, "29t23": 10, "shortnameecmwf": 10, "msl": 10, "01_sea_surface_temperatur": 10, "pre": 10, "pre_process": 10, "k": 10, "startswith": 10, "pop": 10, "vars_combin": 10, "02_sea_surface_temperatur": 10, "sea_surface_temperature_combin": 10, "696": 10, "29": 10, "post": 10, "fill_valu": 10, "manipul": 10, "modify_fill_valu": 10, "out_": 10, "999": 10, "referenc": 10, "43": 10, "year": 10, "sidecar": 10, "8gb": 10, "194mb": 10, "7gb": 10, "qhub": 10, "era5_2020_2022_multivar": 10, "zst": 10, "era5_1979_2022_multivar": 10, "380568": 10, "1979": 10, "air_temperature_at_2_metr": 10, "dew_point_temperature_at_2_metr": 10, "eastward_wind_at_100_metr": 10, "eastward_wind_at_10_metr": 10, "lwe_thickness_of_surface_snow_amount": 10, "northward_wind_at_100_metr": 10, "48": 10, "61": 10, "54": 10, "1min": 10, "reason": 10, "suggest": 10, "hide": 10, "open_catalog": 10, "intake_catalog": 10, "to_dask": 10, "shop": 10, "da": 10, "sel": 10, "2021": 10, "01t00": 10, "plot": 10, "79": 10, "382": 10, "18": 10, "22": 10, "198": 10, "slice": 10, "31": 10, "92": 10, "663": 10}, "objects": {"kerchunk.codecs": [[7, 0, 1, "", "AsciiTableCodec"], [7, 0, 1, "", "FillStringsCodec"], [7, 0, 1, "", "GRIBCodec"], [7, 0, 1, "", "RecordArrayMember"], [7, 0, 1, "", 
"VarArrCodec"]], "kerchunk.codecs.AsciiTableCodec": [[7, 1, 1, "", "__init__"]], "kerchunk.codecs.FillStringsCodec": [[7, 1, 1, "", "__init__"]], "kerchunk.codecs.GRIBCodec": [[7, 1, 1, "", "__init__"]], "kerchunk.codecs.RecordArrayMember": [[7, 1, 1, "", "__init__"]], "kerchunk.codecs.VarArrCodec": [[7, 1, 1, "", "__init__"]], "kerchunk.combine": [[7, 0, 1, "", "MultiZarrToZarr"], [7, 2, 1, "", "auto_dask"], [7, 2, 1, "", "concatenate_arrays"], [7, 2, 1, "", "drop"], [7, 2, 1, "", "merge_vars"]], "kerchunk.combine.MultiZarrToZarr": [[7, 1, 1, "", "__init__"], [7, 1, 1, "", "append"], [7, 1, 1, "", "translate"]], "kerchunk.df": [[7, 2, 1, "", "refs_to_dataframe"]], "kerchunk.fits": [[7, 2, 1, "", "process_file"]], "kerchunk.grib2": [[7, 2, 1, "", "scan_grib"]], "kerchunk.hdf": [[7, 0, 1, "", "SingleHdf5ToZarr"]], "kerchunk.hdf.SingleHdf5ToZarr": [[7, 1, 1, "", "translate"]], "kerchunk.netCDF3": [[7, 0, 1, "", "NetCDF3ToZarr"]], "kerchunk.netCDF3.NetCDF3ToZarr": [[7, 1, 1, "", "__init__"], [7, 1, 1, "", "translate"]], "kerchunk.tiff": [[7, 2, 1, "", "generate_coords"], [7, 2, 1, "", "tiff_to_zarr"]], "kerchunk.utils": [[7, 2, 1, "", "consolidate"], [7, 2, 1, "", "dereference_archives"], [7, 2, 1, "", "do_inline"], [7, 2, 1, "", "inline_array"], [7, 2, 1, "", "rename_target"], [7, 2, 1, "", "rename_target_files"], [7, 2, 1, "", "subchunk"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"]}, "titleterms": {"advanc": 0, "topic": 0, "us": [0, 6, 9, 10], "dask": 0, "simpl": 0, "parallel": 0, "tree": 0, "reduct": 0, "archiv": 0, "file": [0, 7, 9, 10], "parquet": [0, 6], "storag": 0, "beyond": 1, "python": [1, 3], "case": 2, "studi": 2, "sentinel": 2, "global": 2, "coher": 2, "solar": 2, "dynam": 2, "observatori": 2, "nation": 2, "water": 2, "model": 2, "mur": 2, "sst": 2, "hrrr": 2, "contribut": 3, "kerchunk": [3, 5, 
10], "bug": 3, "report": 3, "featur": 3, "request": 3, "code": 3, "creat": 3, "fork": 3, "develop": 3, "environ": 3, "set": 3, "up": 3, "pre": 3, "commit": 3, "branch": 3, "run": 3, "test": 3, "suit": 3, "document": 3, "chang": 3, "detail": 4, "descript": 4, "binari": 4, "buffer": 4, "fsspec": 4, "virtual": 4, "filesystem": 4, "zarr": [4, 6], "reader": 4, "introduct": 5, "content": 5, "indic": 5, "tabl": 5, "non": 6, "tar": 6, "zstd": 6, "csv": 6, "json": [6, 9, 10], "orc": 6, "feather": 6, "api": 7, "refer": [7, 8], "format": 7, "backend": 7, "codec": 7, "combin": [7, 10], "util": 7, "specif": 8, "version": 8, "0": 8, "1": 8, "quick": 9, "start": 9, "singl": [9, 10], "multi": 9, "output": [9, 10], "exampl": 9, "tutori": [9, 10], "notebook": 9, "multipl": 10, "dataset": 10, "logic": 10, "aggreg": 10, "coo_map": 10, "merg": 10, "variabl": 10, "across": 10, "preprocess": 10, "postprocess": 10}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 60}, "alltitles": {"Advanced Topics": [[0, "advanced-topics"]], "Using Dask": [[0, "using-dask"]], "Simple parallel": [[0, "simple-parallel"]], "Tree reduction": [[0, "tree-reduction"]], "Archive Files": [[0, "archive-files"]], "Parquet Storage": [[0, "parquet-storage"]], "Beyond Python": [[1, "beyond-python"]], "Case studies": [[2, "case-studies"]], "Sentinel Global coherence": [[2, "sentinel-global-coherence"]], "Solar Dynamics Observatory": [[2, "solar-dynamics-observatory"]], "National Water Model": [[2, "national-water-model"]], "MUR SST": [[2, "mur-sst"]], "HRRR": [[2, "hrrr"]], "Contributing to kerchunk": [[3, "contributing-to-kerchunk"]], "Bug reports and feature requests": [[3, "bug-reports-and-feature-requests"]], "Contributing code": [[3, 
"contributing-code"]], "Creating a Fork": [[3, "creating-a-fork"]], "Creating a development environment": [[3, "creating-a-development-environment"]], "Creating a Python Environment": [[3, "creating-a-python-environment"]], "Setting up pre-commit": [[3, "setting-up-pre-commit"]], "Creating a branch": [[3, "creating-a-branch"]], "Running the test suite": [[3, "running-the-test-suite"]], "Contributing documentation": [[3, "contributing-documentation"]], "Contributing changes": [[3, "contributing-changes"]], "Detailed description": [[4, "detailed-description"]], "Binary buffers": [[4, "binary-buffers"]], "fsspec virtual filesystem": [[4, "fsspec-virtual-filesystem"]], "zarr reader": [[4, "zarr-reader"]], "kerchunk": [[5, "kerchunk"]], "Introduction": [[5, "introduction"]], "Contents:": [[5, null]], "Indices and tables": [[5, "indices-and-tables"]], "Non-zarr uses": [[6, "non-zarr-uses"]], ".tar.zstd": [[6, "tar-zstd"]], ".csv/.json": [[6, "csv-json"]], "parquet/orc/feather": [[6, "parquet-orc-feather"]], "API Reference": [[7, "api-reference"]], "File format backends": [[7, "file-format-backends"]], "Codecs": [[7, "codecs"]], "Combining": [[7, "combining"]], "Utilities": [[7, "utilities"]], "References specification": [[8, "references-specification"]], "Version 0": [[8, "version-0"]], "Version 1": [[8, "version-1"]], "Quick Start": [[9, "quick-start"]], "Single file JSONs": [[9, "single-file-jsons"], [10, "single-file-jsons"]], "Multi-file JSONs": [[9, "multi-file-jsons"]], "Using the output": [[9, "using-the-output"], [10, "using-the-output"]], "Example/Tutorial Notebook": [[9, "example-tutorial-notebook"]], "Tutorial": [[10, "tutorial"]], "Combine multiple kerchunked datasets into a single logical aggregate dataset": [[10, "combine-multiple-kerchunked-datasets-into-a-single-logical-aggregate-dataset"]], "Using coo_map": [[10, "using-coo-map"]], "Merging variables across jsons": [[10, "merging-variables-across-jsons"]], "Preprocessing": [[10, "preprocessing"]], 
"Postprocessing": [[10, "postprocessing"]]}, "indexentries": {"asciitablecodec (class in kerchunk.codecs)": [[7, "kerchunk.codecs.AsciiTableCodec"]], "fillstringscodec (class in kerchunk.codecs)": [[7, "kerchunk.codecs.FillStringsCodec"]], "gribcodec (class in kerchunk.codecs)": [[7, "kerchunk.codecs.GRIBCodec"]], "multizarrtozarr (class in kerchunk.combine)": [[7, "kerchunk.combine.MultiZarrToZarr"]], "netcdf3tozarr (class in kerchunk.netcdf3)": [[7, "kerchunk.netCDF3.NetCDF3ToZarr"]], "recordarraymember (class in kerchunk.codecs)": [[7, "kerchunk.codecs.RecordArrayMember"]], "singlehdf5tozarr (class in kerchunk.hdf)": [[7, "kerchunk.hdf.SingleHdf5ToZarr"]], "vararrcodec (class in kerchunk.codecs)": [[7, "kerchunk.codecs.VarArrCodec"]], "__init__() (kerchunk.codecs.asciitablecodec method)": [[7, "kerchunk.codecs.AsciiTableCodec.__init__"]], "__init__() (kerchunk.codecs.fillstringscodec method)": [[7, "kerchunk.codecs.FillStringsCodec.__init__"]], "__init__() (kerchunk.codecs.gribcodec method)": [[7, "kerchunk.codecs.GRIBCodec.__init__"]], "__init__() (kerchunk.codecs.recordarraymember method)": [[7, "kerchunk.codecs.RecordArrayMember.__init__"]], "__init__() (kerchunk.codecs.vararrcodec method)": [[7, "kerchunk.codecs.VarArrCodec.__init__"]], "__init__() (kerchunk.combine.multizarrtozarr method)": [[7, "kerchunk.combine.MultiZarrToZarr.__init__"]], "__init__() (kerchunk.netcdf3.netcdf3tozarr method)": [[7, "kerchunk.netCDF3.NetCDF3ToZarr.__init__"]], "append() (kerchunk.combine.multizarrtozarr class method)": [[7, "kerchunk.combine.MultiZarrToZarr.append"]], "auto_dask() (in module kerchunk.combine)": [[7, "kerchunk.combine.auto_dask"]], "concatenate_arrays() (in module kerchunk.combine)": [[7, "kerchunk.combine.concatenate_arrays"]], "consolidate() (in module kerchunk.utils)": [[7, "kerchunk.utils.consolidate"]], "dereference_archives() (in module kerchunk.utils)": [[7, "kerchunk.utils.dereference_archives"]], "do_inline() (in module kerchunk.utils)": [[7, 
"kerchunk.utils.do_inline"]], "drop() (in module kerchunk.combine)": [[7, "kerchunk.combine.drop"]], "generate_coords() (in module kerchunk.tiff)": [[7, "kerchunk.tiff.generate_coords"]], "inline_array() (in module kerchunk.utils)": [[7, "kerchunk.utils.inline_array"]], "merge_vars() (in module kerchunk.combine)": [[7, "kerchunk.combine.merge_vars"]], "process_file() (in module kerchunk.fits)": [[7, "kerchunk.fits.process_file"]], "refs_to_dataframe() (in module kerchunk.df)": [[7, "kerchunk.df.refs_to_dataframe"]], "rename_target() (in module kerchunk.utils)": [[7, "kerchunk.utils.rename_target"]], "rename_target_files() (in module kerchunk.utils)": [[7, "kerchunk.utils.rename_target_files"]], "scan_grib() (in module kerchunk.grib2)": [[7, "kerchunk.grib2.scan_grib"]], "subchunk() (in module kerchunk.utils)": [[7, "kerchunk.utils.subchunk"]], "tiff_to_zarr() (in module kerchunk.tiff)": [[7, "kerchunk.tiff.tiff_to_zarr"]], "translate() (kerchunk.combine.multizarrtozarr method)": [[7, "kerchunk.combine.MultiZarrToZarr.translate"]], "translate() (kerchunk.hdf.singlehdf5tozarr method)": [[7, "kerchunk.hdf.SingleHdf5ToZarr.translate"]], "translate() (kerchunk.netcdf3.netcdf3tozarr method)": [[7, "kerchunk.netCDF3.NetCDF3ToZarr.translate"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["advanced", "beyond", "cases", "contributing", "detail", "index", "nonzarr", "reference", "spec", "test_example", "tutorial"], "filenames": ["advanced.rst", "beyond.rst", "cases.rst", "contributing.rst", "detail.rst", "index.rst", "nonzarr.rst", "reference.rst", "spec.rst", "test_example.rst", "tutorial.rst"], "titles": ["Advanced Topics", "Beyond Python", "Case studies", "Contributing to kerchunk", "Detailed description", "kerchunk", "Non-zarr uses", "API Reference", "References specification", "Quick Start", "Tutorial"], "terms": {"scan": [0, 4, 6, 9], "combin": [0, 3, 5, 8, 9], "dataset": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "can": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10], 
"computation": 0, "intens": [0, 4], "mai": [0, 3, 6, 7, 8], "requir": [0, 1, 2, 3, 7, 8, 10], "lot": [0, 6], "bandwidth": 0, "some": [0, 1, 4, 5, 6, 7, 9], "data": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10], "format": [0, 1, 2, 4, 5, 6, 8], "where": [0, 3, 4, 5, 6, 7, 8, 9, 10], "target": [0, 1, 7], "contain": [0, 3, 7, 8, 10], "mani": [0, 2, 3, 4, 5, 7, 8], "input": [0, 7, 8, 10], "make": [0, 3, 4, 6, 8], "sens": 0, "parallelis": 0, "job": 0, "mayb": [0, 1, 7], "distribut": [0, 4, 6, 7], "workload": [0, 4], "cluster": [0, 4, 7], "get": [0, 4, 6, 7], "addit": [0, 5, 9], "cpu": [0, 4, 10], "network": [0, 5], "perform": [0, 6, 7], "The": [0, 1, 2, 3, 4, 5, 6, 7, 8, 10], "simplest": [0, 10], "case": [0, 4, 5, 6, 7, 9, 10], "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "process": [0, 4, 5, 6, 7, 10], "individu": [0, 7, 9, 10], "let": 0, "": [0, 3, 6, 7, 8, 10], "sai": 0, "you": [0, 1, 3, 4, 5, 7, 9], "have": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10], "list": [0, 2, 7, 10], "need": [0, 1, 2, 3, 4, 5, 6, 7, 9], "encapsul": 0, "each": [0, 1, 2, 4, 7, 8, 9, 10], "singl": [0, 2, 4, 5, 7], "function": [0, 3, 7, 10], "In": [0, 3, 4, 5, 6, 9, 10], "thi": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10], "mode": [0, 8, 10], "typic": [0, 10], "save": [0, 6, 7, 9, 10], "output": [0, 5, 7], "although": [0, 7, 8], "return": [0, 3, 7, 8, 10], "them": [0, 2, 3, 4, 5, 6, 7, 9, 10], "ok": [0, 7], "too": [0, 5, 8], "especi": 0, "mean": [0, 4, 5, 6, 7, 9], "immedi": 0, "here": [0, 2, 3, 6, 8, 10], "an": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10], "exampl": [0, 1, 2, 3, 4, 5, 7, 8, 10], "hdf5": [0, 2, 4, 5, 7, 9], "caller": 0, "should": [0, 1, 3, 6, 7, 8, 9, 10], "sure": [0, 3], "option": [0, 7, 8], "ani": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10], "paramet": [0, 7], "transform": 0, "ar": [0, 2, 3, 4, 5, 6, 7, 8, 10], "place": [0, 4, 7, 10], "import": [0, 3, 9, 10], "ujson": [0, 7, 10], "fsspec": [0, 2, 3, 5, 7, 8, 9, 10], "def": [0, 10], "url": [0, 1, 4, 7, 8, 9, 10], "outputfil": 0, "storage_options_in": [0, 7], "storage_options_out": [0, 7], 
"kerchunk": [0, 2, 4, 7, 9], "hdf": [0, 4, 5, 7, 9, 10], "singlehdf5tozarr": [0, 7, 9, 10], "ref": [0, 7, 8, 10], "translat": [0, 5, 7, 9, 10], "open": [0, 3, 7, 9, 10], "wt": 0, "f": [0, 7, 8, 10], "dump": [0, 10], "task": 0, "delai": 0, "u": [0, 2, 4, 8, 9, 10], "o": [0, 10], "zip": [0, 7], "infilenam": 0, "outfilenam": 0, "comput": [0, 5, 10], "itself": [0, 8], "slow": [0, 8], "memori": [0, 2, 5, 7, 8, 9, 10], "hungri": 0, "refer": [0, 1, 2, 3, 4, 5, 9, 10], "set": [0, 1, 7, 8, 9], "batch": [0, 7], "which": [0, 3, 4, 6, 7, 8, 9, 10], "reduc": [0, 7], "redund": 0, "between": [0, 3, 10], "result": [0, 7, 9, 10], "techniqu": [0, 4], "known": [0, 6, 7, 10], "do": [0, 4, 7, 9], "hand": 0, "seen": 0, "we": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10], "also": [0, 2, 5, 7, 8, 10], "provid": [0, 5, 7, 8, 9, 10], "auto_dask": [0, 7], "conveni": [0, 10], "one": [0, 3, 4, 5, 6, 7, 8, 9, 10], "stop": [0, 7, 8, 10], "call": [0, 7], "those": [0, 4], "final": [0, 7, 10], "take": [0, 3, 4, 5, 8, 10], "number": [0, 4, 5, 7, 8, 10], "dict": [0, 7, 8, 9, 10], "argument": [0, 7, 9, 10], "user": [0, 3, 5, 7, 9, 10], "consult": 0, "docstr": 0, "specif": [0, 1, 3, 4, 5, 7, 10], "class": [0, 7, 8], "decod": [0, 5, 7, 8], "multizarrtozarr": [0, 7, 9, 10], "note": [0, 2, 7, 8, 9], "preprocess": [0, 7], "befor": [0, 3, 7, 10], "stage": [0, 7], "postprocess": [0, 7], "onli": [0, 2, 3, 4, 5, 6, 7, 10], "after": [0, 3, 4, 7], "It": [0, 1, 3, 5, 7, 8, 10], "often": [0, 5, 6, 7], "wrap": [0, 7], "multipl": [0, 4, 5, 7, 8, 9], "tar": [0, 5, 7], "If": [0, 3, 4, 6, 7, 8], "support": [0, 4, 5, 10], "thei": [0, 3, 4, 6, 7, 8, 10], "directli": [0, 4, 7, 10], "someth": 0, "like": [0, 1, 4, 5, 6, 7, 8, 10], "netcdf3": [0, 7, 10], "netcdf3tozarr": [0, 7], "myfil": 0, "nc": [0, 8, 10], "inline_threshold": [0, 7, 9, 10], "0": [0, 5, 7, 9, 10], "out": [0, 6, 7, 8, 9, 10], "member": [0, 7], "local": [0, 3, 5, 6, 7, 10], "turn": [0, 7], "off": [0, 3], "inlin": [0, 7, 10], "done": [0, 6, 7, 9], "later": 0, "util": [0, 
5], "do_inlin": [0, 7], "come": [0, 8], "At": 0, "point": [0, 3, 7, 9], "gener": [0, 2, 7, 9, 10], "problemat": 0, "load": [0, 1, 5, 7, 8, 10], "so": [0, 2, 3, 4, 5, 6, 7, 9, 10], "rang": [0, 5, 7], "origin": [0, 3, 4, 5, 7, 10], "instead": [0, 7, 8, 10], "back": 0, "nomin": 0, "form": [0, 8, 10], "readi": [0, 3], "autom": 0, "step": [0, 3, 7, 8, 9], "futur": [0, 8], "out2": 0, "dereference_arch": [0, 7], "100": [0, 7, 8, 9, 10], "consolid": [0, 5, 7, 8, 9, 10], "now": [0, 3, 5, 8, 10], "all": [0, 2, 4, 5, 6, 7, 8, 9, 10], "For": [0, 3, 4, 5, 6, 7, 8, 10], "uncompress": [0, 7], "access": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10], "wai": [0, 5, 7, 10], "json": [0, 1, 4, 5, 7, 8], "veri": [0, 5, 6], "becaus": [0, 6, 7, 8, 10], "human": 0, "readabl": 0, "ubiquit": 0, "howev": [0, 6], "most": 0, "effici": [0, 4, 5, 6, 7], "term": [0, 6], "size": [0, 1, 2, 4, 7, 8, 10], "pars": [0, 6], "speed": 0, "python": [0, 4, 5, 7, 8, 10], "particular": [0, 1, 4, 6, 10], "ad": [0, 3], "downsid": 0, "repeat": 0, "string": [0, 6, 7, 8, 10], "becom": [0, 7, 8], "separ": [0, 2, 5, 7], "instanc": [0, 4, 6, 7, 8, 10], "greatli": 0, "inflat": 0, "footprint": [0, 7], "time": [0, 2, 3, 4, 7, 9, 10], "To": [0, 1, 3, 5, 7, 10], "overcom": 0, "problem": [0, 5], "keep": [0, 3, 7, 9], "down": 0, "end": [0, 3, 6], "convert": [0, 7], "store": [0, 1, 3, 4, 5, 6, 7, 8, 10], "implement": [0, 1, 7, 8], "referencefilesystem": [0, 4, 8], "altern": [0, 8, 9, 10], "new": [0, 3, 5, 7, 8, 10], "design": [0, 4, 5], "work": [0, 2, 3, 5, 7, 8], "principl": 0, "benefit": 0, "path": [0, 3, 4, 7, 8, 9, 10], "much": [0, 5, 6], "more": [0, 1, 3, 4, 5, 6, 7, 8, 10], "compact": 0, "2x": 0, "smaller": [0, 7, 10], "than": [0, 4, 7, 8], "compress": [0, 4, 5, 6, 8, 10], "10x": 0, "correspondingli": 0, "faster": [0, 10], "instanti": 0, "filesystem": [0, 5, 7, 9, 10], "sinc": [0, 6, 8, 9], "taken": [0, 4, 7, 10], "byte": [0, 4, 5, 7, 8], "e": [0, 3, 4, 5, 7, 8], "g": [0, 4, 5, 7], "int": [0, 7, 8], "28": 0, "arrai": [0, 4, 5, 6, 7, 
8, 10], "4": [0, 4, 8, 10], "8": [0, 3, 10], "lazi": [0, 7, 10], "partit": [0, 6], "kei": [0, 1, 7, 8, 10], "variabl": [0, 2, 4, 7, 8], "actual": [0, 6, 7], "dictionari": [0, 7, 10], "encod": [0, 1, 4, 5, 7, 10], "per": [0, 4, 7], "uniqu": [0, 7, 10], "onc": [0, 3, 4, 6, 10], "df": [0, 7], "refs_to_datafram": [0, 7], "exist": [0, 5, 7, 8], "care": 0, "read": [0, 4, 5, 6, 7, 9], "recommend": [0, 7, 8, 9, 10], "understand": [0, 5], "oper": [0, 4, 10], "therefor": 0, "larg": [0, 3, 4, 5, 6, 7], "merg": [0, 3, 7], "abl": [0, 3, 4, 6], "via": [0, 1, 5, 6], "A": [0, 9, 10], "concret": 0, "workflow": [0, 7, 10], "follow": [0, 5, 8, 9, 10], "execut": [0, 7], "first": [0, 3, 4, 6, 7, 8, 9, 10], "three": [0, 8], "go": [0, 5], "avail": [0, 2, 7, 10], "from": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10], "lazyreferencemapp": [0, 7, 8], "tempfil": 0, "temporarydirectori": 0, "xarrai": [0, 3, 9, 10], "xr": [0, 9, 10], "location_of_data": 0, "creat": [0, 2, 4, 5, 7, 8, 9, 10], "pass": [0, 4, 7, 8, 10], "makedir": 0, "parq": [0, 8], "record_s": [0, 7, 8], "1000": [0, 7, 8], "root": [0, 7], "single_ref_set": 0, "_": [0, 3, 8, 10], "out_dict": 0, "remote_protocol": [0, 7, 9, 10], "s3": [0, 2, 5, 8, 9, 10], "concat_dim": [0, 7, 9, 10], "remote_opt": [0, 7, 9, 10], "anon": [0, 9, 10], "true": [0, 4, 7, 9, 10], "flush": 0, "target_protocol": [0, 7], "d": [0, 7, 8, 9, 10], "open_dataset": [0, 9, 10], "get_mapp": [0, 10], "engin": [0, 9, 10], "zarr": [0, 1, 5, 7, 8, 9, 10], "backend_kwarg": [0, 9, 10], "fals": [0, 7, 9, 10], "ha": [0, 3, 6, 7, 8, 10], "metadata": [0, 2, 4, 5, 6, 7, 8], "coordin": [0, 5, 7, 10], "main": [0, 2, 3, 7], "correspond": [0, 8], "been": [0, 7, 8, 10], "touch": 0, "even": [0, 3, 6, 9, 10], "500mb": 0, "As": 0, "demand": 0, "cach": [0, 10], "worker": [0, 7], "file": [1, 2, 3, 4, 5, 6, 8], "current": [1, 3, 5, 7, 10], "readili": [1, 4], "interpret": [1, 3, 7, 8], "languag": [1, 4], "document": [1, 2, 5, 6, 10], "detail": [1, 5], "content": [1, 6, 7, 8], "either": [1, 3, 10], 
"binari": [1, 5, 6, 7, 8, 10], "offset": [1, 6, 8], "long": [1, 3, 6], "being": [1, 3, 7], "us": [1, 2, 3, 4, 5, 7, 8], "type": [1, 3, 4, 5, 7, 10], "block": [1, 4, 5, 6, 7, 8, 10], "part": [1, 3, 4, 6], "well": [1, 8], "whichev": 1, "codec": [1, 4, 5], "noth": 1, "plain": 1, "common": [1, 7], "compressor": [1, 6, 8], "gzip": 1, "might": [1, 4, 8, 9], "would": [1, 7, 8, 9, 10], "write": [1, 4, 7, 9, 10], "code": [1, 2, 5, 8, 9, 10], "expos": [1, 3], "storag": [1, 4, 5, 6, 7, 8, 10], "object": [1, 5, 7, 8, 10], "One": 1, "j": 1, "appli": [1, 5, 7, 10], "multi": [1, 5], "scale": 1, "tiff": [1, 5, 7, 10], "microscopi": 1, "found": [1, 3, 7, 9], "http": [1, 2, 3, 4, 5, 8, 9, 10], "observablehq": 1, "com": [1, 2, 3, 9, 10], "manzt": 1, "om": 1, "filesystemrefer": 1, "complet": [2, 4, 5], "reproduc": 2, "made": [2, 4, 7], "link": [2, 9], "possibli": 2, "notebook": [2, 5], "benchmark": 2, "page": [2, 3, 5], "progress": 2, "repo": [2, 3, 7], "intak": [2, 9, 10], "catalogu": 2, "clean": 2, "up": [2, 4, 8], "standard": [2, 7], "nativ": 2, "geotiff": [2, 7], "effect": [2, 3, 5, 6, 8], "400tb": 2, "1": [2, 5, 7, 10], "earthbigdata": 2, "websit": 2, "west": [2, 10], "2": [2, 7, 8, 10], "amazonaw": 2, "discuss": 2, "github": [2, 3, 9, 10], "issu": [2, 3, 10], "78": 2, "script": [2, 10], "cgohlk": 2, "tifffil": [2, 7], "blob": 2, "v2021": 2, "10": [2, 7, 10], "py": [2, 3], "nbviewer": 2, "org": [2, 3], "ipynb": 2, "fit": [2, 4, 5, 7, 10], "400gb": 2, "wavelength": 2, "filter": [2, 7], "present": [2, 4, 6, 9], "date": [2, 3], "ob": 2, "nearest": 2, "preced": 2, "94a": 2, "imag": [2, 4], "other": [2, 4, 5, 6, 7, 8, 10], "maintain": 2, "axi": [2, 7], "sdo": 2, "netcdf4": [2, 4, 5, 9, 10], "80tb": 2, "dask": [2, 4, 5, 7, 10], "tee": 2, "reduct": 2, "were": [2, 4, 6, 7], "aggreg": [2, 4, 5, 7], "gist": 2, "rsignel": 2, "usg": 2, "ef435a53ac530a2843ce7e1d59f96e22": 2, "02da7d9257b4b26d84d053be1af2ceeb": 2, "66tb": 2, "On": 2, "disk": [2, 4, 8], "16tb": 2, "podaac": 2, "jpl": 2, "nasa": 
2, "gov": 2, "l4": 2, "glob": [2, 10], "v4": 2, "cgentemann": 2, "cloud_scienc": 2, "master": [2, 3], "zarr_meta": 2, "cloud_mur_v41_benchmark": 2, "sea": 2, "surfac": 2, "temperatur": 2, "includ": [2, 5, 6, 7, 8, 10], "see": [2, 3, 4, 7, 9], "how": [2, 3, 4, 5, 7, 9], "establish": 2, "earthdata": 2, "credenti": [2, 3, 10], "necessari": [2, 3, 10], "grib2": [2, 5, 7, 10], "5gb": 2, "11": 2, "subset": 2, "rapidrefresh": 2, "noaa": [2, 9], "peterm790": 2, "92eb1df3d58ba41d3411f8a840be2452": 2, "high": 2, "resolut": 2, "rapid": 2, "refresh": 2, "real": [2, 7], "3": [2, 3, 8, 10], "km": 2, "hourli": 2, "updat": [2, 3, 7], "cloud": [2, 4, 5, 9], "resolv": [2, 3], "convect": 2, "allow": [2, 4, 5, 6, 7, 10], "atmospher": 2, "extract": [2, 4, 5, 7, 10], "section": [2, 8], "match": [2, 8, 10], "heightaboveground": [2, 7], "base": [3, 9], "guid": 3, "panda": [3, 7], "xbatcher": 3, "activ": 3, "break": 3, "introduc": 3, "its": [3, 8, 10], "state": 3, "possibl": [3, 4, 5, 6, 8, 10], "experienc": 3, "build": 3, "indic": [3, 6, 8], "todai": 3, "continu": 3, "usual": [3, 7], "don": 3, "t": 3, "instal": [3, 9, 10], "project": [3, 4], "submit": 3, "repositori": 3, "git": [3, 10], "version": [3, 5, 7], "control": 3, "track": 3, "instruct": 3, "setup": 3, "your": 3, "next": [3, 9], "navig": 3, "click": 3, "button": 3, "top": [3, 7, 8], "right": 3, "own": 3, "want": [3, 4, 7], "clone": 3, "onto": 3, "machin": [3, 4, 10], "termin": [3, 6], "command": [3, 9], "prompt": 3, "yourusernam": 3, "cd": 3, "remot": [3, 7, 8, 9, 10], "add": [3, 7, 8], "upstream": 3, "directori": [3, 4, 6, 7], "connect": 3, "sourc": [3, 5, 7, 10], "ensur": 3, "same": [3, 4, 6, 7, 10], "everyon": 3, "els": 3, "virtual": [3, 5, 6, 9, 10], "isol": 3, "depend": [3, 10], "anaconda": 3, "miniconda": 3, "conda": 3, "solv": [3, 5], "try": 3, "mamba": 3, "mirror": 3, "written": [3, 6, 7, 9], "c": [3, 5, 7, 8], "ll": 3, "env": 3, "name": [3, 4, 6, 7, 8, 10], "ci": 3, "py3": 3, "yml": [3, 10], "correct": [3, 4], "home": 3, 
"pip": [3, 10], "wa": [3, 4, 5, 9], "success": 3, "start": [3, 5, 6, 7, 8], "__version__": 3, "view": [3, 5], "info": 3, "deactiv": 3, "full": [3, 7], "doc": 3, "manag": [3, 10], "lint": 3, "style": [3, 7], "whenev": 3, "edit": [3, 10], "re": [3, 7, 10], "reflect": 3, "product": [3, 7, 8], "shini": 3, "checkout": 3, "abov": [3, 7, 9, 10], "simplifi": 3, "b": [3, 8, 10], "clear": 3, "what": [3, 7, 9], "bring": 3, "switch": 3, "retriev": 3, "fetch": [3, 5, 7], "latest": 3, "lead": [3, 4, 6], "conflict": 3, "must": [3, 7, 8], "pull": [3, 6], "uncommit": 3, "stash": 3, "prior": 3, "reappli": 3, "pytest": 3, "framework": [3, 4], "ideal": 3, "coverag": 3, "improv": 3, "appreci": 3, "within": [3, 4, 5, 7, 8], "restructur": 3, "text": [3, 6, 8], "rst": 3, "similar": [3, 8], "markdown": 3, "These": [3, 7, 10], "built": 3, "html": 3, "sphinx": 3, "www": 3, "en": 3, "packag": [3, 9, 10], "m": [3, 10], "r": [3, 10], "txt": 3, "feel": 3, "good": [3, 6], "about": [3, 4, 5], "statu": 3, "messag": [3, 7], "when": [3, 6, 7, 8, 10], "appear": [3, 7], "publicli": 3, "push": 3, "default": [3, 7, 8, 10], "given": [3, 4, 6, 7, 8], "v": 3, "banner": 3, "demonstr": [4, 10], "librari": [4, 5, 7, 9], "parallel": [4, 5, 6, 9], "friendli": [4, 5], "motiv": 4, "attempt": [4, 7], "alreadi": [4, 7, 10], "direct": [4, 5, 6], "whether": [4, 6, 7], "over": [4, 5], "nevertheless": 4, "abil": [4, 6], "arrang": [4, 5], "chunk": [4, 5, 7, 8, 9, 10], "concurr": [4, 5], "index": [4, 5, 6, 7, 8, 10], "power": [4, 5], "action": [4, 7], "thread": 4, "independ": 4, "overheard": 4, "system": [4, 5, 6, 7, 10], "account": 4, "speedup": 4, "equal": [4, 7, 8], "core": 4, "leverag": 4, "contrast": 4, "mutipl": 4, "wait": 4, "extern": 4, "latenc": [4, 5, 10], "compar": 4, "domin": [4, 8], "total": [4, 7, 10], "request": [4, 5, 7], "launch": 4, "pai": 4, "overhead": 4, "cost": [4, 5, 6], "togeth": [4, 7], "both": [4, 6, 7, 8, 10], "simultan": 4, "numer": 4, "modern": [4, 6], "int32": 4, "valu": [4, 6, 7, 8, 10], 
"n": [4, 8, 10], "element": [4, 8], "nx4": 4, "remain": [4, 7], "dimens": [4, 5, 7, 8, 10], "cannot": [4, 8], "represent": 4, "perhap": 4, "through": [4, 7, 9, 10], "doe": [4, 5, 8, 9], "almost": 4, "area": 4, "describ": [4, 5, 8], "relat": 4, "anoth": [4, 10], "attribut": [4, 5, 7, 8, 10], "associ": [4, 8], "backend": [4, 5], "interfac": [4, 8], "could": [4, 7, 8, 9, 10], "ingest": 4, "anywher": [4, 9], "idea": 4, "arbitrari": [4, 5, 8, 10], "map": [4, 7, 10], "pathnam": 4, "static": 4, "whole": [4, 6, 7, 8], "differ": [4, 5, 7, 10], "thu": [4, 10], "assign": [4, 6, 10], "posit": 4, "tree": [4, 7], "two": [4, 6, 7, 8, 10], "small": [4, 8], "special": [4, 7, 10], "zarrai": [4, 8], "zattr": [4, 8], "give": [4, 7], "inform": [4, 5, 7, 8, 9, 10], "shape": [4, 7, 8], "themselv": 4, "expect": [4, 7, 8], "live": [4, 7, 9], "var": [4, 7, 10], "x": [4, 8], "y": [4, 10], "across": [4, 5, 7], "construct": [4, 6, 9], "exact": [4, 7], "filenam": [4, 7], "unifi": 5, "repres": [5, 8], "varieti": [5, 10], "netcdf": [5, 7], "tradit": 5, "flexibl": 5, "potenti": 5, "situ": 5, "without": [5, 6, 10], "copi": [5, 6, 7], "gatewai": 5, "massiv": 5, "while": 5, "still": [5, 6, 10], "insist": 5, "legaci": 5, "archiv": [5, 6, 7], "why": 5, "thing": 5, "serverless": 5, "architectur": 5, "plu": 5, "physic": 5, "gc": 5, "abf": 5, "alibaba": 5, "dropbox": 5, "gdrive": 5, "protocol": [5, 7, 8], "ftp": 5, "ssh": 5, "smb": 5, "variou": 5, "heterogen": 5, "driver": [5, 7], "h5py": [5, 9], "asynchron": 5, "amort": 5, "lock": 5, "logic": [5, 6, 7], "million": 5, "subselect": 5, "scientif": 5, "datatyp": 5, "amount": 5, "observ": 5, "simul": 5, "handl": 5, "download": [5, 6], "era": 5, "answer": 5, "move": [5, 6], "collect": 5, "premis": 5, "hard": [5, 6], "ineffici": 5, "seamlessli": 5, "essenti": 5, "involv": [5, 9], "buffer": [5, 6], "disc": 5, "descript": [5, 6, 7, 8], "wide": 5, "primari": [5, 7], "purpos": 5, "find": [5, 8, 10], "pleas": 5, "Or": 5, "consid": [5, 8, 10], "pydata": 5, "talk": 5, 
"quick": 5, "tutori": 5, "reader": [5, 7], "studi": 5, "sentinel": 5, "global": [5, 10], "coher": 5, "solar": 5, "dynam": 5, "observatori": 5, "nation": 5, "water": 5, "model": 5, "mur": 5, "sst": 5, "hrrr": 5, "parquet": [5, 7], "beyond": 5, "non": [5, 7, 8], "zstd": [5, 8, 10], "csv": 5, "orc": 5, "feather": 5, "api": [5, 6, 10], "contribut": 5, "bug": 5, "report": 5, "featur": [5, 6, 8], "chang": [5, 7, 10], "advanc": 5, "topic": 5, "modul": [5, 10], "search": [5, 6, 10], "major": 6, "focus": 6, "applic": 6, "structur": [6, 7, 8], "piec": 6, "yet": [6, 7], "develop": [6, 10], "concaten": [6, 7, 10], "space": 6, "transfer": 6, "upon": 6, "happen": [6, 7], "unpack": 6, "simpl": [6, 7, 10], "enough": [6, 8], "wise": 6, "best": 6, "world": 6, "achiev": [6, 10], "sore": 6, "subsequ": 6, "row": [6, 8], "newlin": 6, "charact": [6, 10], "random": 6, "record": [6, 7], "quot": 6, "know": [6, 10], "inde": 6, "misidentif": 6, "failur": 6, "just": [6, 7, 8, 10], "determin": 6, "safe": 6, "embed": [6, 7], "tabular": [6, 7], "sometim": 6, "low": 6, "cardin": 6, "easier": 6, "exclud": 6, "unneed": 6, "month": [6, 10], "001": 6, "column": [6, 7], "wish": [6, 7], "smart": 6, "around": 6, "raw": [6, 7, 8], "serialis": [6, 7], "arrow": 6, "pyarrow": 6, "h5f": [7, 10], "binaryio": 7, "str": [7, 8], "none": 7, "spec": [7, 8], "500": 7, "storage_opt": [7, 9, 10], "error": 7, "warn": 7, "vlen_encod": 7, "emb": 7, "group": [7, 10], "duck": 7, "adher": 7, "uri": 7, "produc": [7, 8], "readm": 7, "zero": [7, 8], "neg": 7, "disabl": 7, "pdb": 7, "ignor": 7, "rais": 7, "null": [7, 8], "leav": 7, "vlen": 7, "16byte": 7, "garbag": 7, "id": [7, 10], "unaffect": 7, "empti": [7, 8], "tabl": 7, "few": 7, "suppli": 7, "fill": [7, 10], "customis": 7, "method": [7, 10], "entri": [7, 8], "No": 7, "scan_grib": 7, "skip": 7, "locat": [7, 8, 10], "common_var": 7, "depr": 7, "keyword": 7, "cf": [7, 10], "typeoflevel": 7, "level": [7, 8], "process_fil": 7, "extens": 7, "primary_attr_to_group": 7, "ordin": 
7, "integ": [7, 8], "bool": 7, "tiff_to_zarr": 7, "urlpath": 7, "target_opt": [7, 9, 10], "writer": 7, "max_chunk_s": 7, "kwarg": 7, "scipi": 7, "behav": 7, "valid": [7, 10], "netcdf2": 7, "test": 7, "__init__": 7, "below": [7, 9, 10], "big": [7, 8, 10], "trigger": 7, "subchunk": 7, "never": 7, "000byte": 7, "6000": 7, "split": [7, 8, 9, 10], "biggest": 7, "tbc": 7, "arg": [7, 10], "superclass": 7, "io": 7, "netcdf_fil": 7, "gribcodec": 7, "dtype": [7, 8, 10], "grib": 7, "stream": 7, "eccod": 7, "asciitablecodec": 7, "indtyp": 7, "outdtyp": 7, "ascii": [7, 8], "field": [7, 8], "fillstringscodec": 7, "id_map": 7, "fix": 7, "length": [7, 8], "valul": 7, "opaqu": 7, "16": [7, 10], "vararrcodec": 7, "dt_in": 7, "dt_out": 7, "nrow": 7, "bintabl": 7, "recordarraymemb": 7, "compon": 7, "complex": [7, 10], "desir": 7, "subarrai": 7, "overal": 7, "parsabl": 7, "np": [7, 10], "indict": 7, "coo_map": 7, "coo_dtyp": [7, 10], "identical_dim": [7, 10], "assicu": 7, "expand": [7, 8], "selector": 7, "varnam": 7, "look": 7, "constant": 7, "compil": [7, 10], "regex": [7, 10], "pattern": [7, 10], "exactli": 7, "begin": 7, "attr": [7, 10], "vattr": [7, 10], "far": 7, "appropri": 7, "cftime": 7, "datetim": [7, 10], "automat": 7, "unless": 7, "specifi": [7, 10], "m8": [7, 10], "convers": 7, "beforehand": 7, "signatur": 7, "fn": [7, 10], "counter": 7, "probe": 7, "coerc": 7, "otherwis": [7, 8], "numpi": 7, "vari": [7, 10], "callabl": 7, "act": 7, "drop": [7, 10], "append": [7, 9], "rather": [7, 8, 10], "scratch": 7, "assum": [7, 8], "classmethod": 7, "original_ref": 7, "There": [7, 10], "usag": [7, 10], "amend": 7, "extend": 7, "creation": 7, "merge_var": [7, 10], "ident": [7, 10], "openfil": 7, "open_fil": 7, "concatenate_arrai": 7, "key_seper": 7, "check_arrai": 7, "along": [7, 9, 10], "concatent": 7, "recombin": 7, "single_zarr": 7, "hierarchi": 7, "recreat": 7, "check": 7, "inconsist": 7, "except": [7, 8], "certain": 7, "compat": [7, 8], "single_driv": 7, "single_kwarg": 7, 
"mzz_kwarg": 7, "n_batch": 7, "output_opt": 7, "run": [7, 9, 10], "client": 7, "mzz": [7, 9, 10], "multpl": 7, "thereof": 7, "ouput": 7, "preprocessor": 7, "remov": 7, "rename_target": 7, "renam": [7, 10], "predict": 7, "templat": [7, 8], "easili": 7, "overrid": 7, "rewrit": 7, "everi": 7, "old": 7, "alter": 7, "rename_target_fil": 7, "url_in": 7, "url_out": 7, "renate_target": 7, "overwrit": 7, "generate_coord": 7, "tag": 7, "probabl": 7, "tupl": 7, "order": 7, "factor": 7, "largest": 7, "deep": [7, 8], "divisor": 7, "modifi": 7, "threshold": [7, 10], "replac": [7, 9, 10], "short": [7, 10], "base64": [7, 8], "larger": [7, 10], "inline_arrai": 7, "fewer": 7, "constitu": 7, "prevent": 7, "dot": 7, "irrespect": 7, "fo": [7, 9, 10], "100000": 7, "categorical_threshold": 7, "mimic": 7, "normal": [7, 8], "datafram": 7, "writabl": 7, "deriv": 7, "extra": 7, "10000": [7, 8], "bigger": 7, "categor": [7, 8], "ratio": 7, "greater": 7, "kind": 8, "prototyp": 8, "key0": 8, "key1": 8, "target_url": 8, "item": [8, 10], "propos": 8, "zgroup": 8, "zarr_format": 8, "convent": [8, 10], "ugrid": 8, "9": [8, 10], "_array_dimens": 8, "node": 8, "9228245": 8, "f8": 8, "bucket": 8, "294094376": 8, "73825960": 8, "gen": 8, "ought": 8, "previou": 8, "definit": 8, "enhanc": 8, "template_nam": 8, "jinja": 8, "variable_nam": 8, "OR": 8, "key_nam": 8, "render": 8, "jinja2": 8, "equival": 8, "liter": [8, 10], "annot": 8, "server": 8, "domain": 8, "gen_kei": 8, "5": [8, 10], "key2": 8, "key3": 8, "cartesian": 8, "evalu": 8, "gen_key0": 8, "path_0": 8, "gen_key1": 8, "path_1": 8, "2000": [8, 10], "gen_key2": 8, "path_2": 8, "3000": 8, "gen_key3": 8, "path_3": 8, "4000": 8, "gen_key4": 8, "path_4": 8, "5000": 8, "verbos": 8, "easi": 8, "heavi": 8, "former": 8, "allevi": 8, "latter": 8, "particularli": 8, "appar": 8, "dure": 8, "phase": [8, 10], "_implementation_": 8, "layout": 8, "mapper": [8, 9], "figur": 8, "lz": 8, "z": 8, "open_group": 8, "w": 8, "create_dataset": 8, "g2": 8, "create_group": 
8, "zmetadata": 8, "subgroup": 8, "zeroth": 8, "nth": 8, "pad": 8, "realli": 8, "popul": [8, 10], "reserv": 8, "mostli": 8, "emsembl": 9, "defin": [9, 10], "urllist": 9, "simpli": 9, "redo": 9, "p": 9, "nwm": 9, "retro": 9, "v2": 9, "pd": [9, 10], "full_phys": 9, "2017": 9, "201704010000": 9, "chrtout_domain1": 9, "comp": 9, "201704010100": 9, "201704010200": 9, "201704010300": 9, "201704010400": 9, "201704010500": 9, "201704010600": 9, "201704010700": 9, "201704010800": 9, "201704010900": 9, "default_fill_cach": [9, 10], "default_cache_typ": [9, 10], "inf": 9, "h5chunk": [9, 10], "ensembl": [9, 10], "again": 9, "crucial": 9, "restrict": 9, "person": 9, "initi": [9, 10], "analysi": 9, "veloc": 9, "invoc": 9, "littl": [9, 10], "declar": 9, "catalog": [9, 10], "line": 9, "previous": [9, 10], "earth": [9, 10], "scienc": 9, "partner": 9, "2022": [9, 10], "winter": 9, "meet": 9, "free": [9, 10], "environ": 9, "lsterzing": 9, "esip": [9, 10], "intend": 10, "displai": 10, "pair": 10, "era5": 10, "replic": 10, "public": 10, "aw": 10, "frequent": 10, "manual": 10, "upload": 10, "destin": 10, "flist": 10, "2020": 10, "air_pressure_at_mean_sea_level": 10, "sea_surface_temperatur": 10, "fs2": 10, "pathlib": 10, "rb": 10, "avoid": 10, "lower": 10, "gen_json": 10, "file_url": 10, "infil": 10, "300": 10, "adjust": 10, "higher": 10, "outf": 10, "wb": 10, "dramat": 10, "30": 10, "sy": 10, "74": 10, "35": 10, "wall": 10, "14min": 10, "44": 10, "anonym": 10, "01_air_pressure_at_mean_sea_level": 10, "print": 10, "time0": 10, "744": 10, "lat": 10, "721": 10, "lon": 10, "1440": 10, "float32": 10, "90": 10, "89": 10, "75": 10, "nan": 10, "25": 10, "359": 10, "datetime64": 10, "01": 10, "202": 10, "chunksiz": 10, "24": 10, "meta": 10, "ndarrai": 10, "institut": 10, "ecmwf": 10, "reanalysi": 10, "titl": 10, "forecast": 10, "162": 10, "17": 10, "180": 10, "235": 10, "concat": 10, "json_list": 10, "_air_pressure_at_mean_sea_level": 10, "air_pressure_at_mean_sea_level_combin": 10, "reveal": 
10, "span": 10, "backend_arg": 10, "42": 10, "128": 10, "\u00b5": 10, "40": 10, "new_dim": 10, "new_dimens": 10, "ex": 10, "_air": 10, "custom": 10, "regular": 10, "express": 10, "cgl_toc_yyyymmddhhmm_x21y05_s3a_v1": 10, "fn_to_tim": 10, "subst": 10, "12": 10, "strptime": 10, "h": 10, "sort": 10, "iglob": 10, "3360": 10, "float64": 10, "34": 10, "99": 10, "39": 10, "11t07": 10, "00": 10, "53": 10, "ac_process_flag": 10, "168": 10, "oa02_toc": 10, "vza_olci": 10, "6": 10, "archive_facil": 10, "vito": 10, "copyright": 10, "copernicu": 10, "servic": 10, "similarli": 10, "02": 10, "29t23": 10, "shortnameecmwf": 10, "msl": 10, "01_sea_surface_temperatur": 10, "pre": 10, "pre_process": 10, "k": 10, "startswith": 10, "pop": 10, "vars_combin": 10, "02_sea_surface_temperatur": 10, "sea_surface_temperature_combin": 10, "696": 10, "29": 10, "post": 10, "fill_valu": 10, "manipul": 10, "modify_fill_valu": 10, "out_": 10, "999": 10, "referenc": 10, "43": 10, "year": 10, "sidecar": 10, "8gb": 10, "194mb": 10, "7gb": 10, "qhub": 10, "era5_2020_2022_multivar": 10, "zst": 10, "era5_1979_2022_multivar": 10, "380568": 10, "1979": 10, "air_temperature_at_2_metr": 10, "dew_point_temperature_at_2_metr": 10, "eastward_wind_at_100_metr": 10, "eastward_wind_at_10_metr": 10, "lwe_thickness_of_surface_snow_amount": 10, "northward_wind_at_100_metr": 10, "48": 10, "61": 10, "54": 10, "1min": 10, "reason": 10, "suggest": 10, "hide": 10, "open_catalog": 10, "intake_catalog": 10, "to_dask": 10, "shop": 10, "da": 10, "sel": 10, "2021": 10, "01t00": 10, "plot": 10, "79": 10, "382": 10, "18": 10, "22": 10, "198": 10, "slice": 10, "31": 10, "92": 10, "663": 10}, "objects": {"kerchunk.codecs": [[7, 0, 1, "", "AsciiTableCodec"], [7, 0, 1, "", "FillStringsCodec"], [7, 0, 1, "", "GRIBCodec"], [7, 0, 1, "", "RecordArrayMember"], [7, 0, 1, "", "VarArrCodec"]], "kerchunk.codecs.AsciiTableCodec": [[7, 1, 1, "", "__init__"]], "kerchunk.codecs.FillStringsCodec": [[7, 1, 1, "", "__init__"]], 
"kerchunk.codecs.GRIBCodec": [[7, 1, 1, "", "__init__"]], "kerchunk.codecs.RecordArrayMember": [[7, 1, 1, "", "__init__"]], "kerchunk.codecs.VarArrCodec": [[7, 1, 1, "", "__init__"]], "kerchunk.combine": [[7, 0, 1, "", "MultiZarrToZarr"], [7, 2, 1, "", "auto_dask"], [7, 2, 1, "", "concatenate_arrays"], [7, 2, 1, "", "drop"], [7, 2, 1, "", "merge_vars"]], "kerchunk.combine.MultiZarrToZarr": [[7, 1, 1, "", "__init__"], [7, 1, 1, "", "append"], [7, 1, 1, "", "translate"]], "kerchunk.df": [[7, 2, 1, "", "refs_to_dataframe"]], "kerchunk.fits": [[7, 2, 1, "", "process_file"]], "kerchunk.grib2": [[7, 2, 1, "", "scan_grib"]], "kerchunk.hdf": [[7, 0, 1, "", "SingleHdf5ToZarr"]], "kerchunk.hdf.SingleHdf5ToZarr": [[7, 1, 1, "", "translate"]], "kerchunk.netCDF3": [[7, 0, 1, "", "NetCDF3ToZarr"]], "kerchunk.netCDF3.NetCDF3ToZarr": [[7, 1, 1, "", "__init__"], [7, 1, 1, "", "translate"]], "kerchunk.tiff": [[7, 2, 1, "", "generate_coords"], [7, 2, 1, "", "tiff_to_zarr"]], "kerchunk.utils": [[7, 2, 1, "", "consolidate"], [7, 2, 1, "", "dereference_archives"], [7, 2, 1, "", "do_inline"], [7, 2, 1, "", "inline_array"], [7, 2, 1, "", "rename_target"], [7, 2, 1, "", "rename_target_files"], [7, 2, 1, "", "subchunk"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"]}, "titleterms": {"advanc": 0, "topic": 0, "us": [0, 6, 9, 10], "dask": 0, "simpl": 0, "parallel": 0, "tree": 0, "reduct": 0, "archiv": 0, "file": [0, 7, 9, 10], "parquet": [0, 6, 8], "storag": 0, "beyond": 1, "python": [1, 3], "case": 2, "studi": 2, "sentinel": 2, "global": 2, "coher": 2, "solar": 2, "dynam": 2, "observatori": 2, "nation": 2, "water": 2, "model": 2, "mur": 2, "sst": 2, "hrrr": 2, "contribut": 3, "kerchunk": [3, 5, 10], "bug": 3, "report": 3, "featur": 3, "request": 3, "code": 3, "creat": 3, "fork": 3, "develop": 3, "environ": 3, "set": 3, "up": 3, "pre": 
3, "commit": 3, "branch": 3, "run": 3, "test": 3, "suit": 3, "document": 3, "chang": 3, "detail": 4, "descript": 4, "binari": 4, "buffer": 4, "fsspec": 4, "virtual": 4, "filesystem": 4, "zarr": [4, 6], "reader": 4, "introduct": 5, "content": 5, "indic": 5, "tabl": 5, "non": 6, "tar": 6, "zstd": 6, "csv": 6, "json": [6, 9, 10], "orc": 6, "feather": 6, "api": 7, "refer": [7, 8], "format": 7, "backend": 7, "codec": 7, "combin": [7, 10], "util": 7, "specif": 8, "version": 8, "0": 8, "1": 8, "quick": 9, "start": 9, "singl": [9, 10], "multi": 9, "output": [9, 10], "exampl": 9, "tutori": [9, 10], "notebook": 9, "multipl": 10, "dataset": 10, "logic": 10, "aggreg": 10, "coo_map": 10, "merg": 10, "variabl": 10, "across": 10, "preprocess": 10, "postprocess": 10}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 60}, "alltitles": {"Advanced Topics": [[0, "advanced-topics"]], "Using Dask": [[0, "using-dask"]], "Simple parallel": [[0, "simple-parallel"]], "Tree reduction": [[0, "tree-reduction"]], "Archive Files": [[0, "archive-files"]], "Parquet Storage": [[0, "parquet-storage"]], "Beyond Python": [[1, "beyond-python"]], "Case studies": [[2, "case-studies"]], "Sentinel Global coherence": [[2, "sentinel-global-coherence"]], "Solar Dynamics Observatory": [[2, "solar-dynamics-observatory"]], "National Water Model": [[2, "national-water-model"]], "MUR SST": [[2, "mur-sst"]], "HRRR": [[2, "hrrr"]], "Contributing to kerchunk": [[3, "contributing-to-kerchunk"]], "Bug reports and feature requests": [[3, "bug-reports-and-feature-requests"]], "Contributing code": [[3, "contributing-code"]], "Creating a Fork": [[3, "creating-a-fork"]], "Creating a development environment": [[3, "creating-a-development-environment"]], 
"Creating a Python Environment": [[3, "creating-a-python-environment"]], "Setting up pre-commit": [[3, "setting-up-pre-commit"]], "Creating a branch": [[3, "creating-a-branch"]], "Running the test suite": [[3, "running-the-test-suite"]], "Contributing documentation": [[3, "contributing-documentation"]], "Contributing changes": [[3, "contributing-changes"]], "Detailed description": [[4, "detailed-description"]], "Binary buffers": [[4, "binary-buffers"]], "fsspec virtual filesystem": [[4, "fsspec-virtual-filesystem"]], "zarr reader": [[4, "zarr-reader"]], "kerchunk": [[5, "kerchunk"]], "Introduction": [[5, "introduction"]], "Contents:": [[5, null]], "Indices and tables": [[5, "indices-and-tables"]], "Non-zarr uses": [[6, "non-zarr-uses"]], ".tar.zstd": [[6, "tar-zstd"]], ".csv/.json": [[6, "csv-json"]], "parquet/orc/feather": [[6, "parquet-orc-feather"]], "API Reference": [[7, "api-reference"]], "File format backends": [[7, "file-format-backends"]], "Codecs": [[7, "codecs"]], "Combining": [[7, "combining"]], "Utilities": [[7, "utilities"]], "References specification": [[8, "references-specification"]], "Version 0": [[8, "version-0"]], "Version 1": [[8, "version-1"]], "Parquet references": [[8, "parquet-references"]], "Quick Start": [[9, "quick-start"]], "Single file JSONs": [[9, "single-file-jsons"], [10, "single-file-jsons"]], "Multi-file JSONs": [[9, "multi-file-jsons"]], "Using the output": [[9, "using-the-output"], [10, "using-the-output"]], "Example/Tutorial Notebook": [[9, "example-tutorial-notebook"]], "Tutorial": [[10, "tutorial"]], "Combine multiple kerchunked datasets into a single logical aggregate dataset": [[10, "combine-multiple-kerchunked-datasets-into-a-single-logical-aggregate-dataset"]], "Using coo_map": [[10, "using-coo-map"]], "Merging variables across jsons": [[10, "merging-variables-across-jsons"]], "Preprocessing": [[10, "preprocessing"]], "Postprocessing": [[10, "postprocessing"]]}, "indexentries": {"asciitablecodec (class in 
kerchunk.codecs)": [[7, "kerchunk.codecs.AsciiTableCodec"]], "fillstringscodec (class in kerchunk.codecs)": [[7, "kerchunk.codecs.FillStringsCodec"]], "gribcodec (class in kerchunk.codecs)": [[7, "kerchunk.codecs.GRIBCodec"]], "multizarrtozarr (class in kerchunk.combine)": [[7, "kerchunk.combine.MultiZarrToZarr"]], "netcdf3tozarr (class in kerchunk.netcdf3)": [[7, "kerchunk.netCDF3.NetCDF3ToZarr"]], "recordarraymember (class in kerchunk.codecs)": [[7, "kerchunk.codecs.RecordArrayMember"]], "singlehdf5tozarr (class in kerchunk.hdf)": [[7, "kerchunk.hdf.SingleHdf5ToZarr"]], "vararrcodec (class in kerchunk.codecs)": [[7, "kerchunk.codecs.VarArrCodec"]], "__init__() (kerchunk.codecs.asciitablecodec method)": [[7, "kerchunk.codecs.AsciiTableCodec.__init__"]], "__init__() (kerchunk.codecs.fillstringscodec method)": [[7, "kerchunk.codecs.FillStringsCodec.__init__"]], "__init__() (kerchunk.codecs.gribcodec method)": [[7, "kerchunk.codecs.GRIBCodec.__init__"]], "__init__() (kerchunk.codecs.recordarraymember method)": [[7, "kerchunk.codecs.RecordArrayMember.__init__"]], "__init__() (kerchunk.codecs.vararrcodec method)": [[7, "kerchunk.codecs.VarArrCodec.__init__"]], "__init__() (kerchunk.combine.multizarrtozarr method)": [[7, "kerchunk.combine.MultiZarrToZarr.__init__"]], "__init__() (kerchunk.netcdf3.netcdf3tozarr method)": [[7, "kerchunk.netCDF3.NetCDF3ToZarr.__init__"]], "append() (kerchunk.combine.multizarrtozarr class method)": [[7, "kerchunk.combine.MultiZarrToZarr.append"]], "auto_dask() (in module kerchunk.combine)": [[7, "kerchunk.combine.auto_dask"]], "concatenate_arrays() (in module kerchunk.combine)": [[7, "kerchunk.combine.concatenate_arrays"]], "consolidate() (in module kerchunk.utils)": [[7, "kerchunk.utils.consolidate"]], "dereference_archives() (in module kerchunk.utils)": [[7, "kerchunk.utils.dereference_archives"]], "do_inline() (in module kerchunk.utils)": [[7, "kerchunk.utils.do_inline"]], "drop() (in module kerchunk.combine)": [[7, 
"kerchunk.combine.drop"]], "generate_coords() (in module kerchunk.tiff)": [[7, "kerchunk.tiff.generate_coords"]], "inline_array() (in module kerchunk.utils)": [[7, "kerchunk.utils.inline_array"]], "merge_vars() (in module kerchunk.combine)": [[7, "kerchunk.combine.merge_vars"]], "process_file() (in module kerchunk.fits)": [[7, "kerchunk.fits.process_file"]], "refs_to_dataframe() (in module kerchunk.df)": [[7, "kerchunk.df.refs_to_dataframe"]], "rename_target() (in module kerchunk.utils)": [[7, "kerchunk.utils.rename_target"]], "rename_target_files() (in module kerchunk.utils)": [[7, "kerchunk.utils.rename_target_files"]], "scan_grib() (in module kerchunk.grib2)": [[7, "kerchunk.grib2.scan_grib"]], "subchunk() (in module kerchunk.utils)": [[7, "kerchunk.utils.subchunk"]], "tiff_to_zarr() (in module kerchunk.tiff)": [[7, "kerchunk.tiff.tiff_to_zarr"]], "translate() (kerchunk.combine.multizarrtozarr method)": [[7, "kerchunk.combine.MultiZarrToZarr.translate"]], "translate() (kerchunk.hdf.singlehdf5tozarr method)": [[7, "kerchunk.hdf.SingleHdf5ToZarr.translate"]], "translate() (kerchunk.netcdf3.netcdf3tozarr method)": [[7, "kerchunk.netCDF3.NetCDF3ToZarr.translate"]]}}) \ No newline at end of file diff --git a/spec.html b/spec.html index 880180de..9493141f 100644 --- a/spec.html +++ b/spec.html @@ -57,6 +57,7 @@
  • References specification
  • Beyond Python
  • @@ -214,6 +215,58 @@

    Version 1 +
    +

    Parquet references

    +

    Since JSON is rather verbose, it is easy with enough chunks to make a references file +that is too big: slow to load and heavy on memory. Although the former can be +alleviated by compression (I recommend Zstd), the latter cannot. This can +become particularly apparent during the combine phase when loading many reference sets.

    +

    The class fsspec.implementations.reference.LazyReferenceMapper provides an +alternative implementation whose on-disk layout is effectively a new reference +spec, which we describe here. The class itself has a dict mapper interface, just +like the rendered references from JSON files, except that it assumes that it is +working on a zarr dataset. This is because the references are split into files, and +an array’s shape/chunk information is used to figure out which reference file +to load.

    +

    The following code

    +
    lz = fsspec.implementations.reference.LazyReferenceMapper.create("ref.parquet")
    +z = zarr.open_group(lz, mode="w")
    +d = z.create_dataset("name", shape=(1,))
    +d[:] = 1
    +g2 = z.create_group("deep")
    +d = g2.create_dataset("name", shape=(1,))
    +d[:] = 1
    +
    +
    +

    produces files

    +
    ref.parquet/deep/name/refs.0.parq
    +ref.parquet/name/refs.0.parq
    +ref.parquet/.zmetadata
    +
    +
    +

    Here, .zmetadata holds the metadata of all subgroups/arrays (similar to +zarr “consolidated metadata”), with two top-level fields: “metadata” (dict[str, str], all of the +zarr metadata key/values) and “record_size”, an integer set during .create().

    +

    Each parquet file contains the references for the keys under the path where it is located. +For example, key “name/0” will be the zeroth reference in “./name/refs.0.parq”. If +there are multiple dimensions, normal C indexing is used to find the Nth reference, +and there are up to “record_size” references (default 10000) in the first file; +with the default record_size, the zero-based references 10000 to 19999 would be in “./name/refs.1.parq”. Each file is (for now) +padded to record_size, but they compress really well.

    +

    Each row of the parquet data contains fields

    +
    path: optional str/categorical, remote location URL
    +offset: int, start location of block
    +size: int, number of bytes in block
    +raw: optional bytes, binary data
    +
    +
    +

    If raw is populated, this is the data of the key. If path is +populated but size is 0, it is the whole file indicated. Otherwise, +it is a byte block in the indicated file. If both raw and path +are NULL, the key does not exist.

    +

    We reserve the possibility to store small array data in .zmetadata instead +of creating a small/mostly empty parquet file for each.

    +