Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zarr-python v3 compatibility #516

Draft
wants to merge 39 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
39722e7
Save progress for next week
mpiannucci Oct 4, 2024
d3c7e37
Bump zarr python version
mpiannucci Oct 5, 2024
25d7d14
Get some tests working others failing
mpiannucci Oct 5, 2024
ffe5f9d
get through single hdf to zarr
mpiannucci Oct 8, 2024
5aef233
Save progress
mpiannucci Oct 8, 2024
b9323d2
Cleanup, almost working with hdf
mpiannucci Oct 9, 2024
0f17119
Closer...
mpiannucci Oct 9, 2024
5c8806b
Updating tests
mpiannucci Oct 9, 2024
80fedcd
reorganize
mpiannucci Oct 10, 2024
1f69a0b
Save progress
mpiannucci Oct 10, 2024
d556e52
Refactor to clean things up
mpiannucci Oct 10, 2024
b27e64c
Fix circular import
mpiannucci Oct 10, 2024
41d6e8e
Iterate
mpiannucci Oct 10, 2024
7ade1a6
Change zarr dep
mpiannucci Oct 10, 2024
492ddee
More conversion
mpiannucci Oct 10, 2024
6e5741c
Specify zarr version
mpiannucci Oct 15, 2024
c0316ac
Working remote hdf tests
mpiannucci Oct 23, 2024
59bd36c
Working grib impl
mpiannucci Oct 23, 2024
187ced2
Add back commented out code
mpiannucci Oct 23, 2024
690ed21
Make grib codec a compressor since its bytes to array
mpiannucci Oct 23, 2024
5019b15
Switch back
mpiannucci Oct 23, 2024
d96cf46
Add first pass at grib zarr 3 codec
mpiannucci Oct 26, 2024
cbcb720
Fix typing
mpiannucci Oct 29, 2024
b88655f
Fix some broken tests; use async filesystem wrapper
moradology Nov 6, 2024
73eaf33
Implement zarr3 compatibility for grib
moradology Nov 20, 2024
3757199
Use zarr3 stores directly; avoid use of internal fs
moradology Nov 21, 2024
9444ff8
Merge pull request #4 from moradology/fix/zarr3-grib-tests
mpiannucci Nov 26, 2024
d8848ce
Forward
mpiannucci Nov 26, 2024
1fa294e
More
mpiannucci Nov 26, 2024
543178d
Figure out async wrapper
mpiannucci Nov 26, 2024
96b56cd
Closer on hdf5
mpiannucci Nov 26, 2024
0808b05
netcdf but failing
mpiannucci Nov 26, 2024
aef006e
grib passing
mpiannucci Nov 26, 2024
d9bf0dd
Fix inline test
mpiannucci Nov 26, 2024
884fc68
More
mpiannucci Nov 26, 2024
1145f45
standardize compressor name
mpiannucci Nov 27, 2024
94ec479
Fix one more hdf test
mpiannucci Nov 27, 2024
a9693d1
Small tweaks
mpiannucci Nov 27, 2024
7e9112a
Hide fsspec import where necessary
mpiannucci Nov 27, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 80 additions & 3 deletions kerchunk/codecs.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
import ast
from dataclasses import dataclass
import io
from typing import Self, TYPE_CHECKING

import numcodecs
from numcodecs.abc import Codec
import numpy as np
import threading
import zlib
from zarr.core.array_spec import ArraySpec
from zarr.abc.codec import ArrayBytesCodec
from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer
from zarr.core.common import JSON, parse_enum, parse_named_configuration
from zarr.registry import register_codec


class FillStringsCodec(Codec):
Expand Down Expand Up @@ -115,6 +122,78 @@ def decode(self, buf, out=None):
numcodecs.register_codec(GRIBCodec, "grib")


@dataclass(frozen=True)
class GRIBZarrCodec(ArrayBytesCodec):
    """Zarr v3 array-bytes codec that decodes a single GRIB message into an array.

    Decode-only: each chunk is expected to hold one complete GRIB message, from
    which a single key (``var``) is extracted via eccodes. Encoding is not
    supported and raises ``NotImplementedError``.
    """

    # eccodes is not thread-safe/re-entrant; all codes_* calls are serialized
    # on this class-level lock.
    eclock = threading.RLock()

    var: str
    dtype: np.dtype

    def __init__(self, *, var: str, dtype: np.dtype) -> None:
        # frozen dataclass: attributes must be set via object.__setattr__
        object.__setattr__(self, "var", var)
        object.__setattr__(self, "dtype", dtype)

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        # BUG FIX: this codec is registered/serialized under the name "grib",
        # not "bytes" — the "bytes" literal was copy-paste residue from zarr's
        # BytesCodec and made to_dict() -> from_dict() round-tripping impossible.
        _, configuration_parsed = parse_named_configuration(
            data, "grib", require_configuration=True
        )
        configuration_parsed = configuration_parsed or {}
        return cls(**configuration_parsed)  # type: ignore[arg-type]

    def to_dict(self) -> dict[str, JSON]:
        # BUG FIX: the original branched on `self.endian`, an attribute this
        # class never defines (copied from zarr's BytesCodec), so calling
        # to_dict() raised AttributeError. var/dtype are always set, so always
        # emit the full configuration; str() makes the dtype JSON-serializable.
        return {
            "name": "grib",
            "configuration": {"var": self.var, "dtype": str(self.dtype)},
        }

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        assert isinstance(chunk_bytes, Buffer)
        import eccodes

        # lat/lon requests map to the plural eccodes keys; everything else is
        # read from the message's "values" array.
        if self.var in ["latitude", "longitude"]:
            var = self.var + "s"
            dt = self.dtype or "float64"
        else:
            var = "values"
            dt = self.dtype or "float32"

        with self.eclock:
            mid = eccodes.codes_new_from_message(chunk_bytes.to_bytes())
            try:
                data = eccodes.codes_get_array(mid, var)
                missingValue = eccodes.codes_get_string(mid, "missingValue")
                # Replace the sentinel missing value with NaN in data arrays.
                if var == "values" and missingValue:
                    data[data == float(missingValue)] = np.nan
                return data.astype(dt, copy=False)

            finally:
                # Always release the eccodes message handle, even on error.
                eccodes.codes_release(mid)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        # This is a one-way (decode-only) codec.
        raise NotImplementedError

    def compute_encoded_size(
        self, input_byte_length: int, _chunk_spec: ArraySpec
    ) -> int:
        # Encoded size is unknowable without encoding support.
        raise NotImplementedError


register_codec("grib", GRIBZarrCodec)


class AsciiTableCodec(numcodecs.abc.Codec):
"""Decodes ASCII-TABLE extensions in FITS files"""

Expand Down Expand Up @@ -166,7 +245,6 @@ def decode(self, buf, out=None):
arr2 = np.empty((self.nrow,), dtype=dt_out)
heap = buf[arr.nbytes :]
for name in dt_out.names:

if dt_out[name] == "O":
dt = np.dtype(self.ftypes[self.types[name]])
counts = arr[name][:, 0]
Expand Down Expand Up @@ -244,8 +322,7 @@ def encode(self, buf):
class ZlibCodec(Codec):
codec_id = "zlib"

def __init__(self):
...
def __init__(self): ...

def decode(self, data, out=None):
if out:
Expand Down
63 changes: 39 additions & 24 deletions kerchunk/combine.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import collections.abc
import logging
import re
Expand All @@ -10,8 +11,9 @@
import numcodecs
import ujson
import zarr
from zarr.core.buffer.core import default_buffer_prototype

from kerchunk.utils import consolidate
from kerchunk.utils import consolidate, fs_as_store, translate_refs_serializable

logger = logging.getLogger("kerchunk.combine")

Expand Down Expand Up @@ -199,11 +201,12 @@ def append(
remote_protocol=remote_protocol,
remote_options=remote_options,
target_options=target_options,
asynchronous=True
)
ds = xr.open_dataset(
fs.get_mapper(), engine="zarr", backend_kwargs={"consolidated": False}
)
z = zarr.open(fs.get_mapper())
z = zarr.open(fs.get_mapper(), zarr_format=2)
mzz = MultiZarrToZarr(
path,
out=fs.references, # dict or parquet/lazy
Expand Down Expand Up @@ -264,7 +267,7 @@ def fss(self):
self._paths = []
for of in fsspec.open_files(self.path, **self.target_options):
self._paths.append(of.full_name)
fs = fsspec.core.url_to_fs(self.path[0], **self.target_options)[0]
fs = fsspec.core.url_to_fs(self.path[0], asynchronous=True, **self.target_options)[0]
try:
# JSON path
fo_list = fs.cat(self.path)
Expand Down Expand Up @@ -348,6 +351,16 @@ def _get_value(self, index, z, var, fn=None):
logger.debug("Decode: %s -> %s", (selector, index, var, fn), o)
return o

async def _read_meta_files(self, m, files):
"""Helper to load multiple metadata files asynchronously"""
res = {}
for fn in files:
exists = await m.exists(fn)
if exists:
content = await m.get(fn, prototype=default_buffer_prototype())
res[fn] = ujson.dumps(ujson.loads(content.to_bytes()))
return res

def first_pass(self):
"""Accumulate the set of concat coords values across all inputs"""

Expand All @@ -360,7 +373,8 @@ def first_pass(self):
fs._dircache_from_items()

logger.debug("First pass: %s", i)
z = zarr.open_group(fs.get_mapper(""))
z_store = fs_as_store(fs, read_only=False)
z = zarr.open_group(z_store, zarr_format=2)
for var in self.concat_dims:
value = self._get_value(i, z, var, fn=self._paths[i])
if isinstance(value, np.ndarray):
Expand All @@ -386,16 +400,16 @@ def store_coords(self):
Write coordinate arrays into the output
"""
kv = {}
store = zarr.storage.KVStore(kv)
group = zarr.open(store)
m = self.fss[0].get_mapper("")
z = zarr.open(m)
store = zarr.storage.MemoryStore(kv)
group = zarr.open_group(store, zarr_format=2)
m = fs_as_store(self.fss[0], read_only=False)
z = zarr.open(m, zarr_format=2)
for k, v in self.coos.items():
if k == "var":
# The names of the variables to write in the second pass, not a coordinate
continue
# parametrize the threshold value below?
compression = numcodecs.Zstd() if len(v) > 100 else None
compressor = numcodecs.Zstd() if len(v) > 100 else None
kw = {}
if self.cf_units and k in self.cf_units:
if "M" not in self.coo_dtypes.get(k, ""):
Expand All @@ -420,11 +434,12 @@ def store_coords(self):
elif k in z:
# Fall back to existing fill value
kw["fill_value"] = z[k].fill_value
arr = group.create_dataset(
arr = group.create_array(
name=k,
data=data,
overwrite=True,
compressor=compression,
shape=data.shape,
exists_ok=True,
compressor=compressor,
dtype=self.coo_dtypes.get(k, data.dtype),
**kw,
)
Expand All @@ -441,10 +456,9 @@ def store_coords(self):
# TODO: rewrite .zarray/.zattrs with ujson to save space. Maybe make them by hand anyway.
self.out.update(kv)
logger.debug("Written coordinates")
for fn in [".zgroup", ".zattrs"]:
# top-level group attributes from first input
if fn in m:
self.out[fn] = ujson.dumps(ujson.loads(m[fn]))

metadata = asyncio.run(self._read_meta_files(m, [".zgroup", ".zattrs"]))
self.out.update(metadata)
logger.debug("Written global metadata")
self.done.add(2)

Expand All @@ -460,8 +474,8 @@ def second_pass(self):

for i, fs in enumerate(self.fss):
to_download = {}
m = fs.get_mapper("")
z = zarr.open(m)
m = fs_as_store(fs, read_only=False)
z = zarr.open(m, zarr_format=2)

if no_deps is None:
# done first time only
Expand Down Expand Up @@ -491,9 +505,8 @@ def second_pass(self):
if f"{v}/.zgroup" in fns:
# recurse into groups - copy meta, add to dirs to process and don't look
# for references in this dir
self.out[f"{v}/.zgroup"] = m[f"{v}/.zgroup"]
if f"{v}/.zattrs" in fns:
self.out[f"{v}/.zattrs"] = m[f"{v}/.zattrs"]
metadata = asyncio.run(self._read_meta_files(m, [f"{v}/.zgroup", f"{v}/.zattrs"]))
self.out.update(metadata)
dirs.extend([f for f in fns if not f.startswith(f"{v}/.z")])
continue
if v in self.identical_dims:
Expand All @@ -504,8 +517,9 @@ def second_pass(self):
self.out[k] = fs.references[k]
continue
logger.debug("Second pass: %s, %s", i, v)

zarray = ujson.loads(m[f"{v}/.zarray"])

zarray = asyncio.run(self._read_meta_files(m, [f"{v}/.zarray"]))[f"{v}/.zarray"]
zarray = ujson.loads(zarray)
if v not in chunk_sizes:
chunk_sizes[v] = zarray["chunks"]
elif chunk_sizes[v] != zarray["chunks"]:
Expand All @@ -516,7 +530,8 @@ def second_pass(self):
chunks so far: {zarray["chunks"]}"""
)
chunks = chunk_sizes[v]
zattrs = ujson.loads(m.get(f"{v}/.zattrs", "{}"))
zattr_meta = asyncio.run(self._read_meta_files(m, [f"{v}/.zattrs"]))
zattrs = ujson.loads(zattr_meta.get(f"{v}/.zattrs", {}))
coords = zattrs.get("_ARRAY_DIMENSIONS", [])
if zarray["shape"] and not coords:
coords = list("ikjlm")[: len(zarray["shape"])]
Expand Down
11 changes: 6 additions & 5 deletions kerchunk/fits.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from fsspec.implementations.reference import LazyReferenceMapper


from kerchunk.utils import class_factory
from kerchunk.utils import class_factory, dict_to_store
from kerchunk.codecs import AsciiTableCodec, VarArrCodec

try:
Expand Down Expand Up @@ -72,7 +72,8 @@ def process_file(

storage_options = storage_options or {}
out = out or {}
g = zarr.open(out)
store = dict_to_store(out)
g = zarr.open_group(store=store, zarr_format=2)

with fsspec.open(url, mode="rb", **storage_options) as f:
infile = fits.open(f, do_not_scale_image_data=True)
Expand Down Expand Up @@ -150,7 +151,7 @@ def process_file(
for name in dtype.names
if hdu.columns[name].format.startswith(("P", "Q"))
}
kwargs["object_codec"] = VarArrCodec(
kwargs["compressor"] = VarArrCodec(
str(dtype), str(dt2), nrows, types
)
dtype = dt2
Expand All @@ -164,7 +165,7 @@ def process_file(
# TODO: we could sub-chunk on biggest dimension
name = hdu.name or str(ext)
arr = g.empty(
name, dtype=dtype, shape=shape, chunks=shape, compression=None, **kwargs
name=name, dtype=dtype, shape=shape, chunks=shape, zarr_format=2, **kwargs
)
arr.attrs.update(
{
Expand Down Expand Up @@ -248,7 +249,7 @@ def add_wcs_coords(hdu, zarr_group=None, dataset=None, dtype="float32"):
}
if zarr_group is not None:
arr = zarr_group.empty(
name, shape=shape, chunks=shape, overwrite=True, dtype=dtype
name, shape=shape, chunks=shape, dtype=dtype, exists_ok=True
)
arr.attrs.update(attrs)
arr[:] = world_coord.value.reshape(shape)
Expand Down
Loading