first stab at issue #118
norlandrhagen committed May 16, 2024
1 parent 2d61d1a commit 39d7735
Showing 4 changed files with 62 additions and 6 deletions.
2 changes: 1 addition & 1 deletion virtualizarr/kerchunk.py
```diff
@@ -38,6 +38,7 @@ class FileType(AutoName):
     tiff = auto()
     fits = auto()
     zarr = auto()
+    kerchunk = auto()


 class NumpyEncoder(json.JSONEncoder):
@@ -117,7 +118,6 @@ def _automatically_determine_filetype(
     fpath = _fsspec_openfile_from_filepath(
         filepath=filepath, reader_options=reader_options
     )
-
     if file_extension == ".nc":
         # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167
         magic = fpath.read()
```
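For context on the `.nc` branch above: netCDF3 and netCDF4 files share the same extension but start with different magic bytes, which is what `_automatically_determine_filetype` goes on to inspect. A minimal standalone sketch of that kind of check, not the library's exact implementation (the byte signatures are the standard netCDF3 and HDF5 magic numbers):

```python
def sniff_netcdf_flavor(magic: bytes) -> str:
    # netCDF3 ("classic") files begin with b"CDF"; netCDF4 files are HDF5
    # containers and begin with the 8-byte HDF5 signature.
    if magic.startswith(b"CDF"):
        return "netCDF3"
    if magic.startswith(b"\x89HDF\r\n\x1a\n"):
        return "netCDF4"
    raise ValueError("not a recognizable netCDF file")


with open("air.nc", "rb") as f:  # hypothetical local file
    print(sniff_netcdf_flavor(f.read(8)))
```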
25 changes: 24 additions & 1 deletion virtualizarr/tests/test_kerchunk.py
```diff
@@ -7,7 +7,7 @@

 from virtualizarr.kerchunk import FileType, _automatically_determine_filetype
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
-from virtualizarr.xarray import dataset_from_kerchunk_refs
+from virtualizarr.xarray import dataset_from_kerchunk_refs, open_virtual_dataset


 def gen_ds_refs(
@@ -232,3 +232,26 @@ def test_FileType():
     assert "zarr" == FileType("zarr").name
     with pytest.raises(ValueError):
         FileType(None)
+
+
+@pytest.mark.parametrize(
+    "format",
+    [
+        "json",
+        pytest.param(
+            "parquet", marks=pytest.mark.xfail(reason="parquet reading not finished")
+        ),
+    ],
+)
+def test_kerchunk_to_virtual_dataset(netcdf4_file, tmpdir, format):
+    vds = open_virtual_dataset(netcdf4_file, indexes={})
+
+    # QUESTION: should these live in a fixture? e.g. kerchunk_ref_fpath_json, kerchunk_ref_fpath_parquet
+    vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
+
+    rt_vds = open_virtual_dataset(
+        filepath=f"{tmpdir}/refs.{format}", filetype="kerchunk"
+    )
+
+    # this currently fails: rt_vds is missing the attribute _ARRAY_DIMENSIONS: ['lat']
+    xrt.assert_equal(vds, rt_vds)
```
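On the QUESTION comment above: one way to move the reference files into a fixture would be a parametrized pytest fixture that writes the refs once per format. A hedged sketch only; the fixture name and parametrization below are hypothetical, and the xfail marking for parquet would still need to live somewhere:

```python
import pytest

from virtualizarr.xarray import open_virtual_dataset


@pytest.fixture(params=["json", "parquet"])
def kerchunk_ref_fpath(request, netcdf4_file, tmpdir):
    # Write kerchunk references for the sample netCDF file, once per format.
    fmt = request.param
    vds = open_virtual_dataset(netcdf4_file, indexes={})
    fpath = f"{tmpdir}/refs.{fmt}"
    vds.virtualize.to_kerchunk(fpath, format=fmt)
    return fpath
```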
10 changes: 7 additions & 3 deletions virtualizarr/utils.py
```diff
@@ -41,7 +41,10 @@ def _fsspec_openfile_from_filepath(
     protocol = universal_filepath.protocol

     if protocol == "":
-        fpath = fsspec.open(filepath, "rb").open()
+        # import pdb; pdb.set_trace()
+        fpath = fsspec.open(universal_filepath, "rb")
+        if universal_filepath.is_file():
+            fpath = fpath.open()

     elif protocol in ["s3"]:
         s3_anon_defaults = {"key": "", "secret": "", "anon": True}
@@ -53,8 +56,9 @@

         # using dict merge operator to add in defaults if keys are not specified
         storage_options = s3_anon_defaults | storage_options
-
-        fpath = fsspec.filesystem(protocol, **storage_options).open(filepath)
+        fpath = fsspec.filesystem(protocol, **storage_options)
+        if universal_filepath.is_file():
+            fpath = fpath.open(filepath)

     else:
         raise NotImplementedError(
```
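The net effect of the branches above: local paths (empty protocol) go through `fsspec.open`, `s3://` URLs get anonymous-access defaults, and in both cases the object is only `.open()`-ed when the target is an actual file, presumably so that directory-backed targets (such as a kerchunk parquet reference store) come back unopened. A quick sketch of the protocol dispatch driving this, assuming `universal_pathlib` is installed (the paths are hypothetical):

```python
from upath import UPath

# UPath exposes the fsspec protocol of a path: "" selects the local branch
# above, while "s3" selects the anonymous-by-default S3 branch.
print(repr(UPath("data/air.nc").protocol))         # ''
print(repr(UPath("s3://bucket/air.nc").protocol))  # 's3'
```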
31 changes: 30 additions & 1 deletion virtualizarr/xarray.py
```diff
@@ -54,7 +54,7 @@ def open_virtual_dataset(
         File path to open as a set of virtualized zarr arrays.
     filetype : FileType, default None
         Type of file to be opened. Used to determine which kerchunk file format backend to use.
-        Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}.
+        Can be one of {'netCDF3', 'netCDF4', 'zarr_v3', 'kerchunk'}.
         If not provided, will attempt to automatically infer the correct filetype from the filepath's extension.
     drop_variables: list[str], default is None
         Variables in the file to drop before returning.
@@ -102,6 +102,35 @@
         return open_virtual_dataset_from_v3_store(
             storepath=filepath, drop_variables=drop_variables, indexes=indexes
         )
+
+    if filetype == "kerchunk":
+        fpath = _fsspec_openfile_from_filepath(
+            filepath=filepath, reader_options=reader_options
+        )
+
+        from upath import UPath
+
+        kerchunk_storage_ftype = UPath(fpath.path).suffix
+
+        if kerchunk_storage_ftype == ".json":
+            import json
+
+            refs_dict = json.loads(fpath.read().decode("utf-8"))
+
+            vds = dataset_from_kerchunk_refs(refs_dict)
+            return vds
+        elif kerchunk_storage_ftype == ".parquet":
+            raise NotImplementedError
+
+            # Question: how should we read the parquet files
+            # into a dict to pass into dataset_from_kerchunk_refs?
+            # pandas, pyarrow table, duckdb?
+
+            # pd example retrieves: {'path': '...virtual_datas0/air.nc', 'offset': 15431, 'size': 7738000, 'raw': None}
+            # import pandas as pd
+            # refs_dict = pd.read_parquet(fpath.path).iloc[0].to_dict()
+            # vds = dataset_from_kerchunk_refs(refs_dict)
+

     else:
         # this is the only place we actually always need to use kerchunk directly
         # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
```
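On the parquet question in the hunk above: kerchunk writes parquet references as a directory-based store rather than a single table, so a plain `pd.read_parquet(...)` of one file only yields per-chunk rows like the `{'path': ..., 'offset': ..., 'size': ..., 'raw': None}` record noted in the comments. One candidate approach, sketched here with heavy hedging: use fsspec's `LazyReferenceMapper`, which reads that layout as a mapping from zarr keys to references, and materialize it into the dict shape `dataset_from_kerchunk_refs` expects. An unvalidated sketch, not the commit's method:

```python
import fsspec
from fsspec.implementations.reference import LazyReferenceMapper


def kerchunk_parquet_to_refs_dict(store_path: str) -> dict:
    # LazyReferenceMapper understands kerchunk's directory-of-parquet-files
    # layout and behaves like a mapping from zarr keys to references.
    fs = fsspec.filesystem("file")
    mapper = LazyReferenceMapper(store_path, fs=fs)
    # Materialize into the plain {"version": 1, "refs": {...}} structure;
    # raw (inlined) chunk bytes may need extra encoding to be JSON-safe.
    return {"version": 1, "refs": {key: mapper[key] for key in mapper}}


# Hypothetical usage, mirroring the JSON branch:
# refs_dict = kerchunk_parquet_to_refs_dict(f"{tmpdir}/refs.parquet")
# vds = dataset_from_kerchunk_refs(refs_dict)
```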
