first stab at issue #118
norlandrhagen committed May 16, 2024
1 parent 2d61d1a commit 39d7735
Showing 4 changed files with 62 additions and 6 deletions.
2 changes: 1 addition & 1 deletion virtualizarr/kerchunk.py
```diff
@@ -38,6 +38,7 @@ class FileType(AutoName):
     tiff = auto()
     fits = auto()
     zarr = auto()
+    kerchunk = auto()


 class NumpyEncoder(json.JSONEncoder):
@@ -117,7 +118,6 @@ def _automatically_determine_filetype(
     fpath = _fsspec_openfile_from_filepath(
         filepath=filepath, reader_options=reader_options
     )
-
     if file_extension == ".nc":
         # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167
         magic = fpath.read()
```
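For context on the `.nc` branch above: netCDF3 and netCDF4 files share the same extension but start with different magic bytes, which is what `_automatically_determine_filetype` goes on to inspect. A minimal standalone sketch of that kind of check, not the library's exact implementation (the byte signatures are the standard netCDF3 and HDF5 magic numbers):

```python
def sniff_netcdf_flavor(magic: bytes) -> str:
    # netCDF3 ("classic") files begin with b"CDF"; netCDF4 files are HDF5
    # containers and begin with the 8-byte HDF5 signature.
    if magic.startswith(b"CDF"):
        return "netCDF3"
    if magic.startswith(b"\x89HDF\r\n\x1a\n"):
        return "netCDF4"
    raise ValueError("not a recognizable netCDF file")


with open("air.nc", "rb") as f:  # hypothetical local file
    print(sniff_netcdf_flavor(f.read(8)))
```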
25 changes: 24 additions & 1 deletion virtualizarr/tests/test_kerchunk.py
```diff
@@ -7,7 +7,7 @@

 from virtualizarr.kerchunk import FileType, _automatically_determine_filetype
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
-from virtualizarr.xarray import dataset_from_kerchunk_refs
+from virtualizarr.xarray import dataset_from_kerchunk_refs, open_virtual_dataset


 def gen_ds_refs(
@@ -232,3 +232,26 @@ def test_FileType():
     assert "zarr" == FileType("zarr").name
     with pytest.raises(ValueError):
         FileType(None)
+
+
+@pytest.mark.parametrize(
+    "format",
+    [
+        "json",
+        pytest.param(
+            "parquet", marks=pytest.mark.xfail(reason="parquet reading not finished")
+        ),
+    ],
+)
+def test_kerchunk_to_virtual_dataset(netcdf4_file, tmpdir, format):
+    vds = open_virtual_dataset(netcdf4_file, indexes={})
+
+    # QUESTION: should these live in a fixture? e.g. kerchunk_ref_fpath_json, kerchunk_ref_fpath_parquet
+    vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
+
+    rt_vds = open_virtual_dataset(
+        filepath=f"{tmpdir}/refs.{format}", filetype="kerchunk"
+    )
+
+    # this currently fails: rt_vds is missing the attribute _ARRAY_DIMENSIONS: ['lat']
+    xrt.assert_equal(vds, rt_vds)
```
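On the QUESTION comment above: one way to move the reference files into a fixture would be a parametrized pytest fixture that writes the refs once per format. A hedged sketch only; the fixture name and parametrization below are hypothetical, and the xfail marking for parquet would still need to live somewhere:

```python
import pytest

from virtualizarr.xarray import open_virtual_dataset


@pytest.fixture(params=["json", "parquet"])
def kerchunk_ref_fpath(request, netcdf4_file, tmpdir):
    # Write kerchunk references for the sample netCDF file, once per format.
    fmt = request.param
    vds = open_virtual_dataset(netcdf4_file, indexes={})
    fpath = f"{tmpdir}/refs.{fmt}"
    vds.virtualize.to_kerchunk(fpath, format=fmt)
    return fpath
```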
10 changes: 7 additions & 3 deletions virtualizarr/utils.py
```diff
@@ -41,7 +41,10 @@ def _fsspec_openfile_from_filepath(
     protocol = universal_filepath.protocol

     if protocol == "":
-        fpath = fsspec.open(filepath, "rb").open()
+        # import pdb; pdb.set_trace()
+        fpath = fsspec.open(universal_filepath, "rb")
+        if universal_filepath.is_file():
+            fpath = fpath.open()

     elif protocol in ["s3"]:
         s3_anon_defaults = {"key": "", "secret": "", "anon": True}
@@ -53,8 +56,9 @@

         # using dict merge operator to add in defaults if keys are not specified
         storage_options = s3_anon_defaults | storage_options
-
-        fpath = fsspec.filesystem(protocol, **storage_options).open(filepath)
+        fpath = fsspec.filesystem(protocol, **storage_options)
+        if universal_filepath.is_file():
+            fpath = fpath.open(filepath)

     else:
         raise NotImplementedError(
```
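The net effect of the branches above: local paths (empty protocol) go through `fsspec.open`, `s3://` URLs get anonymous-access defaults, and in both cases the object is only `.open()`-ed when the target is an actual file, presumably so that directory-backed targets (such as a kerchunk parquet reference store) come back unopened. A quick sketch of the protocol dispatch driving this, assuming `universal_pathlib` is installed (the paths are hypothetical):

```python
from upath import UPath

# UPath exposes the fsspec protocol of a path: "" selects the local branch
# above, while "s3" selects the anonymous-by-default S3 branch.
print(repr(UPath("data/air.nc").protocol))         # ''
print(repr(UPath("s3://bucket/air.nc").protocol))  # 's3'
```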
31 changes: 30 additions & 1 deletion virtualizarr/xarray.py
```diff
@@ -54,7 +54,7 @@ def open_virtual_dataset(
         File path to open as a set of virtualized zarr arrays.
     filetype : FileType, default None
         Type of file to be opened. Used to determine which kerchunk file format backend to use.
-        Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}.
+        Can be one of {'netCDF3', 'netCDF4', 'zarr_v3', 'kerchunk'}.
         If not provided, will attempt to automatically infer the correct filetype from the filepath's extension.
     drop_variables: list[str], default is None
         Variables in the file to drop before returning.
@@ -102,6 +102,35 @@
         return open_virtual_dataset_from_v3_store(
             storepath=filepath, drop_variables=drop_variables, indexes=indexes
         )
+
+    if filetype == "kerchunk":
+        fpath = _fsspec_openfile_from_filepath(
+            filepath=filepath, reader_options=reader_options
+        )
+
+        from upath import UPath
+
+        kerchunk_storage_ftype = UPath(fpath.path).suffix
+
+        if kerchunk_storage_ftype == ".json":
+            import json
+
+            refs_dict = json.loads(fpath.read().decode("utf-8"))
+
+            vds = dataset_from_kerchunk_refs(refs_dict)
+            return vds
+        elif kerchunk_storage_ftype == ".parquet":
+            raise NotImplementedError
+
+            # Question: how should we read the parquet files
+            # into a dict to pass into dataset_from_kerchunk_refs?
+            # pandas, pyarrow table, duckdb?
+
+            # pd example retrieves: {'path': '...virtual_datas0/air.nc', 'offset': 15431, 'size': 7738000, 'raw': None}
+            # import pandas as pd
+            # refs_dict = pd.read_parquet(fpath.path).iloc[0].to_dict()
+            # vds = dataset_from_kerchunk_refs(refs_dict)
+

     else:
         # this is the only place we actually always need to use kerchunk directly
         # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
```
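On the parquet question in the hunk above: kerchunk writes parquet references as a directory-based store rather than a single table, so a plain `pd.read_parquet(...)` of one file only yields per-chunk rows like the `{'path': ..., 'offset': ..., 'size': ..., 'raw': None}` record noted in the comments. One candidate approach, sketched here with heavy hedging: use fsspec's `LazyReferenceMapper`, which reads that layout as a mapping from zarr keys to references, and materialize it into the dict shape `dataset_from_kerchunk_refs` expects. An unvalidated sketch, not the commit's method:

```python
import fsspec
from fsspec.implementations.reference import LazyReferenceMapper


def kerchunk_parquet_to_refs_dict(store_path: str) -> dict:
    # LazyReferenceMapper understands kerchunk's directory-of-parquet-files
    # layout and behaves like a mapping from zarr keys to references.
    fs = fsspec.filesystem("file")
    mapper = LazyReferenceMapper(store_path, fs=fs)
    # Materialize into the plain {"version": 1, "refs": {...}} structure;
    # raw (inlined) chunk bytes may need extra encoding to be JSON-safe.
    return {"version": 1, "refs": {key: mapper[key] for key in mapper}}


# Hypothetical usage, mirroring the JSON branch:
# refs_dict = kerchunk_parquet_to_refs_dict(f"{tmpdir}/refs.parquet")
# vds = dataset_from_kerchunk_refs(refs_dict)
```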
