From 7d06d2ef07a50cc055d758609bead886b80b55f9 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 30 Jan 2024 12:17:52 -0500 Subject: [PATCH 1/8] Add test for appending with cftime --- kerchunk/tests/test_combine.py | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/kerchunk/tests/test_combine.py b/kerchunk/tests/test_combine.py index 3e08dc67..7174e0ae 100644 --- a/kerchunk/tests/test_combine.py +++ b/kerchunk/tests/test_combine.py @@ -174,6 +174,19 @@ b'1970-01-01T00:00:00"}', ) +tdata1 = xr.DataArray( + data=arr, + coords={"time": np.array([3])}, + dims=["time", "x", "y"], + name="data", +) +xr.Dataset({"data": tdata1}).to_zarr("memory://cfstdtime3.zarr") +fs.pipe( + "cfstdtime3.zarr/time/.zattrs", + b'{"_ARRAY_DIMENSIONS": ["time"], "units": "seconds since ' + b'1970-01-01T00:00:00"}', +) + # cftime arrays - non standard tdata1 = xr.DataArray( data=arr, @@ -345,6 +358,45 @@ def test_single_append(refs): assert z.time.values.tolist() == [1, 2, 3] +def test_single_append_cf(refs): + mzz = MultiZarrToZarr( + [refs["cfstdtime1"], refs["cfstdtime2"]], + remote_protocol="memory", + concat_dims=["time"], + ) + out = mzz.translate() + # mzz = MultiZarrToZarr.append( + # [refs["cfstdtime3"]], + # out, + # remote_protocol="memory", + # concat_dims=["time"], + # ) + # out = mzz.translate() + z = xr.open_dataset( + "reference://", + backend_kwargs={ + "storage_options": {"fo": out, "remote_protocol": "memory"}, + "consolidated": False, + }, + engine="zarr", + ) + assert z.data.shape == (3, 10, 10) + assert out["refs"]["data/0.0.0"] == ["memory:///cfstdtime1.zarr/data/0.0.0"] + assert out["refs"]["data/1.0.0"] == ["memory:///cfstdtime2.zarr/data/0.0.0"] + assert out["refs"]["data/3.0.0"] == ["memory:///cfstdtime3.zarr/data/0.0.0"] + np.testing.assert_equal( + z.time.values, + np.array( + [ + "1970-01-01T00:00:01.000000000", + "1970-01-01T00:00:02.000000000", + "1970-01-01T00:00:03.000000000", + 
], + dtype="datetime64[ns]", + ), + ) + + def test_single_append_parquet(refs): from fsspec.implementations.reference import LazyReferenceMapper From f8e0d270d2aa13dbb00842e6c3e6150f4f351d4c Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 30 Jan 2024 12:18:52 -0500 Subject: [PATCH 2/8] Uncomment append --- kerchunk/tests/test_combine.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kerchunk/tests/test_combine.py b/kerchunk/tests/test_combine.py index 7174e0ae..f41de12b 100644 --- a/kerchunk/tests/test_combine.py +++ b/kerchunk/tests/test_combine.py @@ -365,13 +365,13 @@ def test_single_append_cf(refs): concat_dims=["time"], ) out = mzz.translate() - # mzz = MultiZarrToZarr.append( - # [refs["cfstdtime3"]], - # out, - # remote_protocol="memory", - # concat_dims=["time"], - # ) - # out = mzz.translate() + mzz = MultiZarrToZarr.append( + [refs["cfstdtime3"]], + out, + remote_protocol="memory", + concat_dims=["time"], + ) + out = mzz.translate() z = xr.open_dataset( "reference://", backend_kwargs={ From 2c8475d2c5144f0dfe84cfd5f1d17dfcdaff2961 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 30 Jan 2024 14:42:29 -0500 Subject: [PATCH 3/8] Fix cf time without cf (also fix drop coords in kerchunk engine) --- kerchunk/combine.py | 3 ++- kerchunk/tests/test_combine.py | 2 +- kerchunk/xarray_backend.py | 11 ++--------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/kerchunk/combine.py b/kerchunk/combine.py index 0aa4c68c..559e011c 100644 --- a/kerchunk/combine.py +++ b/kerchunk/combine.py @@ -201,6 +201,7 @@ def append( ds = xr.open_dataset( fs.get_mapper(), engine="zarr", backend_kwargs={"consolidated": False} ) + z = zarr.open(fs.get_mapper()) mzz = MultiZarrToZarr( path, out=fs.references, # dict or parquet/lazy @@ -235,7 +236,7 @@ def append( mzz.coos[var].add(value2) else: - mzz.coos[var] = set(ds[var].values) + mzz.coos[var] = set(z[var][:]) return mzz @property 
diff --git a/kerchunk/tests/test_combine.py b/kerchunk/tests/test_combine.py index f41de12b..34a1bd95 100644 --- a/kerchunk/tests/test_combine.py +++ b/kerchunk/tests/test_combine.py @@ -383,7 +383,7 @@ def test_single_append_cf(refs): assert z.data.shape == (3, 10, 10) assert out["refs"]["data/0.0.0"] == ["memory:///cfstdtime1.zarr/data/0.0.0"] assert out["refs"]["data/1.0.0"] == ["memory:///cfstdtime2.zarr/data/0.0.0"] - assert out["refs"]["data/3.0.0"] == ["memory:///cfstdtime3.zarr/data/0.0.0"] + assert out["refs"]["data/2.0.0"] == ["memory:///cfstdtime3.zarr/data/0.0.0"] np.testing.assert_equal( z.time.values, np.array( diff --git a/kerchunk/xarray_backend.py b/kerchunk/xarray_backend.py index badf2907..ca377f6d 100644 --- a/kerchunk/xarray_backend.py +++ b/kerchunk/xarray_backend.py @@ -6,21 +6,14 @@ class KerchunkBackend(BackendEntrypoint): def open_dataset( - self, - filename_or_obj, - *, - drop_variables=None, - storage_options=None, - open_dataset_options=None + self, filename_or_obj, *, storage_options=None, open_dataset_options=None, **kw ): - + open_dataset_options = (open_dataset_options or {}) | kw ref_ds = open_reference_dataset( filename_or_obj, storage_options=storage_options, open_dataset_options=open_dataset_options, ) - if drop_variables is not None: - ref_ds = ref_ds.drop_vars(drop_variables) return ref_ds open_dataset_parameters = [ From ec5e24ca502e66a36162c56235aaa104060bbad0 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 30 Jan 2024 15:00:41 -0500 Subject: [PATCH 4/8] early convert --- kerchunk/combine.py | 2 ++ kerchunk/tests/test_combine.py | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/kerchunk/combine.py b/kerchunk/combine.py index 559e011c..69ec3cc6 100644 --- a/kerchunk/combine.py +++ b/kerchunk/combine.py @@ -337,6 +337,8 @@ def _get_value(self, index, z, var, fn=None): self.cf_units[var] = dict(units=units, calendar=calendar) else: o = selector # must be a non-number constant - error? 
+ if var in self.coo_dtypes: + o = np.array(o, dtype=self.coo_dtypes[var]) logger.debug("Decode: %s -> %s", (selector, index, var, fn), o) return o diff --git a/kerchunk/tests/test_combine.py b/kerchunk/tests/test_combine.py index 34a1bd95..64e6e5c5 100644 --- a/kerchunk/tests/test_combine.py +++ b/kerchunk/tests/test_combine.py @@ -358,11 +358,15 @@ def test_single_append(refs): assert z.time.values.tolist() == [1, 2, 3] -def test_single_append_cf(refs): +@pytest.mark.parametrize("mapper", [{}, {"time": "cf:time"}]) +@pytest.mark.parametrize("dtype", [{"time": "M8[s]"}, {}]) +def test_single_append_cf(refs, mapper, dtype): mzz = MultiZarrToZarr( [refs["cfstdtime1"], refs["cfstdtime2"]], remote_protocol="memory", concat_dims=["time"], + coo_map=mapper, + coo_dtypes=dtype, ) out = mzz.translate() mzz = MultiZarrToZarr.append( @@ -370,6 +374,8 @@ def test_single_append_cf(refs): out, remote_protocol="memory", concat_dims=["time"], + coo_map=mapper, + coo_dtypes=dtype, ) out = mzz.translate() z = xr.open_dataset( From 95d8171eb404e0a645e3b4012d463d5c296ed964 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:41:27 -0500 Subject: [PATCH 5/8] Cache mamba environment to speed up tests --- .github/workflows/tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 80c3286c..3d77c8c5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,6 +16,8 @@ jobs: uses: mamba-org/setup-micromamba@v1 with: environment-file: ci/environment-py${{matrix.python-version}}.yml + cache-downloads: false + cache-environment: true - name: Install kerchunk shell: bash -l {0} run: | From 1b2cf77e0a6a53779c285a539ab4c9b3cb01bf18 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 30 Jan 2024 16:57:47 -0500 Subject: [PATCH 6/8] later python --- .github/workflows/tests.yml | 2 +- ci/{environment-py38.yml => environment-py311.yml} | 2 +- 2 
files changed, 2 insertions(+), 2 deletions(-) rename ci/{environment-py38.yml => environment-py311.yml} (96%) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 80c3286c..63c117ff 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [38, 39, 310] + python-version: [39, 310, 311] steps: - uses: actions/checkout@v4 diff --git a/ci/environment-py38.yml b/ci/environment-py311.yml similarity index 96% rename from ci/environment-py38.yml rename to ci/environment-py311.yml index 3d276036..a2228952 100644 --- a/ci/environment-py38.yml +++ b/ci/environment-py311.yml @@ -3,7 +3,7 @@ channels: - conda-forge - defaults dependencies: - - python=3.8 + - python=3.11 - dask - zarr - xarray From 97fb0e982df07a04adb951b92e5f09125eef0893 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 2 Feb 2024 11:50:21 -0500 Subject: [PATCH 7/8] Don't generate run shell --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d54c6394..dd83bea5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -18,6 +18,7 @@ jobs: environment-file: ci/environment-py${{matrix.python-version}}.yml cache-downloads: false cache-environment: true + generate-run-shell: false - name: Install kerchunk shell: bash -l {0} run: | From 55496d862c74e76fd8edc038016f344738eacc3c Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 2 Feb 2024 13:15:36 -0500 Subject: [PATCH 8/8] Nodefaults --- ci/environment-py310.yml | 2 +- ci/environment-py311.yml | 2 +- ci/environment-py39.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/environment-py310.yml b/ci/environment-py310.yml index 6c760a37..021b150f 100644 --- a/ci/environment-py310.yml +++ b/ci/environment-py310.yml @@ 
-1,7 +1,7 @@ name: test_env channels: - conda-forge - - defaults + - nodefaults dependencies: - python=3.10 - dask diff --git a/ci/environment-py311.yml b/ci/environment-py311.yml index a2228952..d680ae71 100644 --- a/ci/environment-py311.yml +++ b/ci/environment-py311.yml @@ -1,7 +1,7 @@ name: test_env channels: - conda-forge - - defaults + - nodefaults dependencies: - python=3.11 - dask diff --git a/ci/environment-py39.yml b/ci/environment-py39.yml index f5f8f90d..e4ca09ad 100644 --- a/ci/environment-py39.yml +++ b/ci/environment-py39.yml @@ -1,7 +1,7 @@ name: test_env channels: - conda-forge - - defaults + - nodefaults dependencies: - python=3.9 - dask