From dbf76a9518d7b6820ff25c56589d35486dc17a3f Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Fri, 25 Mar 2022 08:09:06 +0100 Subject: [PATCH 01/31] add support for NCZarr --- xarray/backends/zarr.py | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index aca0b8064f5..875649ece03 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -178,19 +178,40 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): raise AssertionError("We should never get here. Function logic must be wrong.") +def _get_nczarr_dims(zarr_obj): + # NCZarr defines dimensions through metadata in .zarray + zarray_path = os.path.join(zarr_obj.path, ".zarray") + zarray = zarr.util.json_loads(zarr_obj._store[zarray_path]) + # NCZarr uses Fully Qualified Names + dimensions = [os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]] + return dimensions + + +def _hide_nczarr_attrs(attrs): + return HiddenKeyDict(attrs, [attr for attr in attrs if attr.startswith("_NC")]) + + def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): # Zarr arrays do not have dimensions. To get around this problem, we add # an attribute that specifies the dimension. We have to hide this attribute # when we send the attributes to the user. # zarr_obj can be either a zarr group or zarr array + attributes = _hide_nczarr_attrs(zarr_obj.attrs) try: + # Xarray-Zarr dimensions = zarr_obj.attrs[dimension_key] except KeyError: - raise KeyError( - f"Zarr object is missing the attribute `{dimension_key}`, which is " - "required for xarray to determine variable dimensions." - ) - attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key]) + try: + # NCZarr + dimensions = _get_nczarr_dims(zarr_obj) + attributes = dict(attributes) + attributes[dimension_key] = dimensions + except KeyError: + raise KeyError( + f"Zarr object is missing the attribute `{dimension_key}`, which is " + "required for xarray to determine variable dimensions." + ) + attributes = HiddenKeyDict(attributes, [dimension_key]) return dimensions, attributes @@ -374,6 +395,7 @@ def open_group( zarr_group = zarr.open_consolidated(store, **open_kwargs) else: zarr_group = zarr.open_group(store, **open_kwargs) + return cls( zarr_group, mode, @@ -430,7 +452,7 @@ def get_variables(self): ) def get_attrs(self): - return dict(self.zarr_group.attrs.asdict()) + return dict(_hide_nczarr_attrs(self.zarr_group.attrs.asdict())) def get_dimensions(self): dimensions = {} From 07a334f8236a0539fdb382df3c37b168cbb3a454 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Fri, 25 Mar 2022 08:12:57 +0100 Subject: [PATCH 02/31] restore original format --- xarray/backends/zarr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 875649ece03..fa6dacf1d25 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -395,7 +395,6 @@ def open_group( zarr_group = zarr.open_consolidated(store, **open_kwargs) else: zarr_group = zarr.open_group(store, **open_kwargs) - return cls( zarr_group, mode, From d1e9120ecd3f3a933bcd24a5b066659921a40314 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Fri, 25 Mar 2022 13:43:42 +0100 Subject: [PATCH 03/31] add test_nczarr --- xarray/tests/test_backends.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 825c6f7130f..222fb7c7ce8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5427,3 +5427,14 @@ def test_write_file_from_np_str(str_type, tmpdir) -> None: txr = tdf.to_xarray() txr.to_netcdf(tmpdir.join("test.nc")) + + +@requires_zarr +@requires_netCDF4 +def test_nczarr(): + # dim3 has dtype=' Date: Mon, 28 Mar 2022 09:07:36 +0200 Subject: [PATCH 04/31] better comment --- xarray/tests/test_backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 222fb7c7ce8..2ca2fb23d06 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5432,7 +5432,8 @@ def test_write_file_from_np_str(str_type, tmpdir) -> None: @requires_zarr @requires_netCDF4 def test_nczarr(): - # dim3 has dtype=' Date: Mon, 28 Mar 2022 17:47:12 +0200 Subject: [PATCH 05/31] test reading with zarr --- xarray/tests/test_backends.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2ca2fb23d06..1518b12e25c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5439,3 +5439,17 @@ def test_nczarr(): expected.to_netcdf(f"file://{tmp}#mode=nczarr") actual = xr.open_zarr(tmp, consolidated=False) assert_identical(expected, actual) + + +@requires_zarr +@requires_netCDF4 +def test_open_nczarr_with_zarr(): + # TODO: This is a debugging test to check whether + # zarr can't open nczarr on windows. + # REMOVE BEFORE MERGING + import zarr + + expected = create_test_data().drop_vars("dim3") + with create_tmp_file(suffix=".zarr") as tmp: + expected.to_netcdf(f"file://{tmp}#mode=nczarr") + zarr.open_group(tmp) From e35d79344b469c0b8749daab70145c63574641f9 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Mon, 28 Mar 2022 19:23:55 +0200 Subject: [PATCH 06/31] decode zarray --- xarray/backends/zarr.py | 2 +- xarray/tests/test_backends.py | 14 -------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index fa6dacf1d25..27fe9bbb52b 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -181,7 +181,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): def _get_nczarr_dims(zarr_obj): # NCZarr defines dimensions through metadata in .zarray zarray_path = os.path.join(zarr_obj.path, ".zarray") - zarray = zarr.util.json_loads(zarr_obj._store[zarray_path]) + zarray = zarr.util.json_loads(zarr_obj._store[zarray_path].decode()) # NCZarr uses Fully Qualified Names dimensions = [os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]] return dimensions diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1518b12e25c..2ca2fb23d06 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5439,17 +5439,3 @@ def test_nczarr(): expected.to_netcdf(f"file://{tmp}#mode=nczarr") actual = xr.open_zarr(tmp, consolidated=False) assert_identical(expected, actual) - - -@requires_zarr -@requires_netCDF4 -def test_open_nczarr_with_zarr(): - # TODO: This is a debugging test to check whether - # zarr can't open nczarr on windows. - # REMOVE BEFORE MERGING - import zarr - - expected = create_test_data().drop_vars("dim3") - with create_tmp_file(suffix=".zarr") as tmp: - expected.to_netcdf(f"file://{tmp}#mode=nczarr") - zarr.open_group(tmp) From eac9b3bf6d65f67bda075234779c63b3dcecf26e Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Mon, 28 Mar 2022 20:47:13 +0200 Subject: [PATCH 07/31] use public store and test nczarr only --- .github/workflows/ci-additional.yaml | 4 ++-- .github/workflows/ci.yaml | 2 +- .github/workflows/upstream-dev-ci.yaml | 2 +- xarray/backends/zarr.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 50c95cdebb7..f8eddf50a9c 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -103,7 +103,7 @@ jobs: python -c "import xarray" - name: Run tests run: | - python -m pytest -n 4 \ + python -m pytest -n 4 -k nczarr\ --cov=xarray \ --cov-report=xml \ $PYTEST_EXTRA_FLAGS @@ -150,7 +150,7 @@ jobs: python xarray/util/print_versions.py - name: Run doctests run: | - python -m pytest --doctest-modules xarray --ignore xarray/tests + python -m pytest -k nczarr --doctest-modules xarray --ignore xarray/tests min-version-policy: name: Minimum Version Policy diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4747b5ae20d..92456fb9a9c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -91,7 +91,7 @@ jobs: run: | python -c "import xarray" - name: Run tests - run: python -m pytest -n 4 + run: python -m pytest -k nczarr -n 4 --cov=xarray --cov-report=xml --junitxml=pytest.xml diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 6091306ed8b..7ccd1ee6284 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -83,7 +83,7 @@ jobs: id: status run: | set -euo pipefail - python -m pytest --timeout=60 -rf | tee output-${{ matrix.python-version }}-log || ( + python -m pytest -k nczarr --timeout=60 -rf | tee output-${{ matrix.python-version }}-log || ( echo '::set-output name=ARTIFACTS_AVAILABLE::true' && false ) - name: Upload artifacts diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 27fe9bbb52b..700357b1a80 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -181,7 +181,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): def _get_nczarr_dims(zarr_obj): # NCZarr defines dimensions through metadata in .zarray zarray_path = os.path.join(zarr_obj.path, ".zarray") - zarray = zarr.util.json_loads(zarr_obj._store[zarray_path].decode()) + zarray = zarr.util.json_loads(zarr_obj.store[zarray_path]) # NCZarr uses Fully Qualified Names dimensions = [os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]] return dimensions From 8af176cbd2a16e800c1d9df32f078291ae178790 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Mon, 28 Mar 2022 22:16:01 +0200 Subject: [PATCH 08/31] restore tests --- .github/workflows/ci-additional.yaml | 4 ++-- .github/workflows/ci.yaml | 2 +- .github/workflows/upstream-dev-ci.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index f8eddf50a9c..50c95cdebb7 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -103,7 +103,7 @@ jobs: python -c "import xarray" - name: Run tests run: | - python -m pytest -n 4 -k nczarr\ + python -m pytest -n 4 \ --cov=xarray \ --cov-report=xml \ $PYTEST_EXTRA_FLAGS @@ -150,7 +150,7 @@ jobs: python xarray/util/print_versions.py - name: Run doctests run: | - python -m pytest -k nczarr --doctest-modules xarray --ignore xarray/tests + python -m pytest --doctest-modules xarray --ignore xarray/tests min-version-policy: name: Minimum Version Policy diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 92456fb9a9c..4747b5ae20d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -91,7 +91,7 @@ jobs: run: | python -c "import xarray" - name: Run tests - run: python -m pytest -k nczarr -n 4 + run: python -m pytest -n 4 --cov=xarray --cov-report=xml --junitxml=pytest.xml diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 7ccd1ee6284..6091306ed8b 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -83,7 +83,7 @@ jobs: id: status run: | set -euo pipefail - python -m pytest -k nczarr --timeout=60 -rf | tee output-${{ matrix.python-version }}-log || ( + python -m pytest --timeout=60 -rf | tee output-${{ matrix.python-version }}-log || ( echo '::set-output name=ARTIFACTS_AVAILABLE::true' && false ) - name: Upload artifacts From ac32ac81bd567dba7760018c7804cd7c9ea40b4d Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 09:04:54 +0200 Subject: [PATCH 09/31] install netcdf-c fixing bug --- .github/workflows/ci.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4747b5ae20d..54167dfdf64 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -78,6 +78,23 @@ jobs: run: | mamba env update -f $CONDA_ENV_FILE + - name: Install netcdf-c fixing bug + run: | + sudo apt-get update + sudo apt-get install mpich libmpich-dev libhdf5-mpich-dev libcurl4-openssl-dev + echo "Download and build netCDF github master" + git clone https://github.com/Unidata/netcdf-c + sed -i 's/\"Nan\"/\"NaN\"/g' netcdf-c/libnczarr/zcvt.c + pushd netcdf-c + export CPPFLAGS="-I/usr/include/hdf5/mpich -I${NETCDF_DIR}/include" + export LDFLAGS="-L${NETCDF_DIR}/lib" + export LIBS="-lhdf5_mpich_hl -lhdf5_mpich -lm -lz" + autoreconf -i + ./configure --prefix $NETCDF_DIR --enable-netcdf-4 --enable-shared --enable-dap --enable-parallel4 + make -j 2 + make install + popd + - name: Install xarray run: | python -m pip install --no-deps -e . From 44ef220c57295a0ef019b1cbbc27910b0040de5a Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 09:14:16 +0200 Subject: [PATCH 10/31] add env --- .github/workflows/ci.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 54167dfdf64..99d3b5fe87d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -39,6 +39,9 @@ jobs: shell: bash -l {0} strategy: fail-fast: false + env: + NETCDF_DIR: ${{ github.workspace }}/.. + CC: mpicc.mpich matrix: os: ["ubuntu-latest", "macos-latest", "windows-latest"] # Bookend python versions From b72e1d4fcfb4c2ba23d33b117e0e463e2e7d698b Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 09:24:41 +0200 Subject: [PATCH 11/31] fix ci --- .github/workflows/ci.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 99d3b5fe87d..042ae471970 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -32,6 +32,9 @@ jobs: test: name: ${{ matrix.os }} py${{ matrix.python-version }} runs-on: ${{ matrix.os }} + env: + NETCDF_DIR: ${{ github.workspace }}/.. + CC: mpicc.mpich needs: detect-ci-trigger if: needs.detect-ci-trigger.outputs.triggered == 'false' defaults: @@ -39,9 +42,6 @@ jobs: shell: bash -l {0} strategy: fail-fast: false - env: - NETCDF_DIR: ${{ github.workspace }}/.. - CC: mpicc.mpich matrix: os: ["ubuntu-latest", "macos-latest", "windows-latest"] # Bookend python versions @@ -87,7 +87,7 @@ jobs: sudo apt-get install mpich libmpich-dev libhdf5-mpich-dev libcurl4-openssl-dev echo "Download and build netCDF github master" git clone https://github.com/Unidata/netcdf-c - sed -i 's/\"Nan\"/\"NaN\"/g' netcdf-c/libnczarr/zcvt.c + sed -i 's/\"Nan\"/\"NaN\"/g' netcdf-c/libnczarr/zcvt.c pushd netcdf-c export CPPFLAGS="-I/usr/include/hdf5/mpich -I${NETCDF_DIR}/include" export LDFLAGS="-L${NETCDF_DIR}/lib" From fd84283e65da671b02db4290e5800f5d36be3c3f Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 10:21:23 +0200 Subject: [PATCH 12/31] try build netcdf-c on windows --- .github/workflows/ci.yaml | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 042ae471970..f20b0534439 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -32,14 +32,14 @@ jobs: test: name: ${{ matrix.os }} py${{ matrix.python-version }} runs-on: ${{ matrix.os }} - env: - NETCDF_DIR: ${{ github.workspace }}/.. - CC: mpicc.mpich needs: detect-ci-trigger if: needs.detect-ci-trigger.outputs.triggered == 'false' defaults: run: shell: bash -l {0} + env: + NETCDF_DIR: ${{ github.workspace }}/.. + CC: mpicc.mpich strategy: fail-fast: false matrix: @@ -50,6 +50,12 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 # Fetch all history for all branches and tags. + - uses: msys2/setup-msys2@v2 + if: matrix.os == 'windows-latest' + with: + msystem: MINGW64 + update: true + install: git mingw-w64-x86_64-toolchain automake libtool autoconf make cmake mingw-w64-x86_64-hdf5 unzip - name: Set environment variables run: | if [[ ${{ matrix.os }} == windows* ]] ; @@ -80,20 +86,17 @@ jobs: - name: Install conda dependencies run: | mamba env update -f $CONDA_ENV_FILE - - - name: Install netcdf-c fixing bug + + - name: Install netcdf-c + if: matrix.os == 'windows-latest' run: | - sudo apt-get update - sudo apt-get install mpich libmpich-dev libhdf5-mpich-dev libcurl4-openssl-dev - echo "Download and build netCDF github master" - git clone https://github.com/Unidata/netcdf-c - sed -i 's/\"Nan\"/\"NaN\"/g' netcdf-c/libnczarr/zcvt.c + git clone https://github.com/bopen/netcdf-ci -b fix_nan pushd netcdf-c export CPPFLAGS="-I/usr/include/hdf5/mpich -I${NETCDF_DIR}/include" export LDFLAGS="-L${NETCDF_DIR}/lib" export LIBS="-lhdf5_mpich_hl -lhdf5_mpich -lm -lz" autoreconf -i - ./configure --prefix $NETCDF_DIR --enable-netcdf-4 --enable-shared --enable-dap --enable-parallel4 + ./configure --prefix $NETCDF_DIR --enable-netcdf-4 --enable-shared --enable-dap --enable-parallel4 make -j 2 make install popd From 3355eb7a3195a7d796f426624a0247a8648f4704 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 10:41:54 +0200 Subject: [PATCH 13/31] fix typo --- .github/workflows/ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f20b0534439..30b9a0704ef 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -43,7 +43,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest", "macos-latest", "windows-latest"] + os: ["windows-latest"] # Bookend python versions python-version: ["3.8", "3.9", "3.10"] steps: @@ -90,7 +90,7 @@ jobs: - name: Install netcdf-c if: matrix.os == 'windows-latest' run: | - git clone https://github.com/bopen/netcdf-ci -b fix_nan + git clone https://github.com/bopen/netcdf-c -b fix_nan pushd netcdf-c export CPPFLAGS="-I/usr/include/hdf5/mpich -I${NETCDF_DIR}/include" export LDFLAGS="-L${NETCDF_DIR}/lib" From 9af44017a07bd7c83475cdecd49064ba4793d8b4 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 11:22:15 +0200 Subject: [PATCH 14/31] install netcdf-c first --- .github/workflows/ci.yaml | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 30b9a0704ef..e2186f39ff3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -56,6 +56,15 @@ jobs: msystem: MINGW64 update: true install: git mingw-w64-x86_64-toolchain automake libtool autoconf make cmake mingw-w64-x86_64-hdf5 unzip + - name: Install netcdf-c + if: matrix.os == 'windows-latest' + run: | + git clone https://github.com/bopen/netcdf-c -b fix_nan + pushd netcdf-c + autoreconf -if + ./configure --enable-hdf5 --enable-dap --disable-dap-remote-tests --disable-static --disable-plugins --disable-byterange --disable-dap-remote-tests --disable-logging + make -j 8 LDFLAGS="-no-undefined -Wl,--export-all-symbols" + popd - name: Set environment variables run: | if [[ ${{ matrix.os }} == windows* ]] ; @@ -87,20 +96,6 @@ jobs: run: | mamba env update -f $CONDA_ENV_FILE - - name: Install netcdf-c - if: matrix.os == 'windows-latest' - run: | - git clone https://github.com/bopen/netcdf-c -b fix_nan - pushd netcdf-c - export CPPFLAGS="-I/usr/include/hdf5/mpich -I${NETCDF_DIR}/include" - export LDFLAGS="-L${NETCDF_DIR}/lib" - export LIBS="-lhdf5_mpich_hl -lhdf5_mpich -lm -lz" - autoreconf -i - ./configure --prefix $NETCDF_DIR --enable-netcdf-4 --enable-shared --enable-dap --enable-parallel4 - make -j 2 - make install - popd - - name: Install xarray run: | python -m pip install --no-deps -e . From 12ef99158a752bcc3afbf78bcf91df43b54d5048 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 12:04:53 +0200 Subject: [PATCH 15/31] install netcdf-c dep with conda --- .github/workflows/ci.yaml | 40 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e2186f39ff3..4ee48ad310c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -33,38 +33,23 @@ jobs: name: ${{ matrix.os }} py${{ matrix.python-version }} runs-on: ${{ matrix.os }} needs: detect-ci-trigger + env: + NETCDF_DIR: ${{ github.workspace }}/.. + CC: mpicc.mpich if: needs.detect-ci-trigger.outputs.triggered == 'false' defaults: run: shell: bash -l {0} - env: - NETCDF_DIR: ${{ github.workspace }}/.. - CC: mpicc.mpich strategy: fail-fast: false matrix: os: ["windows-latest"] # Bookend python versions - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.8"] steps: - uses: actions/checkout@v3 with: fetch-depth: 0 # Fetch all history for all branches and tags. - - uses: msys2/setup-msys2@v2 - if: matrix.os == 'windows-latest' - with: - msystem: MINGW64 - update: true - install: git mingw-w64-x86_64-toolchain automake libtool autoconf make cmake mingw-w64-x86_64-hdf5 unzip - - name: Install netcdf-c - if: matrix.os == 'windows-latest' - run: | - git clone https://github.com/bopen/netcdf-c -b fix_nan - pushd netcdf-c - autoreconf -if - ./configure --enable-hdf5 --enable-dap --disable-dap-remote-tests --disable-static --disable-plugins --disable-byterange --disable-dap-remote-tests --disable-logging - make -j 8 LDFLAGS="-no-undefined -Wl,--export-all-symbols" - popd - name: Set environment variables run: | if [[ ${{ matrix.os }} == windows* ]] ; @@ -95,7 +80,22 @@ jobs: - name: Install conda dependencies run: | mamba env update -f $CONDA_ENV_FILE - + + - name: Install netcdf-c + if: matrix.os == "windows-latest" + run: | + mamba install -c conda-forge autoconf mpich --yes + git clone https://github.com/bopen/netcdf-c -b fix_nan + pushd netcdf-c + export CPPFLAGS="-I/usr/include/hdf5/mpich -I${NETCDF_DIR}/include" + export LDFLAGS="-L${NETCDF_DIR}/lib" + export LIBS="-lhdf5_mpich_hl -lhdf5_mpich -lm -lz" + autoreconf -i + ./configure --prefix $NETCDF_DIR --enable-netcdf-4 --enable-shared --enable-dap --enable-parallel4 + make -j 2 + make install + popd + - name: Install xarray run: | python -m pip install --no-deps -e . From 71eca465aa44ed534ede3dffc6fab5c490aa732d Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 12:12:45 +0200 Subject: [PATCH 16/31] fix ci --- .github/workflows/ci.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4ee48ad310c..9bbfe13a47c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -33,13 +33,13 @@ jobs: name: ${{ matrix.os }} py${{ matrix.python-version }} runs-on: ${{ matrix.os }} needs: detect-ci-trigger - env: - NETCDF_DIR: ${{ github.workspace }}/.. - CC: mpicc.mpich if: needs.detect-ci-trigger.outputs.triggered == 'false' defaults: run: shell: bash -l {0} + env: + NETCDF_DIR: ${{ github.workspace }}/.. + CC: mpicc.mpich strategy: fail-fast: false matrix: @@ -82,7 +82,7 @@ jobs: mamba env update -f $CONDA_ENV_FILE - name: Install netcdf-c - if: matrix.os == "windows-latest" + if: matrix.os == 'windows-latest' run: | mamba install -c conda-forge autoconf mpich --yes git clone https://github.com/bopen/netcdf-c -b fix_nan From d3e91821b0d613faa4651086a3c862845de9dd61 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 12:37:57 +0200 Subject: [PATCH 17/31] try win env again --- .github/workflows/ci.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9bbfe13a47c..2d318dcf55d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -22,6 +22,12 @@ jobs: outputs: triggered: ${{ steps.detect-trigger.outputs.trigger-found }} steps: + - uses: msys2/setup-msys2@v2 + if: matrix.os == 'windows-latest' + with: + msystem: MINGW64 + update: true + install: git mingw-w64-x86_64-toolchain automake libtool autoconf make cmake mingw-w64-x86_64-hdf5 unzip - uses: actions/checkout@v3 with: fetch-depth: 2 @@ -84,7 +90,6 @@ jobs: - name: Install netcdf-c if: matrix.os == 'windows-latest' run: | - mamba install -c conda-forge autoconf mpich --yes git clone https://github.com/bopen/netcdf-c -b fix_nan pushd netcdf-c export CPPFLAGS="-I/usr/include/hdf5/mpich -I${NETCDF_DIR}/include" From 316153bdca5474f9253c1490a4333a61ef3272ba Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 13:26:07 +0200 Subject: [PATCH 18/31] fix Nan in tests --- .github/workflows/ci.yaml | 27 ++------------------------- xarray/tests/test_backends.py | 9 +++++++++ 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2d318dcf55d..4747b5ae20d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -22,12 +22,6 @@ jobs: outputs: triggered: ${{ steps.detect-trigger.outputs.trigger-found }} steps: - - uses: msys2/setup-msys2@v2 - if: matrix.os == 'windows-latest' - with: - msystem: MINGW64 - update: true - install: git mingw-w64-x86_64-toolchain automake libtool autoconf make cmake mingw-w64-x86_64-hdf5 unzip - uses: actions/checkout@v3 with: fetch-depth: 2 @@ -43,15 +37,12 @@ jobs: defaults: run: shell: bash -l {0} - env: - NETCDF_DIR: ${{ github.workspace }}/.. - CC: mpicc.mpich strategy: fail-fast: false matrix: - os: ["windows-latest"] + os: ["ubuntu-latest", "macos-latest", "windows-latest"] # Bookend python versions - python-version: ["3.8"] + python-version: ["3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v3 with: @@ -87,20 +78,6 @@ jobs: run: | mamba env update -f $CONDA_ENV_FILE - - name: Install netcdf-c - if: matrix.os == 'windows-latest' - run: | - git clone https://github.com/bopen/netcdf-c -b fix_nan - pushd netcdf-c - export CPPFLAGS="-I/usr/include/hdf5/mpich -I${NETCDF_DIR}/include" - export LDFLAGS="-L${NETCDF_DIR}/lib" - export LIBS="-lhdf5_mpich_hl -lhdf5_mpich -lm -lz" - autoreconf -i - ./configure --prefix $NETCDF_DIR --enable-netcdf-4 --enable-shared --enable-dap --enable-parallel4 - make -j 2 - make install - popd - - name: Install xarray run: | python -m pip install --no-deps -e . diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2ca2fb23d06..9513adb10cb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4,6 +4,7 @@ import math import os.path import pickle +import platform import re import shutil import sys @@ -5437,5 +5438,13 @@ def test_nczarr(): expected = create_test_data().drop_vars("dim3") with create_tmp_file(suffix=".zarr") as tmp: expected.to_netcdf(f"file://{tmp}#mode=nczarr") + if platform.system() == "Windows": + # Bug in netcdf-c: https://github.com/Unidata/netcdf-c/issues/2265 + for var in expected.data_vars: + zattrs_path = os.path.join(tmp, var, ".zattrs") + with open(zattrs_path) as f: + zattrs = f.read() + with open(zattrs_path, "w") as f: + f.write(zattrs.replace("Nan", "NaN")) actual = xr.open_zarr(tmp, consolidated=False) assert_identical(expected, actual) From 978f7535498d3f8b1a6fd70d97db9a45ad68fe07 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 13:52:46 +0200 Subject: [PATCH 19/31] edit zarray --- xarray/tests/test_backends.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9513adb10cb..4b768266b80 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5441,10 +5441,10 @@ def test_nczarr(): if platform.system() == "Windows": # Bug in netcdf-c: https://github.com/Unidata/netcdf-c/issues/2265 for var in expected.data_vars: - zattrs_path = os.path.join(tmp, var, ".zattrs") - with open(zattrs_path) as f: - zattrs = f.read() - with open(zattrs_path, "w") as f: - f.write(zattrs.replace("Nan", "NaN")) + zarray_path = os.path.join(tmp, var, ".zarray") + with open(zarray_path) as f: + zarray = f.read() + with open(zarray_path, "w") as f: + f.write(zarray.replace("Nan", "NaN")) actual = xr.open_zarr(tmp, consolidated=False) assert_identical(expected, actual) From 5be903bcc94bedf5af1df284d1b9d63891b6e83c Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 14:24:26 +0200 Subject: [PATCH 20/31] loop over all variables --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4b768266b80..158a7a72bdf 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5440,7 +5440,7 @@ def test_nczarr(): expected.to_netcdf(f"file://{tmp}#mode=nczarr") if platform.system() == "Windows": # Bug in netcdf-c: https://github.com/Unidata/netcdf-c/issues/2265 - for var in expected.data_vars: + for var in expected.variables: zarray_path = os.path.join(tmp, var, ".zarray") with open(zarray_path) as f: zarray = f.read() From f520e7fa21b522f5ac3f8581e5d6dbc3b3d608b1 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 15:00:51 +0200 Subject: [PATCH 21/31] edit Nan in zattrs and zarray --- xarray/tests/test_backends.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 158a7a72bdf..69d787304b4 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5441,10 +5441,11 @@ def test_nczarr(): if platform.system() == "Windows": # Bug in netcdf-c: https://github.com/Unidata/netcdf-c/issues/2265 for var in expected.variables: - zarray_path = os.path.join(tmp, var, ".zarray") - with open(zarray_path) as f: - zarray = f.read() - with open(zarray_path, "w") as f: - f.write(zarray.replace("Nan", "NaN")) + for zfile in (".zarray", ".zattrs"): + zfile_path = os.path.join(tmp, var, zfile) + with open(zfile_path) as f: + zread = f.read() + with open(zfile_path, "w") as f: + f.write(zread.replace("Nan", "NaN")) actual = xr.open_zarr(tmp, consolidated=False) assert_identical(expected, actual) From b5609a1cd3351574b2d24b2801e5c40ef032aa3d Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 15:35:18 +0200 Subject: [PATCH 22/31] check path exists --- xarray/tests/test_backends.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 69d787304b4..b4f187741cb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5443,6 +5443,8 @@ def test_nczarr(): for var in expected.variables: for zfile in (".zarray", ".zattrs"): zfile_path = os.path.join(tmp, var, zfile) + if not os.path.exists(zfile_path): + continue with open(zfile_path) as f: zread = f.read() with open(zfile_path, "w") as f: From 3a22ac8d8e2c72f359501198d0f6c838749cee5b Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 16:43:58 +0200 Subject: [PATCH 23/31] must use netcdf-c>=4.8.1 --- xarray/tests/test_backends.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b4f187741cb..fe024fcd683 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1447,7 +1447,7 @@ def test_encoding_chunksizes_unlimited(self): "complevel": 0, "fletcher32": False, "contiguous": False, - "chunksizes": (2**20,), + "chunksizes": (2 ** 20,), "original_shape": (3,), } with self.roundtrip(ds) as actual: @@ -5433,21 +5433,22 @@ def test_write_file_from_np_str(str_type, tmpdir) -> None: @requires_zarr @requires_netCDF4 def test_nczarr(): + netcdfc_version = Version(nc4.getlibversion().split()[0]) + if netcdfc_version < Version("4.8.1"): + pytest.skip("requires netcdf-c>=4.8.1") + + expected = create_test_data() # Drop dim3: netcdf-c does not support dtype='4.8.1 will add _ARRAY_DIMENSIONS by default + mode = "nczarr" if netcdfc_version == Version("4.8.1") else "nczarr,noxarray" + expected.to_netcdf(f"file://{tmp}#mode={mode}") actual = xr.open_zarr(tmp, consolidated=False) assert_identical(expected, actual) From 7f19413bf6db87f6248c3a3e9431b6ffe37c705f Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Wed, 30 Mar 2022 17:28:30 +0200 Subject: [PATCH 24/31] skip 4.8.1 and Windows --- xarray/tests/test_backends.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index fe024fcd683..7061926fd98 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1447,7 +1447,7 @@ def test_encoding_chunksizes_unlimited(self): "complevel": 0, "fletcher32": False, "contiguous": False, - "chunksizes": (2 ** 20,), + "chunksizes": (2**20,), "original_shape": (3,), } with self.roundtrip(ds) as actual: @@ -5436,16 +5436,15 @@ def test_nczarr(): netcdfc_version = Version(nc4.getlibversion().split()[0]) if netcdfc_version < Version("4.8.1"): pytest.skip("requires netcdf-c>=4.8.1") + if (platform.system() == "Windows") and (netcdfc_version == Version("4.8.1")): + # Bug in netcdf-c==4.8.1 (typo: Nan instead of NaN) + # https://github.com/Unidata/netcdf-c/issues/2265 + pytest.skip("netcdf-c==4.8.1 has issues on Windows") expected = create_test_data() # Drop dim3: netcdf-c does not support dtype='4.8.1 will add _ARRAY_DIMENSIONS by default mode = "nczarr" if netcdfc_version == Version("4.8.1") else "nczarr,noxarray" From b5704bdbc9e391d10815c88bb5baf4332c0b49ed Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Thu, 7 Apr 2022 08:38:33 +0200 Subject: [PATCH 25/31] revisions --- xarray/backends/zarr.py | 78 ++++++++++++++++++----------------- xarray/tests/test_backends.py | 15 +++++++ 2 files changed, 55 insertions(+), 38 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 700357b1a80..d95c8497340 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,3 +1,4 @@ +import json import os import warnings @@ -178,40 +179,39 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): raise AssertionError("We should never get here. Function logic must be wrong.") -def _get_nczarr_dims(zarr_obj): - # NCZarr defines dimensions through metadata in .zarray - zarray_path = os.path.join(zarr_obj.path, ".zarray") - zarray = zarr.util.json_loads(zarr_obj.store[zarray_path]) - # NCZarr uses Fully Qualified Names - dimensions = [os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]] - return dimensions - - -def _hide_nczarr_attrs(attrs): - return HiddenKeyDict(attrs, [attr for attr in attrs if attr.startswith("_NC")]) - - -def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): +def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): # Zarr arrays do not have dimensions. To get around this problem, we add # an attribute that specifies the dimension. We have to hide this attribute # when we send the attributes to the user. # zarr_obj can be either a zarr group or zarr array - attributes = _hide_nczarr_attrs(zarr_obj.attrs) try: # Xarray-Zarr dimensions = zarr_obj.attrs[dimension_key] except KeyError: - try: - # NCZarr - dimensions = _get_nczarr_dims(zarr_obj) - attributes = dict(attributes) - attributes[dimension_key] = dimensions - except KeyError: + if not try_nczarr: raise KeyError( f"Zarr object is missing the attribute `{dimension_key}`, which is " "required for xarray to determine variable dimensions." ) - attributes = HiddenKeyDict(attributes, [dimension_key]) + + # NCZarr defines dimensions through metadata in .zarray + zarray_path = os.path.join(zarr_obj.path, ".zarray") + zarray = json.loads(zarr_obj.store[zarray_path]) + try: + # NCZarr uses Fully Qualified Names + dimensions = [ + os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"] + ] + except KeyError: + raise KeyError( + f"Zarr object is missing the attribute `{dimension_key}` and the NCZarr metadata, " + "which are required for xarray to determine variable dimensions." + ) + + attrs_to_hide = [dimension_key] + if try_nczarr: + attrs_to_hide += [attr for attr in zarr_obj.attrs if attr.startswith("_NC")] + attributes = HiddenKeyDict(zarr_obj.attrs, attrs_to_hide) return dimensions, attributes @@ -430,7 +430,10 @@ def ds(self): def open_store_variable(self, name, zarr_array): data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) - dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, DIMENSION_KEY) + try_nczarr = self._mode not in ["a", "r+"] + dimensions, attributes = _get_zarr_dims_and_attrs( + zarr_array, DIMENSION_KEY, try_nczarr + ) attributes = dict(attributes) encoding = { "chunks": zarr_array.chunks, @@ -451,26 +454,25 @@ def get_variables(self): ) def get_attrs(self): - return dict(_hide_nczarr_attrs(self.zarr_group.attrs.asdict())) + return { + k: v + for k, v in self.zarr_group.attrs.asdict().items() + if not k.startswith("_NC") + } def get_dimensions(self): + try_nczarr = self._mode not in ["a", "r+"] dimensions = {} for k, v in self.zarr_group.arrays(): - try: - for d, s in zip(v.attrs[DIMENSION_KEY], v.shape): - if d in dimensions and dimensions[d] != s: - raise ValueError( - f"found conflicting lengths for dimension {d} " - f"({s} != {dimensions[d]})" - ) - dimensions[d] = s + dim_names, _ = _get_zarr_dims_and_attrs(v, DIMENSION_KEY, try_nczarr) + for d, s in zip(dim_names, v.shape): + if d in dimensions and dimensions[d] != s: + raise ValueError( + f"found conflicting lengths for dimension {d} " + f"({s} != {dimensions[d]})" + ) + dimensions[d] = s - except KeyError: - raise KeyError( - f"Zarr object is missing the attribute `{DIMENSION_KEY}`, " - "which is required for xarray to determine " - "variable dimensions." - ) return dimensions def set_dimensions(self, variables, unlimited_dims=None): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7061926fd98..b261f1010e6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5451,3 +5451,18 @@ def test_nczarr(): expected.to_netcdf(f"file://{tmp}#mode={mode}") actual = xr.open_zarr(tmp, consolidated=False) assert_identical(expected, actual) + + # Do not allow appending or writing to existing NCZarr + with pytest.raises( + KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`," + ): + expected.to_zarr(tmp, append_dim="dim3") + with pytest.raises( + KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`," + ): + expected.to_zarr(tmp, mode="r+") + + # Allow overwriting existing NCZarr + expected.to_zarr(tmp, mode="w") + actual = xr.open_zarr(tmp, consolidated=False) + assert_identical(expected, actual) From 2c12935dbc4a137b5a6859c91189a3a6de7bd8dd Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Thu, 7 Apr 2022 10:22:31 +0200 Subject: [PATCH 26/31] better testing --- xarray/backends/zarr.py | 1 - xarray/tests/test_backends.py | 70 ++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index d95c8497340..64ec703982a 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -472,7 +472,6 @@ def get_dimensions(self): f"({s} != {dimensions[d]})" ) dimensions[d] = s - return dimensions def set_dimensions(self, variables, unlimited_dims=None): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b261f1010e6..a4967959a75 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5432,37 +5432,47 @@ def test_write_file_from_np_str(str_type, tmpdir) -> None: @requires_zarr @requires_netCDF4 -def test_nczarr(): - netcdfc_version = Version(nc4.getlibversion().split()[0]) - if netcdfc_version < Version("4.8.1"): - pytest.skip("requires netcdf-c>=4.8.1") - if (platform.system() == "Windows") and (netcdfc_version == Version("4.8.1")): - # Bug in netcdf-c==4.8.1 (typo: Nan instead of NaN) - # https://github.com/Unidata/netcdf-c/issues/2265 - pytest.skip("netcdf-c==4.8.1 has issues on Windows") - - expected = create_test_data() - # Drop dim3: netcdf-c does not support dtype='=4.8.1") + if (platform.system() == "Windows") and (netcdfc_version == Version("4.8.1")): + # Bug in netcdf-c==4.8.1 (typo: Nan instead of NaN) + # https://github.com/Unidata/netcdf-c/issues/2265 + pytest.skip("netcdf-c==4.8.1 has issues on Windows") + + ds = create_test_data() + # Drop dim3: netcdf-c does not support dtype='4.8.1 will add _ARRAY_DIMENSIONS by default mode = "nczarr" if netcdfc_version == Version("4.8.1") else "nczarr,noxarray" - expected.to_netcdf(f"file://{tmp}#mode={mode}") - actual = xr.open_zarr(tmp, consolidated=False) - assert_identical(expected, actual) + ds.to_netcdf(f"file://{filename}#mode={mode}") + return ds - # Do not allow appending or writing to existing NCZarr - with pytest.raises( - KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`," - ): - expected.to_zarr(tmp, append_dim="dim3") - with pytest.raises( - KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`," - ): - expected.to_zarr(tmp, mode="r+") + def test_open_nczarr(self): + with create_tmp_file(suffix=".zarr") as tmp: + expected = self._create_nczarr(tmp) + actual = xr.open_zarr(tmp, consolidated=False) + assert_identical(expected, actual) - # Allow overwriting existing NCZarr - expected.to_zarr(tmp, mode="w") - actual = xr.open_zarr(tmp, consolidated=False) - assert_identical(expected, actual) + def test_overwriting_nczarr(self): + with create_tmp_file(suffix=".zarr") as tmp: + ds = self._create_nczarr(tmp) + expected = ds[["var1"]] + expected.to_zarr(tmp, mode="w") + actual = xr.open_zarr(tmp, consolidated=False) + assert_identical(expected, actual) + + @pytest.mark.parametrize("mode", ["a", "r+"]) + @pytest.mark.filterwarnings("ignore:.*non-consolidated metadata.*") + def test_raise_writing_to_nczarr(self, mode): + with create_tmp_file(suffix=".zarr") as tmp: + ds = self._create_nczarr(tmp) + with pytest.raises( + KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`," + ): + ds.to_zarr(tmp, mode=mode) From 286d72c1e81a85fdeabe74dc8dcc20e35682cb4e Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Sat, 9 Apr 2022 11:23:50 +0200 Subject: [PATCH 27/31] revisions --- xarray/backends/zarr.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 64ec703982a..78a755ce6d6 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -187,12 +187,12 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): try: # Xarray-Zarr dimensions = zarr_obj.attrs[dimension_key] - except KeyError: + except KeyError as e: if not try_nczarr: raise KeyError( f"Zarr object is missing the attribute `{dimension_key}`, which is " "required for xarray to determine variable dimensions." - ) + ) from e # NCZarr defines dimensions through metadata in .zarray zarray_path = os.path.join(zarr_obj.path, ".zarray") @@ -202,16 +202,14 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): dimensions = [ os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"] ] - except KeyError: + except KeyError as e: raise KeyError( f"Zarr object is missing the attribute `{dimension_key}` and the NCZarr metadata, " "which are required for xarray to determine variable dimensions." - ) + ) from e - attrs_to_hide = [dimension_key] - if try_nczarr: - attrs_to_hide += [attr for attr in zarr_obj.attrs if attr.startswith("_NC")] - attributes = HiddenKeyDict(zarr_obj.attrs, attrs_to_hide) + nc_attrs = [attr for attr in zarr_obj.attrs if attr.startswith("_NC")] + attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key] + nc_attrs) return dimensions, attributes @@ -430,7 +428,7 @@ def ds(self): def open_store_variable(self, name, zarr_array): data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) - try_nczarr = self._mode not in ["a", "r+"] + try_nczarr = self._mode == "r" dimensions, attributes = _get_zarr_dims_and_attrs( zarr_array, DIMENSION_KEY, try_nczarr ) @@ -461,7 +459,7 @@ def get_attrs(self): } def get_dimensions(self): - try_nczarr = self._mode not in ["a", "r+"] + try_nczarr = self._mode == "r" dimensions = {} for k, v in self.zarr_group.arrays(): dim_names, _ = _get_zarr_dims_and_attrs(v, DIMENSION_KEY, try_nczarr) From b8236758c1d2d13f3fe5711d5a4b3be2e73299b7 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Sat, 9 Apr 2022 20:48:03 +0200 Subject: [PATCH 28/31] add what's new --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 74120f068e5..98b3147b313 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,8 @@ v2022.03.1 (unreleased) New Features ~~~~~~~~~~~~ +- The `zarr` backend is now able to read NCZarr. + By `Mattia Almansi `_. - Add a weighted ``quantile`` method to :py:class:`~core.weighted.DatasetWeighted` and :py:class:`~core.weighted.DataArrayWeighted` (:pull:`6059`). By `Christian Jauvin `_ and `David Huard `_. From b6cfad3c83a4e93b6c03446d6f21eb921e3e8ee5 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Sun, 10 Apr 2022 10:46:32 +0200 Subject: [PATCH 29/31] update docs --- doc/internals/zarr-encoding-spec.rst | 6 ++++-- doc/user-guide/io.rst | 8 +++++++- xarray/backends/zarr.py | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/doc/internals/zarr-encoding-spec.rst b/doc/internals/zarr-encoding-spec.rst index f8bffa6e82f..7f468b8b0db 100644 --- a/doc/internals/zarr-encoding-spec.rst +++ b/doc/internals/zarr-encoding-spec.rst @@ -32,9 +32,11 @@ the variable dimension names and then removed from the attributes dictionary returned to the user. Because of these choices, Xarray cannot read arbitrary array data, but only -Zarr data with valid ``_ARRAY_DIMENSIONS`` attributes on each array. +Zarr data with valid ``_ARRAY_DIMENSIONS`` or +`NCZarr `_ attributes +on each array (NCZarr dimension names are defined in the ``.zarray`` file). -After decoding the ``_ARRAY_DIMENSIONS`` attribute and assigning the variable +After decoding the ``_ARRAY_DIMENSIONS`` or NCZarr attribute and assigning the variable dimensions, Xarray proceeds to [optionally] decode each variable using its standard CF decoding machinery used for NetCDF data (see :py:func:`decode_cf`). diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index ddde0bf5888..7e906771d31 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -519,7 +519,7 @@ the ability to store and analyze datasets far too large fit onto disk Xarray can't open just any zarr dataset, because xarray requires special metadata (attributes) describing the dataset dimensions and coordinates. At this time, xarray can only open zarr datasets that have been written by -xarray. For implementation details, see :ref:`zarr_encoding`. +xarray or `NCZarr`_. For implementation details, see :ref:`zarr_encoding`. To write a dataset with zarr, we use the :py:meth:`Dataset.to_zarr` method. @@ -548,6 +548,11 @@ store is already present at that path, an error will be raised, preventing it from being overwritten. To override this behavior and overwrite an existing store, add ``mode='w'`` when invoking :py:meth:`~Dataset.to_zarr`. +.. note:: + + xarray does not write NCZarr attributes. Therefore, NCZarr data must be + opened in read-only mode. + To store variable length strings, convert them to object arrays first with ``dtype=object``. @@ -606,6 +611,7 @@ instance and pass this, as follows: .. _Amazon S3: https://aws.amazon.com/s3/ .. _Google Cloud Storage: https://cloud.google.com/storage/ .. _gcsfs: https://github.com/fsspec/gcsfs +.. _NCZarr: https://docs.unidata.ucar.edu/nug/current/nczarr_head.html Zarr Compressors and Filters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 78a755ce6d6..104f8aca58f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -665,7 +665,7 @@ def open_zarr( The `store` object should be a valid store for a Zarr group. `store` variables must contain dimension metadata encoded in the - `_ARRAY_DIMENSIONS` attribute. + `_ARRAY_DIMENSIONS` attribute or must have NCZarr format. Parameters ---------- From eb92cdecd752cc0fea8de2eefa59db0c65735cf8 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Mon, 11 Apr 2022 09:03:47 +0200 Subject: [PATCH 30/31] [skip ci] Mention netCDF and GDAL in user-guide --- doc/user-guide/io.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 7e906771d31..c80acfa79d9 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -518,8 +518,11 @@ the ability to store and analyze datasets far too large fit onto disk Xarray can't open just any zarr dataset, because xarray requires special metadata (attributes) describing the dataset dimensions and coordinates. -At this time, xarray can only open zarr datasets that have been written by -xarray or `NCZarr`_. For implementation details, see :ref:`zarr_encoding`. +At this time, xarray can only open zarr datasets with these additional attributes, +such as zarr datasets written by xarray, +`netCDF `_, +or `GDAL `_. +For implementation details, see :ref:`zarr_encoding`. To write a dataset with zarr, we use the :py:meth:`Dataset.to_zarr` method. @@ -611,7 +614,6 @@ instance and pass this, as follows: .. _Amazon S3: https://aws.amazon.com/s3/ .. _Google Cloud Storage: https://cloud.google.com/storage/ .. _gcsfs: https://github.com/fsspec/gcsfs -.. _NCZarr: https://docs.unidata.ucar.edu/nug/current/nczarr_head.html Zarr Compressors and Filters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 470210a552b32bf57fea56f68b93af2065aaab1a Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Mon, 11 Apr 2022 09:08:07 +0200 Subject: [PATCH 31/31] [skip ci] reword --- doc/user-guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index c80acfa79d9..81fa29bdf5f 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -518,7 +518,7 @@ the ability to store and analyze datasets far too large fit onto disk Xarray can't open just any zarr dataset, because xarray requires special metadata (attributes) describing the dataset dimensions and coordinates. -At this time, xarray can only open zarr datasets with these additional attributes, +At this time, xarray can only open zarr datasets with these special attributes, such as zarr datasets written by xarray, `netCDF `_, or `GDAL `_.