From 2629f422cc21a4776edff9371397e827597f664f Mon Sep 17 00:00:00 2001 From: Oli McCormack Date: Thu, 12 Oct 2023 00:41:15 -0600 Subject: [PATCH 1/5] Avoid redundant metadata reads in `ZarrArrayWrapper` Modify ZarrArrayWrapper to avoid re-reading metadata each time data is read from the array. --- xarray/backends/zarr.py | 16 ++++++++------- xarray/tests/test_backends.py | 38 +++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f88523422bb..d6ad15f4f87 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -61,27 +61,29 @@ def encode_zarr_attr_value(value): class ZarrArrayWrapper(BackendArray): - __slots__ = ("datastore", "dtype", "shape", "variable_name") + __slots__ = ("datastore", "dtype", "shape", "variable_name", "_array") def __init__(self, variable_name, datastore): self.datastore = datastore self.variable_name = variable_name - array = self.get_array() - self.shape = array.shape + # some callers attempt to evaluate an array if an `array` property exists on the object. + # we prefix with _ to avoid this inference. + self._array = self.datastore.zarr_group[self.variable_name] + self.shape = self._array.shape # preserve vlen string object dtype (GH 7328) - if array.filters is not None and any( - [filt.codec_id == "vlen-utf8" for filt in array.filters] + if self._array.filters is not None and any( + [filt.codec_id == "vlen-utf8" for filt in self._array.filters] ): dtype = coding.strings.create_vlen_dtype(str) else: - dtype = array.dtype + dtype = self._array.dtype self.dtype = dtype def get_array(self): - return self.datastore.zarr_group[self.variable_name] + return self._array def _oindex(self, key): return self.get_array().oindex[key] diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0cbf3af3664..117c2460be1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -19,6 +19,7 @@ from os import listdir from pathlib import Path from typing import TYPE_CHECKING, Any, Final, cast +from unittest.mock import patch import numpy as np import pandas as pd @@ -2862,6 +2863,43 @@ def create_zarr_target(self): yield tmp +@requires_zarr +class TestZarrArrayWrapperCalls(TestZarrKVStoreV3): + def test_avoid_excess_metadata_calls(self) -> None: + """Test that chunk requests do not trigger redundant metadata requests. + + This test targets logic in backends.zarr.ZarrArrayWrapper, asserting that calls + to retrieve chunk data after initialization do not trigger additional + metadata requests. + + https://github.com/pydata/xarray/issues/8290 + """ + + import zarr + + ds = xr.Dataset(data_vars={"test": (("Z"), np.array([np.nan]).reshape(1))}) + + # The call to retrieve metadata performs a group lookup. We patch the group, allowing + # assessment of call details to the target group.__getitem__ calls. + with self.create_zarr_target() as store, patch.object( + zarr.hierarchy.Group, + "__getitem__", + side_effect=zarr.hierarchy.Group.__getitem__, + autospec=True, + ) as mock: + ds.to_zarr(store, mode="w") + + # we expect this to request array metadata information, so call_count should be > 1 + xrds = xr.open_zarr(store) + call_count = mock.call_count + assert call_count > 0 + + # computing requests chunks, which should not trigger additional metadata requests + # we assert that the number of calls has not increased after fetchhing the array + xrds.test.compute(scheduler="sync") + assert mock.call_count == call_count + + @requires_zarr @requires_fsspec def test_zarr_storage_options() -> None: From 5c8f5d6070b9dd00b9f5583a224e29b6f8d9b4b1 Mon Sep 17 00:00:00 2001 From: Oli McCormack Date: Thu, 12 Oct 2023 07:42:17 -0600 Subject: [PATCH 2/5] Improve test documentation --- xarray/tests/test_backends.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 117c2460be1..d95d379b01f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2879,22 +2879,26 @@ def test_avoid_excess_metadata_calls(self) -> None: ds = xr.Dataset(data_vars={"test": (("Z"), np.array([np.nan]).reshape(1))}) - # The call to retrieve metadata performs a group lookup. We patch the group, allowing - # assessment of call details to the target group.__getitem__ calls. - with self.create_zarr_target() as store, patch.object( - zarr.hierarchy.Group, - "__getitem__", - side_effect=zarr.hierarchy.Group.__getitem__, - autospec=True, - ) as mock: + # The call to retrieve metadata performs a group lookup. We patch Group.__getitem__ + # so that we can inspect calls to this method - specifically count of calls. + # Use of side_effect means that calls are passed through to the original method + # rather than a mocked method. + Group = zarr.hierarchy.Group + with ( + self.create_zarr_target() as store, + patch.object( + Group, "__getitem__", side_effect=Group.__getitem__, autospec=True + ) as mock, + ): ds.to_zarr(store, mode="w") - # we expect this to request array metadata information, so call_count should be > 1 + # We expect this to request array metadata information, so call_count should be >= 1, + # At time of writing, 2 calls are made xrds = xr.open_zarr(store) call_count = mock.call_count assert call_count > 0 - # computing requests chunks, which should not trigger additional metadata requests + # compute() requests array data, which should not trigger additional metadata requests # we assert that the number of calls has not increased after fetchhing the array xrds.test.compute(scheduler="sync") assert mock.call_count == call_count From 36830845fd1ce224ea6455e6c33ef2df2e97d04c Mon Sep 17 00:00:00 2001 From: olimcc Date: Thu, 12 Oct 2023 15:44:50 -0700 Subject: [PATCH 3/5] Update xarray/tests/test_backends.py Co-authored-by: Joe Hamman --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2dcce50b3b2..18f84e2df42 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2877,7 +2877,7 @@ def test_avoid_excess_metadata_calls(self) -> None: import zarr - ds = xr.Dataset(data_vars={"test": (("Z"), np.array([np.nan]).reshape(1))}) + ds = xr.Dataset(data_vars={"test": (("Z", ), np.array([123]).reshape(1))}) # The call to retrieve metadata performs a group lookup. We patch Group.__getitem__ # so that we can inspect calls to this method - specifically count of calls. From 996d5a7ee4e4eb6a1ae099dba26f372026a7c97d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Oct 2023 22:46:19 +0000 Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 18f84e2df42..4c04841db8d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2877,7 +2877,7 @@ def test_avoid_excess_metadata_calls(self) -> None: import zarr - ds = xr.Dataset(data_vars={"test": (("Z", ), np.array([123]).reshape(1))}) + ds = xr.Dataset(data_vars={"test": (("Z",), np.array([123]).reshape(1))}) # The call to retrieve metadata performs a group lookup. We patch Group.__getitem__ # so that we can inspect calls to this method - specifically count of calls. From 87ea7a33cfc1cd935b8879a9a3ba1affd437a96c Mon Sep 17 00:00:00 2001 From: Oli McCormack Date: Fri, 13 Oct 2023 10:06:37 -0600 Subject: [PATCH 5/5] Add what's new entry --- doc/whats-new.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 40c50e158ad..4a57faaa82e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -54,6 +54,7 @@ Deprecations Bug fixes ~~~~~~~~~ + - :py:meth:`DataArray.rename` & :py:meth:`Dataset.rename` would emit a warning when the operation was a no-op. (:issue:`8266`) By `Simon Hansen `_. @@ -64,6 +65,12 @@ Bug fixes (:issue:`8271`, :pull:`8272`). By `Spencer Clark `_. +- Fix excess metadata requests when using a Zarr store. Prior to this, metadata + was re-read every time data was retrieved from the array, now metadata is retrieved only once + when they array is initialized. + (:issue:`8290`, :pull:`8297`). + By `Oliver McCormack `_. + Documentation ~~~~~~~~~~~~~