From 206d145a5aedfccdbd2b44e4c40916b14530b239 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 29 Nov 2024 10:37:17 -0600 Subject: [PATCH] Added Array.info_complete (#2514) Now that Store.getsize is a thing, we can do info_complete which includes the number of chunks written and the size of those bytes. Co-authored-by: Davis Bennett Co-authored-by: Norman Rzepka --- src/zarr/core/array.py | 53 ++++++++++++++++++++++----- tests/test_array.py | 82 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 1e815d4d0e..71a6f9d380 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1346,18 +1346,53 @@ def info(self) -> Any: AsyncArray.info_complete All information about a group, including dynamic information like the number of bytes and chunks written. + + Examples + -------- + + >>> arr = await zarr.api.asynchronous.create( + ... path="array", shape=(3, 4, 5), chunks=(2, 2, 2) + ... ) + >>> arr.info + Type : Array + Zarr format : 3 + Data type : DataType.float64 + Shape : (3, 4, 5) + Chunk shape : (2, 2, 2) + Order : C + Read-only : False + Store type : MemoryStore + Codecs : [{'endian': <Endian.little: 'little'>}] + No. bytes : 480 """ return self._info() async def info_complete(self) -> Any: - # TODO: get the size of the object from the store. - extra = { - "count_chunks_initialized": await self.nchunks_initialized(), - # count_bytes_stored isn't yet implemented. - } - return self._info(extra=extra) - - def _info(self, extra: dict[str, int] | None = None) -> Any: + """ + Return all the information for an array, including dynamic information like a storage size. + + In addition to the static information, this provides + + - The count of chunks initialized + - The sum of the bytes written + + Returns + ------- + ArrayInfo + + See Also + -------- + AsyncArray.info + A property giving just the statically known information about an array. 
+ """ + return self._info( + await self.nchunks_initialized(), + await self.store_path.store.getsize_prefix(self.store_path.path), + ) + + def _info( + self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None + ) -> Any: kwargs: dict[str, Any] = {} if self.metadata.zarr_format == 2: assert isinstance(self.metadata, ArrayV2Metadata) @@ -1386,6 +1421,8 @@ def _info(self, extra: dict[str, int] | None = None) -> Any: _read_only=self.read_only, _store_type=type(self.store_path.store).__name__, _count_bytes=self.dtype.itemsize * self.size, + _count_bytes_stored=count_bytes_stored, + _count_chunks_initialized=count_chunks_initialized, **kwargs, ) diff --git a/tests/test_array.py b/tests/test_array.py index f0f36cf70d..86da801d1f 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1,3 +1,4 @@ +import dataclasses import json import math import pickle @@ -474,6 +475,87 @@ def test_info_v3(self) -> None: ) assert result == expected + def test_info_complete(self) -> None: + arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + result = arr.info_complete() + expected = ArrayInfo( + _zarr_format=3, + _data_type=DataType.parse("float64"), + _shape=(4, 4), + _chunk_shape=(2, 2), + _order="C", + _read_only=False, + _store_type="MemoryStore", + _codecs=[BytesCodec()], + _count_bytes=128, + _count_chunks_initialized=0, + _count_bytes_stored=373, # no chunks written yet, so only the array metadata is stored 
+ ) + assert result == expected + + arr[:2, :2] = 10 + result = arr.info_complete() + expected = dataclasses.replace( + expected, _count_chunks_initialized=1, _count_bytes_stored=405 + ) + assert result == expected + + async def test_info_v2_async(self) -> None: + arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=2) + result = arr.info + expected = ArrayInfo( + _zarr_format=2, + _data_type=np.dtype("float64"), + _shape=(4, 4), + _chunk_shape=(2, 2), + _order="C", + _read_only=False, + _store_type="MemoryStore", + _count_bytes=128, + ) + assert result == expected + + async def test_info_v3_async(self) -> None: + arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + result = arr.info + expected = ArrayInfo( + _zarr_format=3, + _data_type=DataType.parse("float64"), + _shape=(4, 4), + _chunk_shape=(2, 2), + _order="C", + _read_only=False, + _store_type="MemoryStore", + _codecs=[BytesCodec()], + _count_bytes=128, + ) + assert result == expected + + async def test_info_complete_async(self) -> None: + arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=3) + result = await arr.info_complete() + expected = ArrayInfo( + _zarr_format=3, + _data_type=DataType.parse("float64"), + _shape=(4, 4), + _chunk_shape=(2, 2), + _order="C", + _read_only=False, + _store_type="MemoryStore", + _codecs=[BytesCodec()], + _count_bytes=128, + _count_chunks_initialized=0, + _count_bytes_stored=373, # no chunks written yet, so only the array metadata is stored + ) + assert result == expected + + await arr.setitem((slice(2), slice(2)), 10) + result = await arr.info_complete() + expected = dataclasses.replace( + expected, _count_chunks_initialized=1, _count_bytes_stored=405 + ) + assert result == expected + @pytest.mark.parametrize("store", ["memory"], indirect=True) @pytest.mark.parametrize("zarr_format", [2, 3])