Support dtype=str / dtype="str" when creating zarr format 2 arrays.

parse_dtype() maps str to the numpy object dtype for format 2, and create()
makes sure a vlen-utf8 filter is present for such arrays. Filters supplied by
the caller are kept as-is when they already include vlen-utf8.

NOTE(review): the post-image blob ids on the "index" lines of array.py and
common.py were not recomputed after the review edits to those hunks.

diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py
index 46fd1cb806..0f50264be8 100644
--- a/src/zarr/codecs/_v2.py
+++ b/src/zarr/codecs/_v2.py
@@ -36,7 +36,7 @@ async def _decode_single(
         chunk_numpy_array = ensure_ndarray(chunk_bytes.as_array_like())
 
         # ensure correct dtype
-        if str(chunk_numpy_array.dtype) != chunk_spec.dtype:
+        if str(chunk_numpy_array.dtype) != chunk_spec.dtype and not chunk_spec.dtype.hasobject:
             chunk_numpy_array = chunk_numpy_array.view(chunk_spec.dtype)
 
         return get_ndbuffer_class().from_numpy_array(chunk_numpy_array)
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index ef4921d46f..02bad40c35 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -35,6 +35,7 @@
     ShapeLike,
     ZarrFormat,
     concurrent_map,
+    parse_dtype,
     parse_shapelike,
     product,
 )
@@ -365,16 +366,17 @@ async def create(
     ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
         store_path = await make_store_path(store)
 
+        dtype_parsed = parse_dtype(dtype, zarr_format)
         shape = parse_shapelike(shape)
 
         if chunks is not None and chunk_shape is not None:
             raise ValueError("Only one of chunk_shape or chunks can be provided.")
 
-        dtype = np.dtype(dtype)
         if chunks:
-            _chunks = normalize_chunks(chunks, shape, dtype.itemsize)
+            _chunks = normalize_chunks(chunks, shape, dtype_parsed.itemsize)
         else:
-            _chunks = normalize_chunks(chunk_shape, shape, dtype.itemsize)
+            _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.itemsize)
+
         result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]
         if zarr_format == 3:
             if dimension_separator is not None:
@@ -396,7 +398,7 @@ async def create(
             result = await cls._create_v3(
                 store_path,
                 shape=shape,
-                dtype=dtype,
+                dtype=dtype_parsed,
                 chunk_shape=_chunks,
                 fill_value=fill_value,
                 chunk_key_encoding=chunk_key_encoding,
@@ -406,6 +408,14 @@ async def create(
                 exists_ok=exists_ok,
             )
         elif zarr_format == 2:
+            if dtype is str or dtype == "str":
+                # v2 stores str as object dtype: ensure vlen-utf8 without dropping user filters
+                vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"}
+                if not filters:
+                    filters = [vlen_codec]
+                elif not any(x["id"] == "vlen-utf8" for x in filters):
+                    filters = list(filters) + [vlen_codec]
+
             if codecs is not None:
                 raise ValueError(
                     "codecs cannot be used for arrays with version 2. Use filters and compressor instead."
@@ -419,7 +429,7 @@ async def create(
             result = await cls._create_v2(
                 store_path,
                 shape=shape,
-                dtype=dtype,
+                dtype=dtype_parsed,
                 chunks=_chunks,
                 dimension_separator=dimension_separator,
                 fill_value=fill_value,
diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py
index efce0f98f2..0bc6245cb5 100644
--- a/src/zarr/core/common.py
+++ b/src/zarr/core/common.py
@@ -14,6 +14,10 @@
     overload,
 )
 
+import numpy as np
+
+from zarr.core.strings import _STRING_DTYPE
+
 if TYPE_CHECKING:
     from collections.abc import Awaitable, Callable, Iterator
 
@@ -151,3 +155,18 @@ def parse_order(data: Any) -> Literal["C", "F"]:
     if data in ("C", "F"):
         return cast(Literal["C", "F"], data)
     raise ValueError(f"Expected one of ('C', 'F'), got {data} instead.")
+
+
+def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]:
+    """Convert a user-supplied dtype into a numpy dtype for the given zarr format.
+
+    ``str`` and ``"str"`` map to the object dtype for format 2 and to
+    ``_STRING_DTYPE`` otherwise; anything else is passed to ``np.dtype``.
+    """
+    if dtype is str or dtype == "str":
+        if zarr_format == 2:
+            # special case as object
+            return np.dtype("object")
+        else:
+            return _STRING_DTYPE
+    return np.dtype(dtype)
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
index d1dd86880d..c5f34d2776 100644
--- a/src/zarr/core/metadata/v2.py
+++ b/src/zarr/core/metadata/v2.py
@@ -321,7 +321,7 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
     """
     if dtype.kind == "S":
         return b""
-    elif dtype.kind == "U":
+    elif dtype.kind in "UO":
         return ""
     else:
         return dtype.type(0)
diff --git a/tests/v3/test_codecs/test_vlen.py b/tests/v3/test_codecs/test_vlen.py
index ca5ccb92fa..aaea5dab83 100644
--- a/tests/v3/test_codecs/test_vlen.py
+++ b/tests/v3/test_codecs/test_vlen.py
@@ -11,7 +11,7 @@
 from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING
 from zarr.storage.common import StorePath
 
-numpy_str_dtypes: list[type | str | None] = [None, str, np.dtypes.StrDType]
+numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType]
 expected_zarr_string_dtype: np.dtype[Any]
 if _NUMPY_SUPPORTS_VLEN_STRING:
     numpy_str_dtypes.append(np.dtypes.StringDType)
diff --git a/tests/v3/test_v2.py b/tests/v3/test_v2.py
index d981fbc893..729ed0533f 100644
--- a/tests/v3/test_v2.py
+++ b/tests/v3/test_v2.py
@@ -2,6 +2,7 @@
 from collections.abc import Iterator
 from typing import Any
 
+import numcodecs.vlen
 import numpy as np
 import pytest
 from numcodecs import Delta
@@ -44,7 +45,7 @@ def test_simple(store: StorePath) -> None:
         ("float64", 0.0),
         ("|S1", b""),
         ("|U1", ""),
-        ("object", 0),
+        ("object", ""),
         (str, ""),
     ],
 )
@@ -53,7 +54,12 @@ def test_implicit_fill_value(store: StorePath, dtype: str, fill_value: Any) -> N
     assert arr.metadata.fill_value is None
     assert arr.metadata.to_dict()["fill_value"] is None
     result = arr[:]
-    expected = np.full(arr.shape, fill_value, dtype=dtype)
+    if dtype is str:
+        # special case
+        numpy_dtype = np.dtype(object)
+    else:
+        numpy_dtype = np.dtype(dtype)
+    expected = np.full(arr.shape, fill_value, dtype=numpy_dtype)
     np.testing.assert_array_equal(result, expected)
 
 
@@ -106,3 +112,14 @@ async def test_v2_encode_decode(dtype):
     data = zarr.open_array(store=store, path="foo")[:]
     expected = np.full((3,), b"X", dtype=dtype)
     np.testing.assert_equal(data, expected)
+
+
+@pytest.mark.parametrize("dtype", [str, "str"])
+async def test_create_dtype_str(dtype: Any) -> None:
+    arr = zarr.create(shape=3, dtype=dtype, zarr_format=2)
+    assert arr.dtype.kind == "O"
+    assert arr.metadata.to_dict()["dtype"] == "|O"
+    assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),)
+    arr[:] = ["a", "bb", "ccc"]
+    result = arr[:]
+    np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object"))