From ea536e60c3bd8d5dde27465d064940b302b5c58f Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 31 Oct 2025 15:38:04 +0000 Subject: [PATCH 1/7] Initial dataset wrappers. Rename; addin parts of old investigation; add temporary notes. --- .../netcdf/_bytecoding_datasets.py | 182 ++++++++++++++ .../fileformats/netcdf/_thread_safe_nc.py | 15 +- .../integration/netcdf/test_chararrays.py | 234 ++++++++++++++++++ .../fileformats/netcdf/encoding_tests.txt | 18 ++ .../netcdf/test_bytecoding_datasets.py | 14 ++ 5 files changed, 457 insertions(+), 6 deletions(-) create mode 100644 lib/iris/fileformats/netcdf/_bytecoding_datasets.py create mode 100644 lib/iris/tests/integration/netcdf/test_chararrays.py create mode 100644 lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt create mode 100644 lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py new file mode 100644 index 0000000000..41e801d103 --- /dev/null +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -0,0 +1,182 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Module providing netcdf dataset wrappers with automatic character encoding. + +The requirement is to convert numpy fixed-width unicode arrays on writing to a variable +which is declared as a byte (character) array with a fixed-length string dimension. + +Numpy unicode string arrays are ones with dtypes of the form "U". +Numpy character variables have the dtype "S1", and map to a fixed-length "string +dimension". + +In principle, netCDF4 already performs these translations, but in practice current +releases are not functional for anything other than "ascii" encoding -- including UTF-8, +which is the most obvious and desirable "general" solution. 
+ +There is also the question of whether we should like to implement UTF-8 as our default. +Current discussions on this are inconclusive and neither CF conventions nor the NetCDF +User Guide are definite on what possible values of "_Encoding" are, or what the effective +default is, even though they do both mention the "_Encoding" attribute as a potential +way to handle the issue. + +Because of this, we interpret as follows: + * when reading bytes : in the absence of an "_Encoding" attribute, we will attempt to + decode bytes as UTF-8 + * when writing strings : in the absence of an "_Encoding" attribute (on the Iris + cube or coord object), we will attempt to encode data with "ascii" : If this fails, + it raises an error prompting the user to supply an "_Encoding" attribute. + +Where an "_Encoding" attribute is provided to Iris, we will honour it where possible, +identifying with "codecs.lookup" : This means we support the encodings in the Python +Standard Library, and the name aliases which it recognises. + +See: + +* known problems https://github.com/Unidata/netcdf4-python/issues/1440 +* suggestions for how this "ought" to work, discussed in the netcdf-c library + * https://github.com/Unidata/netcdf-c/issues/402 + +""" + +import codecs +import warnings + +import numpy as np + +from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper + + +def decode_bytesarray_to_stringarray( + byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None +) -> np.ndarray: + """Convert an array of bytes to an array of strings, with one less dimension. + + N.B. for now at least, we assume the string dim is **always the last one**. + If 'string_width' is not given, it is set to the final dimension of 'byte_array'. 
+ """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_width is None: + string_width = bytes_shape[-1] + string_dtype = f"U{string_width}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +def encode_stringarray_as_bytearray( + data: np.ndarray, encoding=None, string_dimension_length: int | None = None +) -> np.ndarray: + """Encode strings as bytearray. + + Note: if 'string_dimension_length' is not given (None), it is set to the longest + encoded bytes element. If 'string_dimension_length' is specified, the last array + dimension is set to this and content strings are truncated or extended as required. + """ + element_shape = data.shape + max_length = 1 # this is a MINIMUM - i.e. not zero! + data_elements = np.zeros(element_shape, dtype=object) + for index in np.ndindex(element_shape): + data_element = data[index].encode(encoding=encoding) + element_length = len(data_element) + data_elements[index] = data_element + if element_length > max_length: + max_length = element_length + + if string_dimension_length is None: + string_dimension_length = max_length + + # We already encoded all the strings, but stored them in an object-array as + # we didn't yet know the fixed byte-length to convert to. 
+ # Now convert to a fixed-width byte array with an extra string-length dimension + result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") + right_pad = b"\0" * string_dimension_length + for index in np.ndindex(element_shape): + bytes = data_elements[index] + bytes = (bytes + right_pad)[:string_dimension_length] + result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] + + return result + + +DEFAULT_ENCODING = "utf-8" + + +class EncodedVariable(VariableWrapper): + """A variable wrapper that translates variable data according to byte encodings.""" + + def __getitem__(self, keys): + if self.is_chardata(): + super().set_auto_chartostring(False) + + data = super().__getitem__(keys) + + if self.is_chardata(): + encoding = self.get_byte_encoding() + strlen = self.get_string_length() + data = decode_bytesarray_to_stringarray(data, encoding, strlen) + + return data + + def __setitem__(self, keys, data): + if self.is_chardata(): + encoding = self.get_byte_encoding() + strlen = self.get_string_length() + if encoding is not None: + data = encode_stringarray_as_bytearray(data, encoding, strlen) + else: + try: + # Check if all characters are valid ascii + data = encode_stringarray_as_bytearray(data, "ascii", strlen) + except UnicodeEncodeError: + data = encode_stringarray_as_bytearray( + data, DEFAULT_ENCODING, strlen + ) + # As this was necessary, record the new encoding on the variable + self.set_ncattr("_Encoding", DEFAULT_ENCODING) + msg = ( + f"Non-ascii data written to label variable {self.name}. " + f"Applied {DEFAULT_ENCODING!r} encoding, " + f"and set attribute _Encoding={DEFAULT_ENCODING!r}." 
+ ) + warnings.warn(msg, UserWarning) + + super().set_auto_chartostring(False) + + super().__setitem__(keys, data) + + def is_chardata(self): + return np.issubdtype(self.dtype, np.bytes_) + + def get_encoding(self) -> str | None: + """Get the effective byte encoding to be used for this variable.""" + # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data + result = getattr(self, "_Encoding", None) + if result is not None: + try: + # Accept + normalise naming of encodings + result = codecs.lookup(result).name + # NOTE: if encoding does not suit data, errors can occur. + # For example, _Encoding = "ascii", with non-ascii content. + except LookupError: + # Replace some invalid setting with "safe"(ish) fallback. + msg = f"Unknown encoding for variable {self.name!r}: {result!r}" + warnings.warn(msg, UserWarning) + + return result + + def get_string_length(self): + """Return the string-length defined for this variable (or None).""" + return getattr(self, "iris_string_length", None) + + +class EncodedDataset(DatasetWrapper): + """A specialised DatasetWrapper whose variables perform byte encoding.""" + + VAR_WRAPPER_CLS = EncodedVariable diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 33183ef0fa..46b8609bb7 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -159,6 +159,9 @@ class GroupWrapper(_ThreadSafeWrapper): CONTAINED_CLASS = netCDF4.Group # Note: will also accept a whole Dataset object, but that is OK. _DUCKTYPE_CHECK_PROPERTIES = ["createVariable"] + # Class to use when creating variable wrappers (default=VariableWrapper). + # - needed to support _byte_encoded_data.EncodedDataset. + VAR_WRAPPER_CLS = VariableWrapper # All Group API that returns Dimension(s) is wrapped to instead return # DimensionWrapper(s). 
@@ -203,7 +206,7 @@ def variables(self) -> typing.Dict[str, VariableWrapper]: """ with _GLOBAL_NETCDF4_LOCK: variables_ = self._contained_instance.variables - return {k: VariableWrapper.from_existing(v) for k, v in variables_.items()} + return {k: self.VAR_WRAPPER_CLS.from_existing(v) for k, v in variables_.items()} def createVariable(self, *args, **kwargs) -> VariableWrapper: """Call createVariable() from netCDF4.Group/Dataset within _GLOBAL_NETCDF4_LOCK. @@ -216,7 +219,7 @@ def createVariable(self, *args, **kwargs) -> VariableWrapper: """ with _GLOBAL_NETCDF4_LOCK: new_variable = self._contained_instance.createVariable(*args, **kwargs) - return VariableWrapper.from_existing(new_variable) + return self.VAR_WRAPPER_CLS.from_existing(new_variable) def get_variables_by_attributes( self, *args, **kwargs @@ -234,7 +237,7 @@ def get_variables_by_attributes( variables_ = list( self._contained_instance.get_variables_by_attributes(*args, **kwargs) ) - return [VariableWrapper.from_existing(v) for v in variables_] + return [self.VAR_WRAPPER_CLS.from_existing(v) for v in variables_] # All Group API that returns Group(s) is wrapped to instead return # GroupWrapper(s). @@ -252,7 +255,7 @@ def groups(self): """ with _GLOBAL_NETCDF4_LOCK: groups_ = self._contained_instance.groups - return {k: GroupWrapper.from_existing(v) for k, v in groups_.items()} + return {k: self.__class__.from_existing(v) for k, v in groups_.items()} @property def parent(self): @@ -268,7 +271,7 @@ def parent(self): """ with _GLOBAL_NETCDF4_LOCK: parent_ = self._contained_instance.parent - return GroupWrapper.from_existing(parent_) + return self.__class__.from_existing(parent_) def createGroup(self, *args, **kwargs): """Call createGroup() from netCDF4.Group/Dataset. 
@@ -281,7 +284,7 @@ def createGroup(self, *args, **kwargs): """ with _GLOBAL_NETCDF4_LOCK: new_group = self._contained_instance.createGroup(*args, **kwargs) - return GroupWrapper.from_existing(new_group) + return self.__class__.from_existing(new_group) class DatasetWrapper(GroupWrapper): diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py new file mode 100644 index 0000000000..0eb211c8b0 --- /dev/null +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -0,0 +1,234 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Integration tests for string data handling.""" + +import subprocess + +import numpy as np +import pytest + +import iris +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube +from iris.fileformats.netcdf import _thread_safe_nc +from iris.tests import env_bin_path + +NX, N_STRLEN = 3, 64 +TEST_STRINGS = ["Münster", "London", "Amsterdam"] +TEST_COORD_VALS = ["bun", "éclair", "sandwich"] + +# VARS_COORDS_SHARE_STRING_DIM = True +VARS_COORDS_SHARE_STRING_DIM = False +if VARS_COORDS_SHARE_STRING_DIM: + TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one + + +# Ensure all tests run with "split attrs" turned on. +@pytest.fixture(scope="module", autouse=True) +def enable_split_attrs(): + with iris.FUTURE.context(save_split_attrs=True): + yield + + +def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"): + bbytes = [text.encode(encoding) for text in string_array_1d] + pad = b"\0" * maxlen + bbytes = [(x + pad)[:maxlen] for x in bbytes] + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) + return chararray + + +def convert_bytesarray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. 
for now at least, we assume the string dim is **always the last one**. + """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + +INCLUDE_COORD = True +# INCLUDE_COORD = False + +INCLUDE_NUMERIC_AUXCOORD = True +# INCLUDE_NUMERIC_AUXCOORD = False + + +def make_testfile(filepath, chararray, coordarray, encoding_str=None): + ds = _thread_safe_nc.DatasetWrapper(filepath, "w") + try: + ds.createDimension("x", NX) + ds.createDimension("nstr", N_STRLEN) + vx = ds.createVariable("x", int, dimensions=("x")) + vx[:] = np.arange(NX) + if INCLUDE_COORD: + ds.createDimension("nstr2", N_STRLEN) + v_co = ds.createVariable( + "v_co", + "S1", + dimensions=( + "x", + "nstr2", + ), + ) + v_co[:] = coordarray + if encoding_str is not None: + v_co._Encoding = encoding_str + if INCLUDE_NUMERIC_AUXCOORD: + v_num = ds.createVariable( + "v_num", + float, + dimensions=("x",), + ) + v_num[:] = np.arange(NX) + v = ds.createVariable( + "v", + "S1", + dimensions=( + "x", + "nstr", + ), + ) + v[:] = chararray + if encoding_str is not None: + v._Encoding = encoding_str + if INCLUDE_COORD: + coords_str = "v_co" + if INCLUDE_NUMERIC_AUXCOORD: + coords_str += " v_num" + v.coordinates = coords_str + finally: + ds.close() + + +def make_testcube( + dataarray, + coordarray, # for now, these are always *string* arrays + encoding_str: str | None = None, +): + cube = Cube(dataarray, var_name="v") + cube.add_dim_coord(DimCoord(np.arange(NX), var_name="x"), 0) + if encoding_str is not None: + cube.attributes["_Encoding"] = encoding_str + if INCLUDE_COORD: + co_x = AuxCoord(coordarray, var_name="v_co") + if encoding_str is not 
None: + co_x.attributes["_Encoding"] = encoding_str + cube.add_aux_coord(co_x, 0) + return cube + + +NCDUMP_PATHSTR = str(env_bin_path("ncdump")) + + +def ncdump(nc_path: str, *args): + """Call ncdump to print a dump of a file.""" + call_args = [NCDUMP_PATHSTR, nc_path] + list(*args) + subprocess.run(call_args, check=True) + + +def show_result(filepath): + print(f"File {filepath}") + print("NCDUMP:") + ncdump(filepath) + # with nc.Dataset(filepath, "r") as ds: + # v = ds.variables["v"] + # print("\n----\nNetcdf data readback (basic)") + # try: + # print(repr(v[:])) + # except UnicodeDecodeError as err: + # print(repr(err)) + # print("..raw:") + # v.set_auto_chartostring(False) + # print(repr(v[:])) + print("\nAs iris cube..") + try: + iris.loading.LOAD_PROBLEMS.reset() + cube = iris.load_cube(filepath) + print(cube) + if iris.loading.LOAD_PROBLEMS.problems: + print(iris.loading.LOAD_PROBLEMS) + print( + "\n".join(iris.loading.LOAD_PROBLEMS.problems[0].stack_trace.format()) + ) + print("-data-") + print(repr(cube.data)) + print("-numeric auxcoord data-") + print(repr(cube.coord("x").points)) + if INCLUDE_COORD: + print("-string auxcoord data-") + try: + print(repr(cube.coord("v_co").points)) + except Exception as err2: + print(repr(err2)) + except UnicodeDecodeError as err: + print(repr(err)) + + +@pytest.fixture(scope="session") +def save_dir(tmp_path_factory): + return tmp_path_factory.mktemp("save_files") + + +# TODO: the tests don't test things properly yet, they just exercise the code and print +# things for manual debugging. 
+tsts = ( + None, + "ascii", + "utf-8", + "utf-32", +) +# tsts = ("utf-8",) +# tsts = ("utf-8", "utf-32",) +# tsts = ("utf-32",) +# tsts = ("utf-8", "ascii", "utf-8") + + +@pytest.mark.parametrize("encoding", tsts) +def test_load_encodings(encoding, save_dir): + # small change + print(f"\n=========\nTesting encoding: {encoding}") + filepath = save_dir / f"tmp_load_{str(encoding)}.nc" + do_as = encoding + if encoding != "utf-32": + do_as = "utf-8" + TEST_CHARARRAY = convert_strings_to_chararray( + TEST_STRINGS, N_STRLEN, encoding=do_as + ) + TEST_COORDARRAY = convert_strings_to_chararray( + TEST_COORD_VALS, N_STRLEN, encoding=do_as + ) + make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) + show_result(filepath) + + +@pytest.mark.parametrize("encoding", tsts) +def test_save_encodings(encoding, save_dir): + cube = make_testcube( + dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding + ) + print(cube) + filepath = save_dir / f"tmp_save_{str(encoding)}.nc" + if encoding == "ascii": + with pytest.raises( + UnicodeEncodeError, + match="'ascii' codec can't encode character.*not in range", + ): + iris.save(cube, filepath) + else: + iris.save(cube, filepath) + show_result(filepath) diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt new file mode 100644 index 0000000000..bab04aa0c4 --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -0,0 +1,18 @@ + +forms in files: + * char chardata(dim1, dim2, strlen_xx); # char data + * string data(dim1, dim2); + +forms in numpy: + * np.ndarray(dtype="S1") # char data + * np.ndarray(dtype="Snn") # char data + * np.ndarray(dtype="Unn") # strings + * np.ndarray(dtype="") + +possibilities in createVariable: +""" + The datatype can be a numpy datatype object, or a string that describes a numpy dtype object ... 
+ datatype can also be a CompoundType instance (for a structured, or compound array), a VLType instance (for a variable-length array), +** or the python str builtin (for a variable-length string array). +** Numpy string and unicode datatypes with length greater than one are aliases for str. +""" diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py new file mode 100644 index 0000000000..8b449c5912 --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -0,0 +1,14 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Unit tests for :class:`iris.fileformats.netcdf._bytecoding_datasets` module.""" + +# import numpy as np +# import pytest +# +# from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset + + +class TestEncodedDataset: + """Test how GRIB_PARAM attributes convert to strings for storage in netcdf files.""" From fe1e22d586be798ad442fb793b9ac06ba1049627 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 3 Dec 2025 18:59:43 +0000 Subject: [PATCH 2/7] Various notes, choices + changes: Beginnings of encoded-dataset testing. 
--- .../netcdf/_bytecoding_datasets.py | 155 ++++++++---- .../integration/netcdf/test_chararrays.py | 7 +- .../fileformats/netcdf/encoding_tests.txt | 164 +++++++++++++ .../netcdf/test_bytecoding_datasets.py | 223 +++++++++++++++++- .../unit/fileformats/netcdf/test_nc_dtypes.py | 96 ++++++++ 5 files changed, 595 insertions(+), 50 deletions(-) create mode 100644 lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 41e801d103..353f14d538 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -41,6 +41,8 @@ """ import codecs +import contextlib +import threading import warnings import numpy as np @@ -49,17 +51,18 @@ def decode_bytesarray_to_stringarray( - byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None + byte_array: np.ndarray, encoding: str, string_width: int ) -> np.ndarray: """Convert an array of bytes to an array of strings, with one less dimension. N.B. for now at least, we assume the string dim is **always the last one**. If 'string_width' is not given, it is set to the final dimension of 'byte_array'. """ + if np.ma.isMaskedArray(byte_array): + # netCDF4-python sees zeros as "missing" -- we don't need or want that + byte_array = byte_array.data bytes_shape = byte_array.shape var_shape = bytes_shape[:-1] - if string_width is None: - string_width = bytes_shape[-1] string_dtype = f"U{string_width}" result = np.empty(var_shape, dtype=string_dtype) for ndindex in np.ndindex(var_shape): @@ -70,16 +73,25 @@ def decode_bytesarray_to_stringarray( return result -def encode_stringarray_as_bytearray( +# +# TODO: remove? +# this older version is "overly flexible", less efficient and not needed here. 
+# +def flexi_encode_stringarray_as_bytearray( data: np.ndarray, encoding=None, string_dimension_length: int | None = None ) -> np.ndarray: """Encode strings as bytearray. Note: if 'string_dimension_length' is not given (None), it is set to the longest - encoded bytes element. If 'string_dimension_length' is specified, the last array + encoded bytes element, **OR** the dtype size, if that is greater. + If 'string_dimension_length' is specified, the last array dimension is set to this and content strings are truncated or extended as required. """ + if np.ma.isMaskedArray(data): + # netCDF4-python sees zeros as "missing" -- we don't need or want that + data = data.data element_shape = data.shape + # Encode all the strings + see which is longest max_length = 1 # this is a MINIMUM - i.e. not zero! data_elements = np.zeros(element_shape, dtype=object) for index in np.ndindex(element_shape): @@ -90,10 +102,15 @@ def encode_stringarray_as_bytearray( max_length = element_length if string_dimension_length is None: + # If the string length was not specified, it is the maximum encoded length + # (n-bytes), **or** the dtype string-length, if greater. string_dimension_length = max_length + array_string_length = int(str(data.dtype)[2:]) # Yuck. No better public way? + if array_string_length > string_dimension_length: + string_dimension_length = array_string_length - # We already encoded all the strings, but stored them in an object-array as - # we didn't yet know the fixed byte-length to convert to. + # We maybe *already* encoded all the strings above, but stored them in an + # object-array as we didn't yet know the fixed byte-length to convert to. 
# Now convert to a fixed-width byte array with an extra string-length dimension result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") right_pad = b"\0" * string_dimension_length @@ -105,58 +122,98 @@ def encode_stringarray_as_bytearray( return result -DEFAULT_ENCODING = "utf-8" +def encode_stringarray_as_bytearray( + data: np.ndarray, encoding: str, string_dimension_length: int +) -> np.ndarray: + """Encode strings as a bytes array.""" + element_shape = data.shape + result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") + right_pad = b"\0" * string_dimension_length + for index in np.ndindex(element_shape): + bytes = data[index].encode(encoding=encoding) + # It's all a bit nasty ... + bytes = (bytes + right_pad)[:string_dimension_length] + result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)] + + return result + + +class NetcdfStringDecodeSetting(threading.local): + def __init__(self, perform_encoding: bool = True): + self.set(perform_encoding) + + def set(self, perform_encoding: bool): + self.perform_encoding = perform_encoding + + def __bool__(self): + return self.perform_encoding + + @contextlib.contextmanager + def context(self, perform_encoding: bool): + old_setting = self.perform_encoding + self.perform_encoding = perform_encoding + yield + self.perform_encoding = old_setting + + +DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting() +DEFAULT_READ_ENCODING = "utf-8" +DEFAULT_WRITE_ENCODING = "ascii" class EncodedVariable(VariableWrapper): """A variable wrapper that translates variable data according to byte encodings.""" def __getitem__(self, keys): - if self.is_chardata(): - super().set_auto_chartostring(False) + if self._is_chardata(): + # N.B. 
we never need to UNset this, as we totally control it + self._contained_instance.set_auto_chartostring(False) data = super().__getitem__(keys) - if self.is_chardata(): - encoding = self.get_byte_encoding() - strlen = self.get_string_length() - data = decode_bytesarray_to_stringarray(data, encoding, strlen) + if DECODE_TO_STRINGS_ON_READ and self._is_chardata(): + encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice + strlen = self._get_string_length() + try: + data = decode_bytesarray_to_stringarray(data, encoding, strlen) + except UnicodeDecodeError as err: + msg = ( + f"Character data in variable {self.name!r} could not be decoded" + f"with the {encoding!r} encoding. This can be fixed by setting the " + "variable '_Encoding' attribute to suit the content." + ) + raise ValueError(msg) from err return data def __setitem__(self, keys, data): - if self.is_chardata(): - encoding = self.get_byte_encoding() - strlen = self.get_string_length() - if encoding is not None: - data = encode_stringarray_as_bytearray(data, encoding, strlen) - else: + if self._is_chardata(): + # N.B. we never need to UNset this, as we totally control it + self._contained_instance.set_auto_chartostring(False) + + encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING + # N.B. typically, write encoding default is "ascii" --> fails bad content + if data.dtype.kind == "U": try: - # Check if all characters are valid ascii - data = encode_stringarray_as_bytearray(data, "ascii", strlen) - except UnicodeEncodeError: - data = encode_stringarray_as_bytearray( - data, DEFAULT_ENCODING, strlen - ) - # As this was necessary, record the new encoding on the variable - self.set_ncattr("_Encoding", DEFAULT_ENCODING) + strlen = self._get_string_length() + data = encode_stringarray_as_bytearray(data, encoding, strlen) + except UnicodeEncodeError as err: msg = ( - f"Non-ascii data written to label variable {self.name}. 
" - f"Applied {DEFAULT_ENCODING!r} encoding, " - f"and set attribute _Encoding={DEFAULT_ENCODING!r}." + f"String data written to netcdf character variable {self.name!r} " + f"could not be represented in encoding {encoding!r}. This can be " + "fixed by setting a suitable variable '_Encoding' attribute, " + 'e.g. ._Encoding="UTF-8".' ) - warnings.warn(msg, UserWarning) - - super().set_auto_chartostring(False) + raise ValueError(msg) from err super().__setitem__(keys, data) - def is_chardata(self): + def _is_chardata(self): return np.issubdtype(self.dtype, np.bytes_) - def get_encoding(self) -> str | None: - """Get the effective byte encoding to be used for this variable.""" - # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data + def _get_encoding(self) -> str | None: + """Get the byte encoding defined for this variable (or None).""" result = getattr(self, "_Encoding", None) if result is not None: try: @@ -165,18 +222,32 @@ def get_encoding(self) -> str | None: # NOTE: if encoding does not suit data, errors can occur. # For example, _Encoding = "ascii", with non-ascii content. except LookupError: - # Replace some invalid setting with "safe"(ish) fallback. + # Unrecognised encoding name : handle this as just a warning msg = f"Unknown encoding for variable {self.name!r}: {result!r}" warnings.warn(msg, UserWarning) return result - def get_string_length(self): - """Return the string-length defined for this variable (or None).""" - return getattr(self, "iris_string_length", None) + def _get_string_length(self): + """Return the string-length defined for this variable.""" + if not hasattr(self, "_strlen"): + # Work out the string length from the parent dataset dimensions. + strlen = self.group().dimensions[self.dimensions[-1]].size + # Cache this on the variable -- but not as a netcdf attribute (!) 
+ self.__dict__["_strlen"] = strlen + + return self._strlen + + def set_auto_chartostring(self, onoff: bool): + msg = "auto_chartostring is not supported by Iris 'EncodedVariable' type." + raise TypeError(msg) class EncodedDataset(DatasetWrapper): """A specialised DatasetWrapper whose variables perform byte encoding.""" VAR_WRAPPER_CLS = EncodedVariable + + def set_auto_chartostring(self, onoff: bool): + msg = "auto_chartostring is not supported by Iris 'EncodedDataset' type." + raise TypeError(msg) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index 0eb211c8b0..4414444733 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -137,8 +137,11 @@ def make_testcube( def ncdump(nc_path: str, *args): """Call ncdump to print a dump of a file.""" - call_args = [NCDUMP_PATHSTR, nc_path] + list(*args) - subprocess.run(call_args, check=True) + call_args = [NCDUMP_PATHSTR, nc_path] + list(args) + bytes = subprocess.check_output(call_args) + text = bytes.decode("utf-8") + print(text) + return text def show_result(filepath): diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt index bab04aa0c4..e77427cd63 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -1,8 +1,95 @@ +=========== +Outstanding Qs +* What would we like to do with all this IN IRIS?? + - generally present as string arrays (Uxx) + - existing scheme of naming dims for length + re-using is quite cunning! + - choice of seeing actual character arrays as alternative to string conversions? 
+ +* string length handling for load/save/roundtrip + - on SAVE, we need some control so we can create files which are compatible, + irrespective of the data (which currently we are not doing) + - ALSO this is wanted to ensure that multiple vars (e.g. string cubes or string coords) + will share the string dim -- instead of creating arbitrary different ones + - presumably, if encoding blows the max-len, we must get a warning/error + + - on LOAD, we may want to *capture* the actual original string dim length, so it can be + re-created on save (by some scheme, as per previous) -- i.e. enable roundtripping. + I don't really want to preserve the name of the string dim, but this could be a + slightly tender point. To consider also : the impact of this on the non-equivalence + of loaded cubes, if we use actual *attributes* to carry this info (see below). + - **if not** : just load data + convert to string arrays as seems best + - this will also lead to incompatible cubes. + + - on SAVE, in the absence of strlen-controls, what is a reasonable default choice? + - take longest encoded + - set nbytes = NEXPAND(encoding) * nchars + - sensible values would depend on the encoding... + : ascii -> 1 + : utf-8 -> 1 or 4 ??? + : utf-16 -> 2 or 4 ??? + : utf-32 -> 4 + + - on LOAD, in absence of strlen controls, how do we choose the result DTYPE (i.e. character length)? + - again, may depend on the encoding: + : ascii = "U" + : UTF-8 = "U" + : UTF-16 = "U" + : UTF-32 = "U" + - N.B. these are all at least "safe" - i.e. won't lose characters + + +separately from these, there is the question of how the controls affect "normal" +cube operations. + - the easiest approach is to define a "special" attribute, + which can be set on any cube/component + - using the dtype-length of the data would be *possible*, in conjunction with the + above-proposed "default rules" for choosing strlen from the dtype. + But this might not round-trip in all cases. 
+ +within the actual data arrays + - we can't really expect any different to what numpy does + - that is, the dtype-length of any element <= that of the array (and not ==) + this may be tricky, but we can't easily prevent it. + >>> a = np.array(['', 'a', 'bb']) + >>> a + array(['', 'a', 'bb'], dtype='>> a[0].dtype + dtype('>> a[1].dtype + dtype('>> a[2].dtype + dtype('>> a.dtype + dtype('>> + - likewise, we can't assign without possible truncation. + If you **want** to expand the supported width, can use ".astype()" first ? + + +======================== +========================= forms in files: * char chardata(dim1, dim2, strlen_xx); # char data * string data(dim1, dim2); +netcdf types: +(netcdf docs terms) + NC_BYTE 8-bit signed integer + NC_UBYTE 8-bit unsigned integer + NC_CHAR 8-bit character + NC_STRING variable length character string + +***NOTE*** there is no NC_UCHAR or "unsigned char" type + + +relevant numpy base types (scalar dtypes): + * "S" bytes : np.bytes_ == np.int8 + * "B" unsigned bytes : np.ubyte == np.uint8 + * 'i' ints : np.int_ + * 'u' unsigned ints : np.int_ + * "U" unicode string : np.str_ + forms in numpy: * np.ndarray(dtype="S1") # char data * np.ndarray(dtype="Snn") # char data @@ -16,3 +103,80 @@ possibilities in createVariable: ** or the python str builtin (for a variable-length string array). ** Numpy string and unicode datatypes with length greater than one are aliases for str. """ + +test types: + "i1" : np.int8 + "u1" : np.uint8 + "S1" : np.byte_ + "U1" : np.str_ + "S" : + "U" : with/without non-ascii content + +save all these to files... +outputs from "test_nc_dtypes.py" test run: + SPEC:i1 SAVED-AS:int8 byte RELOAD-AS:int8 + SPEC:u1 SAVED-AS:uint8 ubyte RELOAD-AS:uint8 + SPEC:S1 SAVED-AS:|S1 char RELOAD-AS: () + SPEC:U1 SAVED-AS: EncodedDataset: + """Create a test EncodedDataset linked to an actual file. + + * strlen becomes the string dimension (i.e. 
a number of *bytes*) + * a variable "vxs" is created + * If 'encoding' is given, the "vxs::_Encoding" attribute is created with this value + """ + ds = EncodedDataset(path, "w") + ds.createDimension("x", 3) + ds.createDimension("strlen", strlen) + v = ds.createVariable("vxs", "S1", ("x", "strlen")) + if encoding is not None: + v.setncattr("_Encoding", encoding) + return ds + + +def fetch_undecoded_var(path, varname): + # Open a path as a "normal" dataset, and return a given variable. + ds_normal = DatasetWrapper(path) + ds_normal._contained_instance.set_auto_chartostring(False) + v = ds_normal.variables[varname] + # Return a variable, rather than its data, so we can check attributes etc. + return v + + +class TestWriteStrings: + """Test how string data is saved to a file.""" + + def test_write_strings(self, encoding, tempdir): + # Create a dataset with the variable + path = tempdir / f"test_writestrings_encoding_{encoding!s}.nc" + + if encoding in [None, "ascii"]: + writedata = samples_3_ascii + write_encoding = "ascii" + else: + writedata = samples_3_nonascii + write_encoding = encoding + + writedata = writedata.copy() # just for safety? + strlen = strings_maxbytes(writedata, write_encoding) + + ds_encoded = make_encoded_dataset(path, strlen, encoding) + v = ds_encoded.variables["vxs"] + + # Effectively, checks that we *can* write strings + v[:] = writedata + + # Close, re-open as an "ordinary" dataset, and check the raw content. 
+ ds_encoded.close() + v = fetch_undecoded_var(path, "vxs") + + # Check that the raw result is as expected + bytes_result = v[:] + expected = encode_stringarray_as_bytearray(writedata, write_encoding, strlen) + assert ( + bytes_result.shape == expected.shape + and bytes_result.dtype == expected.dtype + and np.all(bytes_result == expected) + ) + + # Check that the "_Encoding" property is also as expected + result_attr = v.getncattr("_Encoding") if "_Encoding" in v.ncattrs() else None + assert result_attr == encoding + + def test_scalar(self, tempdir): + # Like 'test_write_strings', but the variable has *only* the string dimension. + path = tempdir / "test_writestrings_scalar.nc" + + ds_encoded = make_encoded_dataset(path, strlen=5) + v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",)) + + # Checks that we *can* write a string + v[:] = np.array("stuff", dtype=str) + + # Close, re-open as an "ordinary" dataset, and check the raw content. + ds_encoded.close() + v = fetch_undecoded_var(path, "v0_scalar") + result = v[:] + + # Check that the raw result is as expected + assert ( + result.shape == (5,) + and result.dtype == " Date: Fri, 5 Dec 2025 12:51:04 +0000 Subject: [PATCH 3/7] Replace use of encoding functions with test-specific function: Test for overlength writes. 
--- .../netcdf/_bytecoding_datasets.py | 6 +- .../fileformats/netcdf/encoding_tests.txt | 15 +- .../netcdf/test_bytecoding_datasets.py | 194 ++++++++++++------ 3 files changed, 147 insertions(+), 68 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 353f14d538..62e1dd2ab7 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -123,9 +123,10 @@ def flexi_encode_stringarray_as_bytearray( def encode_stringarray_as_bytearray( - data: np.ndarray, encoding: str, string_dimension_length: int + data: np.typing.ArrayLike, encoding: str, string_dimension_length: int ) -> np.ndarray: """Encode strings as a bytes array.""" + data = np.asanyarray(data) element_shape = data.shape result = np.zeros(element_shape + (string_dimension_length,), dtype="S1") right_pad = b"\0" * string_dimension_length @@ -179,7 +180,7 @@ def __getitem__(self, keys): data = decode_bytesarray_to_stringarray(data, encoding, strlen) except UnicodeDecodeError as err: msg = ( - f"Character data in variable {self.name!r} could not be decoded" + f"Character data in variable {self.name!r} could not be decoded " f"with the {encoding!r} encoding. This can be fixed by setting the " "variable '_Encoding' attribute to suit the content." ) @@ -188,6 +189,7 @@ def __getitem__(self, keys): return data def __setitem__(self, keys, data): + data = np.asanyarray(data) if self._is_chardata(): # N.B. we never need to UNset this, as we totally control it self._contained_instance.set_auto_chartostring(False) diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt index e77427cd63..5fa021ccdd 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -146,12 +146,17 @@ Then, as regards the _Encoding .. TO TEST... 
========== -create a dataset + write char data - - X assign different encodings: makes no difference +NOTE on length control: + - not an API thing, it's implicit from when you create a variable + - this also applies to how it loads back + - BUT here there may be scope for a control attribute : -create a dataset + write STRING data - - X encoding=(ascii, utf-8, utf-32, None) - - X withnonascii=(T, F) ++++ create a dataset + write char data ++++ - X assign different encodings: makes no difference + ++++ create a dataset + write STRING data ++++ - X encoding=(ascii, utf-8, utf-32, None) ++++ - X withnonascii=(T, F) - X length=(long, short, none) read string data diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 092da19a00..411212b973 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -9,11 +9,7 @@ import numpy as np import pytest -from iris.fileformats.netcdf._bytecoding_datasets import ( - EncodedDataset, - encode_stringarray_as_bytearray, - flexi_encode_stringarray_as_bytearray, -) +from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper encoding_options = [None, "ascii", "utf-8", "utf-32"] @@ -66,8 +62,92 @@ def fetch_undecoded_var(path, varname): return v +def check_raw_content(path, varname, expected_byte_array): + v = fetch_undecoded_var(path, varname) + bytes_result = v[:] + assert ( + bytes_result.shape == expected_byte_array.shape + and bytes_result.dtype == expected_byte_array.dtype + and np.all(bytes_result == expected_byte_array) + ) + + +def _make_bytearray_inner(data, encoding): + # Convert to a (list of [lists of..]) strings or bytes to a + # (list of [lists of..]) length-1 bytes with an extra dimension. 
+ if isinstance(data, str): + # Convert input strings to bytes + data = data.encode(encoding) + if isinstance(data, bytes): + # iterate over bytes to get a sequence of length-1 bytes (what np.array wants) + result = [data[i : i + 1] for i in range(len(data))] + else: + # If not string/bytes, expect the input to be a list. + # N.B. the recursion is inefficient, but we don't care about that here + result = [_make_bytearray_inner(part, encoding) for part in data] + return result + + +def make_bytearray(data, encoding="ascii"): + """Convert bytes or lists of bytes into a numpy byte array. + + This is largely to avoid using "encode_stringarray_as_bytearray", since we don't + want to depend on that when we should be testing it. + So, it mostly replicates the function of that, but it does also support bytes in the + input, and it automatically finds + applies the maximum bytes-lengths in the input. + """ + # First, Convert to a (list of [lists of]..) length-1 bytes objects + data = _make_bytearray_inner(data, encoding) + + # Numbers of bytes in the inner dimension are the lengths of bytes/strings input, + # so they aren't all the same. + # To enable array conversion, we fix that by expanding all to the max length + + def get_maxlen(data): + # Find the maximum number of bytes in the inner dimension. + if not isinstance(data, list): + # Inner bytes object + assert isinstance(data, bytes) + longest = len(data) + else: + # We have a list: either a list of bytes, or a list of lists. 
+ if len(data) == 0 or not isinstance(data[0], list): + # inner-most list, should contain bytes if anything + assert len(data) == 0 or isinstance(data[0], bytes) + # return n-bytes + longest = len(data) + else: + # list of lists: return max length of sub-lists + longest = max(get_maxlen(part) for part in data) + return longest + + maxlen = get_maxlen(data) + + def extend_all_to_maxlen(data, length, filler=b"\0"): + # Extend each "innermost" list (of single bytes) to the required length + if isinstance(data, list): + if len(data) == 0 or not isinstance(data[0], list): + # Pad all the inner-most lists to the required number of elements + n_extra = length - len(data) + if n_extra > 0: + data = data + [filler] * n_extra + else: + data = [extend_all_to_maxlen(part, length, filler) for part in data] + return data + + data = extend_all_to_maxlen(data, maxlen) + # We should now be able to create an array of single bytes. + result = np.array(data) + assert result.dtype == " Date: Fri, 5 Dec 2025 14:47:54 +0000 Subject: [PATCH 4/7] Radically simplify 'make_bytesarray', by using a known specified bytewidth. --- .../netcdf/test_bytecoding_datasets.py | 76 ++++++------------- 1 file changed, 22 insertions(+), 54 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 411212b973..9ef354f850 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -72,7 +72,7 @@ def check_raw_content(path, varname, expected_byte_array): ) -def _make_bytearray_inner(data, encoding): +def _make_bytearray_inner(data, bytewidth, encoding): # Convert to a (list of [lists of..]) strings or bytes to a # (list of [lists of..]) length-1 bytes with an extra dimension. 
if isinstance(data, str): @@ -81,61 +81,25 @@ def _make_bytearray_inner(data, encoding): if isinstance(data, bytes): # iterate over bytes to get a sequence of length-1 bytes (what np.array wants) result = [data[i : i + 1] for i in range(len(data))] + # pad or truncate everything to the required bytewidth + result = (result + [b"\0"] * bytewidth)[:bytewidth] else: # If not string/bytes, expect the input to be a list. # N.B. the recursion is inefficient, but we don't care about that here - result = [_make_bytearray_inner(part, encoding) for part in data] + result = [_make_bytearray_inner(part, bytewidth, encoding) for part in data] return result -def make_bytearray(data, encoding="ascii"): +def make_bytearray(data, bytewidth, encoding="ascii"): """Convert bytes or lists of bytes into a numpy byte array. This is largely to avoid using "encode_stringarray_as_bytearray", since we don't want to depend on that when we should be testing it. So, it mostly replicates the function of that, but it does also support bytes in the - input, and it automatically finds + applies the maximum bytes-lengths in the input. + input. """ # First, Convert to a (list of [lists of]..) length-1 bytes objects - data = _make_bytearray_inner(data, encoding) - - # Numbers of bytes in the inner dimension are the lengths of bytes/strings input, - # so they aren't all the same. - # To enable array conversion, we fix that by expanding all to the max length - - def get_maxlen(data): - # Find the maximum number of bytes in the inner dimension. - if not isinstance(data, list): - # Inner bytes object - assert isinstance(data, bytes) - longest = len(data) - else: - # We have a list: either a list of bytes, or a list of lists. 
- if len(data) == 0 or not isinstance(data[0], list): - # inner-most list, should contain bytes if anything - assert len(data) == 0 or isinstance(data[0], bytes) - # return n-bytes - longest = len(data) - else: - # list of lists: return max length of sub-lists - longest = max(get_maxlen(part) for part in data) - return longest - - maxlen = get_maxlen(data) - - def extend_all_to_maxlen(data, length, filler=b"\0"): - # Extend each "innermost" list (of single bytes) to the required length - if isinstance(data, list): - if len(data) == 0 or not isinstance(data[0], list): - # Pad all the inner-most lists to the required number of elements - n_extra = length - len(data) - if n_extra > 0: - data = data + [filler] * n_extra - else: - data = [extend_all_to_maxlen(part, length, filler) for part in data] - return data - - data = extend_all_to_maxlen(data, maxlen) + data = _make_bytearray_inner(data, bytewidth, encoding) # We should now be able to create an array of single bytes. result = np.array(data) assert result.dtype == " Date: Fri, 5 Dec 2025 16:23:55 +0000 Subject: [PATCH 5/7] Add read tests. --- .../netcdf/_bytecoding_datasets.py | 38 +++- .../netcdf/test_bytecoding_datasets.py | 165 ++++++++++++++++-- 2 files changed, 184 insertions(+), 19 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 62e1dd2ab7..3bdc799d7f 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -175,7 +175,7 @@ def __getitem__(self, keys): if DECODE_TO_STRINGS_ON_READ and self._is_chardata(): encoding = self._get_encoding() or DEFAULT_READ_ENCODING # N.B. 
typically, read encoding default is UTF-8 --> a "usually safe" choice - strlen = self._get_string_length() + strlen = self._get_string_width() try: data = decode_bytesarray_to_stringarray(data, encoding, strlen) except UnicodeDecodeError as err: @@ -194,11 +194,11 @@ def __setitem__(self, keys, data): # N.B. we never need to UNset this, as we totally control it self._contained_instance.set_auto_chartostring(False) - encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING # N.B. typically, write encoding default is "ascii" --> fails bad content if data.dtype.kind == "U": try: - strlen = self._get_string_length() + encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING + strlen = self._get_byte_width() data = encode_stringarray_as_bytearray(data, encoding, strlen) except UnicodeEncodeError as err: msg = ( @@ -230,12 +230,36 @@ def _get_encoding(self) -> str | None: return result - def _get_string_length(self): + def _get_byte_width(self) -> int | None: + if not hasattr(self, "_bytewidth"): + n_bytes = self.group().dimensions[self.dimensions[-1]].size + # Cache this length control on the variable -- but not as a netcdf attribute + self.__dict__["_bytewidth"] = n_bytes + + return self.__dict__["_bytewidth"] + + def _get_string_width(self): """Return the string-length defined for this variable.""" if not hasattr(self, "_strlen"): - # Work out the string length from the parent dataset dimensions. - strlen = self.group().dimensions[self.dimensions[-1]].size - # Cache this on the variable -- but not as a netcdf attribute (!) + if hasattr(self, "iris_string_width"): + strlen = self.get_ncattr("iris_string_width") + else: + # Work out the actual byte width from the parent dataset dimensions. + strlen = self._get_byte_width() + # Convert the string dimension length (i.e. bytes) to a sufficiently-long + # string width, depending on the encoding used. 
+ encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # regularise the name for comparison with recognised ones + encoding = codecs.lookup(encoding).name + if "utf-16" in encoding: + # Each char needs at least 2 bytes -- including a terminator char + strlen = (strlen // 2) - 1 + elif "utf-32" in encoding: + # Each char needs exactly 4 bytes -- including a terminator char + strlen = (strlen // 4) - 1 + # "ELSE": assume there can be (at most) as many chars as bytes + + # Cache this length control on the variable -- but not as a netcdf attribute self.__dict__["_strlen"] = strlen return self._strlen diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py index 9ef354f850..5df511103f 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py @@ -9,7 +9,10 @@ import numpy as np import pytest -from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset +from iris.fileformats.netcdf._bytecoding_datasets import ( + DECODE_TO_STRINGS_ON_READ, + EncodedDataset, +) from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper encoding_options = [None, "ascii", "utf-8", "utf-32"] @@ -62,14 +65,17 @@ def fetch_undecoded_var(path, varname): return v +def check_array_matching(arr1, arr2): + """Check for arrays matching shape, dtype and content.""" + assert ( + arr1.shape == arr2.shape and arr1.dtype == arr2.dtype and np.all(arr1 == arr2) + ) + + def check_raw_content(path, varname, expected_byte_array): v = fetch_undecoded_var(path, varname) bytes_result = v[:] - assert ( - bytes_result.shape == expected_byte_array.shape - and bytes_result.dtype == expected_byte_array.dtype - and np.all(bytes_result == expected_byte_array) - ) + check_array_matching(bytes_result, expected_byte_array) def _make_bytearray_inner(data, bytewidth, encoding): @@ -102,7 +108,7 @@ def 
make_bytearray(data, bytewidth, encoding="ascii"): data = _make_bytearray_inner(data, bytewidth, encoding) # We should now be able to create an array of single bytes. result = np.array(data) - assert result.dtype == " string array + result = v[:] + expected = write_strings + if encoding == "utf-8": + # In this case, with the given non-ascii sample data, the + # "default minimum string length" is overestimated. + assert strlen == 7 and result.dtype == "U7" + # correct the result dtype to pass the write_strings comparison below + truncated_result = result.astype("U4") + # Also check that content is the same (i.e. not actually truncated) + assert np.all(truncated_result == result) + result = truncated_result + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = write_bytes + + check_array_matching(result, expected) + + def test_scalar(self, tempdir, readmode): + # Like 'test_write_strings', but the variable has *only* the string dimension. + path = tempdir / f"test_read_scalar_{readmode}.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",)) + + data_string = "stuff" + data_bytes = make_bytearray(data_string, 5) + + # Checks that we *can* write a string + v[:] = data_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = np.array(data_string) + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = data_bytes + + check_array_matching(result, expected) + + def test_multidim(self, tempdir, readmode): + # Like 'test_write_strings', but the variable has additional dimensions. 
+ path = tempdir / f"test_read_multidim_{readmode}.nc" + + strlen = 5 + ds_encoded = make_encoded_dataset(path, strlen=strlen) + ds_encoded.createDimension("y", 2) + v = ds_encoded.createVariable( + "vyxn", + "S1", + ( + "y", + "x", + "strlen", + ), + ) + + # Check that we *can* write a multidimensional string array + test_strings = [ + ["one", "n", ""], + ["two", "xxxxx", "four"], + ] + test_bytes = make_bytearray(test_strings, strlen) + v[:] = test_bytes + + if readmode == "strings": + # Test "normal" read --> string array + result = v[:] + expected = np.array(test_strings) + else: + # Test "raw" read --> byte array + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] + expected = test_bytes + + check_array_matching(result, expected) + + def test_read_encoding_failure(self, tempdir, readmode): + path = tempdir / f"test_read_encoding_failure_{readmode}.nc" + strlen = 10 + ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii") + v = ds.variables["vxs"] + test_utf8_bytes = make_bytearray( + samples_3_nonascii, bytewidth=strlen, encoding="utf-8" + ) + v[:] = test_utf8_bytes + + if readmode == "strings": + msg = ( + "Character data in variable 'vxs' could not be decoded " + "with the 'ascii' encoding." + ) + with pytest.raises(ValueError, match=msg): + v[:] + else: + with DECODE_TO_STRINGS_ON_READ.context(False): + result = v[:] # this ought to be ok! - def test_encodings(self, encoding): - pass + assert np.all(result == test_utf8_bytes) From 046183bb97e6d037c2ebee74a938f98834fc7753 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 5 Dec 2025 16:26:13 +0000 Subject: [PATCH 6/7] Remove iris width control (not in this layer). 
--- .../netcdf/_bytecoding_datasets.py | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py index 3bdc799d7f..5ed156f3ee 100644 --- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py +++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py @@ -241,23 +241,20 @@ def _get_byte_width(self) -> int | None: def _get_string_width(self): """Return the string-length defined for this variable.""" if not hasattr(self, "_strlen"): - if hasattr(self, "iris_string_width"): - strlen = self.get_ncattr("iris_string_width") - else: - # Work out the actual byte width from the parent dataset dimensions. - strlen = self._get_byte_width() - # Convert the string dimension length (i.e. bytes) to a sufficiently-long - # string width, depending on the encoding used. - encoding = self._get_encoding() or DEFAULT_READ_ENCODING - # regularise the name for comparison with recognised ones - encoding = codecs.lookup(encoding).name - if "utf-16" in encoding: - # Each char needs at least 2 bytes -- including a terminator char - strlen = (strlen // 2) - 1 - elif "utf-32" in encoding: - # Each char needs exactly 4 bytes -- including a terminator char - strlen = (strlen // 4) - 1 - # "ELSE": assume there can be (at most) as many chars as bytes + # Work out the actual byte width from the parent dataset dimensions. + strlen = self._get_byte_width() + # Convert the string dimension length (i.e. bytes) to a sufficiently-long + # string width, depending on the encoding used. 
+ encoding = self._get_encoding() or DEFAULT_READ_ENCODING + # regularise the name for comparison with recognised ones + encoding = codecs.lookup(encoding).name + if "utf-16" in encoding: + # Each char needs at least 2 bytes -- including a terminator char + strlen = (strlen // 2) - 1 + elif "utf-32" in encoding: + # Each char needs exactly 4 bytes -- including a terminator char + strlen = (strlen // 4) - 1 + # "ELSE": assume there can be (at most) as many chars as bytes # Cache this length control on the variable -- but not as a netcdf attribute self.__dict__["_strlen"] = strlen From 2002c2a29a63751e07cd0e7d061c45c5031d10d4 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 5 Dec 2025 17:55:12 +0000 Subject: [PATCH 7/7] more notes --- .../fileformats/netcdf/encoding_tests.txt | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt index 5fa021ccdd..07a0bc3bcd 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt +++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt @@ -146,25 +146,21 @@ Then, as regards the _Encoding .. TO TEST... 
========== -NOTE on length control: - - not an API thing, it's implicit from when you create a variable - - this also applies to how it loads back - - BUT here there may be scope for a control attribute : - +++ create a dataset + write char data +++ - X assign different encodings: makes no difference +++ create a dataset + write STRING data +++ - X encoding=(ascii, utf-8, utf-32, None) +++ - X withnonascii=(T, F) - - X length=(long, short, none) +XXXX - X length=(long, short, none) + ***deferred*** to layer above only -read string data - - X encoding=(ascii, utf-8, utf-32, None) - - X withnonascii=(T, F) ++++ read string data ++++ - X encoding=(ascii, utf-8, utf-32, None) ++++ - X withnonascii=(T, F) -read char data (with control) - - X different encodings: make no difference ++++ read char data (with control) ++++ - X different encodings: make no difference ==rethought== write strings @@ -185,3 +181,11 @@ write char data read char data - X encodings: don't matter +--- +NOTEs on length control: +not an API thing, it's implicit from when you create a variable +this also applies to how it loads back +BUT here there may be scope for a control attribute : + "iris_string_dim" - controls width on creation + reading back + +