From 959ac7975d2596e8ee38366f8ff932fc2a03b020 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sat, 25 Oct 2025 00:18:03 +0100 Subject: [PATCH 01/10] Initial tests. --- lib/iris/fileformats/cf.py | 2 +- .../integration/netcdf/test_chararrays.py | 112 ++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 lib/iris/tests/integration/netcdf/test_chararrays.py diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index 2b6568c315..b65ab70792 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -802,7 +802,7 @@ def cf_label_data(self, cf_data_var): label_data = self[:] if ma.isMaskedArray(label_data): - label_data = label_data.filled() + label_data = label_data.filled(b"\0") # Determine whether we have a string-valued scalar label # i.e. a character variable that only has one dimension (the length of the string). diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py new file mode 100644 index 0000000000..feb93047dd --- /dev/null +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -0,0 +1,112 @@ +import netCDF4 as nc +import numpy as np +import pytest + +import iris + +NX, N_STRLEN = 3, 64 +TEST_STRINGS = ["Münster", "London", "Amsterdam"] +TEST_COORD_VALS = ["bun", "éclair", "sandwich"] + + +def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): + bbytes = [text.encode(encoding) for text in string_array_1d] + pad = b"\0" * maxlen + bbytes = [(x + pad)[:maxlen] for x in bbytes] + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) + return chararray + + +INCLUDE_COORD = True +# INCLUDE_COORD = False + + +def make_testfile(filepath, chararray, coordarray, encoding_str=None): + with nc.Dataset(filepath, "w") as ds: + ds.createDimension("x", NX) + ds.createDimension("nstr", N_STRLEN) + vx = ds.createVariable("x", int, dimensions=("x")) + vx[:] = np.arange(NX) + if INCLUDE_COORD: + ds.createDimension("nstr2", N_STRLEN) + v_co = ds.createVariable( + "v_co", + "S1", + dimensions=( + "x", + "nstr2", + ), + ) + v_co[:] = coordarray + if encoding_str is not None: + v_co._Encoding = encoding_str + v = ds.createVariable( + "v", + "S1", + dimensions=( + "x", + "nstr", + ), + ) + v[:] = chararray + if encoding_str is not None: + v._Encoding = encoding_str + if INCLUDE_COORD: + v.coordinates = "v_co" + + +def show_result(filepath): + from pp_utils import ncdump + + print(f"File {filepath}") + print("NCDUMP:") + ncdump(filepath, "") + # with nc.Dataset(filepath, "r") as ds: + # v = ds.variables["v"] + # print("\n----\nNetcdf data readback (basic)") + # try: + # print(repr(v[:])) + # except UnicodeDecodeError as err: + # print(repr(err)) + # print("..raw:") + # v.set_auto_chartostring(False) + # print(repr(v[:])) + print("\nAs iris cube..") + try: + cube = iris.load_cube(filepath) + print(cube) + if iris.loading.LOAD_PROBLEMS._problems: + print(iris.loading.LOAD_PROBLEMS) + print( + "\n".join(iris.loading.LOAD_PROBLEMS._problems[0].stack_trace.format()) + ) + print("-data-") + print(repr(cube.data)) + if INCLUDE_COORD: + print("-coord data-") + try: + print(repr(cube.coord("v_co").points)) + except Exception as err2: + print(repr(err2)) + except UnicodeDecodeError as err: + print(repr(err)) + + +# tsts = (None, "ascii", "utf-8", "utf-32",) +# tsts = ("utf-8",) +# tsts = ("utf-8", "utf-32",) +# tsts = ("utf-32",) +tsts = ("utf-8", "ascii", "utf-8") + + +@pytest.mark.parametrize("encoding", tsts) +def test_encodings(encoding): + print(f"\n=========\nTesting encoding: {encoding}") + filepath = f"tmp_{str(encoding)}.nc" + do_as = encoding + if encoding != "utf-32": + do_as = "utf-8" + TEST_CHARARRAY = convert_chararray(TEST_STRINGS, N_STRLEN, encoding=do_as) + TEST_COORDARRAY = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as) + make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) + show_result(filepath) From 814932eba65a43716acb161103a611b5010c77ed Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sat, 25 Oct 2025 01:22:30 +0100 Subject: [PATCH 02/10] Get 'create_cf_data_variable' to call 'create_generic_cf_array_var': Mostly working? Get 'create_cf_data_variable' to call 'create_generic_cf_array_var': Mostly working? --- .../fileformats/_nc_load_rules/helpers.py | 8 +- lib/iris/fileformats/netcdf/saver.py | 158 +++++++++--------- .../integration/netcdf/test_chararrays.py | 1 + 3 files changed, 85 insertions(+), 82 deletions(-) diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index 35c2e96924..50e282db5f 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -708,13 +708,13 @@ def build_and_add_global_attributes(engine: Engine): ), ) if problem is not None: - stack_notes = problem.stack_trace.__notes__ + stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined] if stack_notes is None: stack_notes = [] stack_notes.append( f"Skipping disallowed global attribute '{attr_name}' (see above error)" ) - problem.stack_trace.__notes__ = stack_notes + problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined] ################################################################################ @@ -1536,14 +1536,14 @@ def build_and_add_dimension_coordinate( ) if problem is not None: coord_var_name = str(cf_coord_var.cf_name) - stack_notes = problem.stack_trace.__notes__ + stack_notes = problem.stack_trace.__notes__ # type: ignore[attr-defined] if stack_notes is None: stack_notes = [] stack_notes.append( f"Failed to create {coord_var_name} dimension coordinate:\n" f"Gracefully creating {coord_var_name!r} auxiliary coordinate instead." ) - problem.stack_trace.__notes__ = stack_notes + problem.stack_trace.__notes__ = stack_notes # type: ignore[attr-defined] problem.handled = True _ = _add_or_capture( diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 5177749c07..bd4e87471f 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -759,7 +759,7 @@ def _create_cf_dimensions(self, cube, dimension_names, unlimited_dimensions=None # used for a different one pass else: - dim_name = self._get_coord_variable_name(cube, coord) + dim_name = self._get_element_variable_name(cube, coord) unlimited_dim_names.append(dim_name) for dim_name in dimension_names: @@ -990,12 +990,12 @@ def _add_aux_coords( ] # Include any relevant mesh location coordinates. - mesh: MeshXY | None = getattr(cube, "mesh") - mesh_location: str | None = getattr(cube, "location") + mesh: MeshXY | None = getattr(cube, "mesh") # type: ignore[annotation-unchecked] + mesh_location: str | None = getattr(cube, "location") # type: ignore[annotation-unchecked] if mesh and mesh_location: location_coords: MeshNodeCoords | MeshEdgeCoords | MeshFaceCoords = getattr( mesh, f"{mesh_location}_coords" - ) + ) # type: ignore[annotation-unchecked] coords_to_add.extend(list(location_coords)) return self._add_inner_related_vars( @@ -1365,7 +1365,7 @@ def record_dimension(names_list, dim_name, length, matching_coords=None): if dim_name is None: # Not already present : create a unique dimension name # from the coord. - dim_name = self._get_coord_variable_name(cube, coord) + dim_name = self._get_element_variable_name(cube, coord) # Disambiguate if it has the same name as an # existing dimension. # OR if it matches an existing file variable name. @@ -1541,38 +1541,14 @@ def _create_cf_bounds(self, coord, cf_var, cf_name, /, *, compression_kwargs=Non ) self._lazy_stream_data(data=bounds, cf_var=cf_var_bounds) - def _get_cube_variable_name(self, cube): - """Return a CF-netCDF variable name for the given cube. - - Parameters - ---------- - cube : :class:`iris.cube.Cube` - An instance of a cube for which a CF-netCDF variable - name is required. - - Returns - ------- - str - A CF-netCDF variable name as a string. - - """ - if cube.var_name is not None: - cf_name = cube.var_name - else: - # Convert to lower case and replace whitespace by underscores. - cf_name = "_".join(cube.name().lower().split()) - - cf_name = self.cf_valid_var_name(cf_name) - return cf_name - - def _get_coord_variable_name(self, cube_or_mesh, coord): - """Return a CF-netCDF variable name for a given coordinate-like element. + def _get_element_variable_name(self, cube_or_mesh, element): + """Return a CF-netCDF variable name for a given coordinate-like element, or cube. Parameters ---------- cube_or_mesh : :class:`iris.cube.Cube` or :class:`iris.mesh.MeshXY` The Cube or Mesh being saved to the netCDF file. - coord : :class:`iris.coords._DimensionalMetadata` + element : :class:`iris.coords._DimensionalMetadata` | :class:``iris.cube.Cube`` An instance of a coordinate (or similar), for which a CF-netCDF variable name is required. @@ -1592,17 +1568,21 @@ def _get_coord_variable_name(self, cube_or_mesh, coord): cube = None mesh = cube_or_mesh - if coord.var_name is not None: - cf_name = coord.var_name + if element.var_name is not None: + cf_name = element.var_name + elif isinstance(element, Cube): + # Make name for a Cube without a var_name. + cf_name = "_".join(element.name().lower().split()) else: - name = coord.standard_name or coord.long_name + # Make name for a Coord-like element without a var_name + name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.coords(coord): - # It is a regular cube coordinate. + if cube is not None and cube.elements(element): + # It is a regular cube elementinate. # Auto-generate a name based on the dims. name = "" - for dim in cube.coord_dims(coord): + for dim in cube.coord_dims(element): name += f"dim{dim}" # Handle scalar coordinate (dims == ()). if not name: @@ -1616,8 +1596,8 @@ def _get_coord_variable_name(self, cube_or_mesh, coord): # At present, a location-coord cannot be nameless, as the # MeshXY code relies on guess_coord_axis. - assert isinstance(coord, Connectivity) - location = coord.cf_role.split("_")[0] + assert isinstance(element, Connectivity) + location = element.cf_role.split("_")[0] location_dim_attr = f"{location}_dimension" name = getattr(mesh, location_dim_attr) @@ -1693,6 +1673,8 @@ def _create_mesh(self, mesh): return cf_mesh_name def _set_cf_var_attributes(self, cf_var, element): + from iris.cube import Cube + # Deal with CF-netCDF units, and add the name+units properties. if isinstance(element, iris.coords.Coord): # Fix "degree" units if needed. @@ -1715,19 +1697,21 @@ def _set_cf_var_attributes(self, cf_var, element): if element.units.calendar: _setncattr(cf_var, "calendar", str(element.units.calendar)) - # Add any other custom coordinate attributes. - for name in sorted(element.attributes): - value = element.attributes[name] + if not isinstance(element, Cube): + # Add any other custom coordinate attributes. + # N.B. not Cube, which has specific handling in _create_cf_data_variable + for name in sorted(element.attributes): + value = element.attributes[name] - if name == "STASH": - # Adopting provisional Metadata Conventions for representing MO - # Scientific Data encoded in NetCDF Format. - name = "um_stash_source" - value = str(value) + if name == "STASH": + # Adopting provisional Metadata Conventions for representing MO + # Scientific Data encoded in NetCDF Format. + name = "um_stash_source" + value = str(value) - # Don't clobber existing attributes. - if not hasattr(cf_var, name): - _setncattr(cf_var, name, value) + # Don't clobber existing attributes. + if not hasattr(cf_var, name): + _setncattr(cf_var, name, value) def _create_generic_cf_array_var( self, @@ -1739,6 +1723,7 @@ def _create_generic_cf_array_var( element_dims=None, fill_value=None, compression_kwargs=None, + is_dataless=False, ): """Create theCF-netCDF variable given dimensional_metadata. @@ -1791,7 +1776,7 @@ def _create_generic_cf_array_var( # Work out the var-name to use. # N.B. the only part of this routine that may use a mesh _or_ a cube. - cf_name = self._get_coord_variable_name(cube_or_mesh, element) + cf_name = self._get_element_variable_name(cube_or_mesh, element) while cf_name in self._dataset.variables: cf_name = self._increment_name(cf_name) @@ -1804,10 +1789,13 @@ def _create_generic_cf_array_var( # Get the data values, in a way which works for any element type, as # all are subclasses of _DimensionalMetadata. # (e.g. =points if a coord, =data if an ancillary, etc) - data = element._core_values() + if isinstance(element, Cube): + data = element.core_data() + else: + data = element._core_values() # This compression contract is *not* applicable to a mesh. - if cube and cube.shape != data.shape: + if cube is not None and data is not None and cube.shape != data.shape: compression_kwargs = {} if np.issubdtype(data.dtype, np.str_): @@ -1837,11 +1825,13 @@ def _create_generic_cf_array_var( # Convert data from an array of strings into a character array # with an extra string-length dimension. if len(element_dims) == 1: + # Scalar variable (only has string dimension). data_first = data[0] if is_lazy_data(data_first): data_first = dask.compute(data_first) data = list("%- *s" % (string_dimension_depth, data_first)) else: + # NOTE: at present, can't do this lazily?? orig_shape = data.shape new_shape = orig_shape + (string_dimension_depth,) new_data = np.zeros(new_shape, cf_var.dtype) @@ -1850,7 +1840,7 @@ def _create_generic_cf_array_var( new_data[index_slice] = list( "%- *s" % (string_dimension_depth, data[index]) ) - data = new_data + data = new_data else: # A normal (numeric) variable. # ensure a valid datatype for the file format. @@ -1887,7 +1877,8 @@ def _create_generic_cf_array_var( ) # Add the data to the CF-netCDF variable. - self._lazy_stream_data(data=data, cf_var=cf_var) + if not is_dataless: + self._lazy_stream_data(data=data, cf_var=cf_var) # Add names + units self._set_cf_var_attributes(cf_var, element) @@ -2238,9 +2229,9 @@ def _create_cf_grid_mapping(self, cube, cf_var_cube): cfvar = self._name_coord_map.name(coord) if not cfvar: # not found - create and store it: - cfvar = self._get_coord_variable_name(cube, coord) + cfvar = self._get_element_variable_name(cube, coord) self._name_coord_map.append( - cfvar, self._get_coord_variable_name(cube, coord) + cfvar, self._get_element_variable_name(cube, coord) ) cfvar_names.append(cfvar) @@ -2383,32 +2374,43 @@ def set_packing_ncattrs(cfvar): if add_offset: _setncattr(cfvar, "add_offset", add_offset) - cf_name = self._get_cube_variable_name(cube) - while cf_name in self._dataset.variables: - cf_name = self._increment_name(cf_name) - + # cf_name = self._get_element_variable_name(cube_or_mesh=None, element=cube) + # while cf_name in self._dataset.variables: + # cf_name = self._increment_name(cf_name) + # + # cf_var = self._dataset.createVariable( + # cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs + # ) # Create the cube CF-netCDF data variable with data payload. - cf_var = self._dataset.createVariable( - cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs + cf_name = self._create_generic_cf_array_var( + cube, + dimension_names, + cube, + element_dims=dimension_names, + fill_value=fill_value, + compression_kwargs=kwargs, + is_dataless=is_dataless, ) + cf_var = self._dataset.variables[cf_name] if not is_dataless: set_packing_ncattrs(cf_var) - self._lazy_stream_data(data=data, cf_var=cf_var) - - if cube.standard_name: - _setncattr(cf_var, "standard_name", cube.standard_name) - - if cube.long_name: - _setncattr(cf_var, "long_name", cube.long_name) - - if cube.units.is_udunits(): - _setncattr(cf_var, "units", str(cube.units)) - - # Add the CF-netCDF calendar attribute. - if cube.units.calendar: - _setncattr(cf_var, "calendar", cube.units.calendar) + # if cube.standard_name: + # _setncattr(cf_var, "standard_name", cube.standard_name) + # + # if cube.long_name: + # _setncattr(cf_var, "long_name", cube.long_name) + # + # if cube.units.is_udunits(): + # _setncattr(cf_var, "units", str(cube.units)) + # + # # Add the CF-netCDF calendar attribute. + # if cube.units.calendar: + # _setncattr(cf_var, "calendar", cube.units.calendar) + + # Set attributes: NB this part is cube-specific (not the same for components) + # - therefore 'set_cf_var_attributes' doesn't set attributes if element is a Cube if iris.FUTURE.save_split_attrs: attr_names = cube.attributes.locals.keys() else: diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index feb93047dd..a3ce9f9128 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -101,6 +101,7 @@ def show_result(filepath): @pytest.mark.parametrize("encoding", tsts) def test_encodings(encoding): + # small change print(f"\n=========\nTesting encoding: {encoding}") filepath = f"tmp_{str(encoding)}.nc" do_as = encoding From 24c299f36bd8ba4a4dbc8792e7406affad87bab1 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Tue, 28 Oct 2025 21:11:15 +0000 Subject: [PATCH 03/10] Reinstate decode on load, now in-Iris coded. --- .../fileformats/_nc_load_rules/helpers.py | 10 ++- lib/iris/fileformats/cf.py | 18 +++++- .../fileformats/netcdf/_thread_safe_nc.py | 45 +++++++++++-- lib/iris/fileformats/netcdf/loader.py | 38 ++++++++++- lib/iris/fileformats/netcdf/saver.py | 4 +- .../integration/netcdf/test_chararrays.py | 64 ++++++++++++++++++- lib/iris/util.py | 21 ++++++ 7 files changed, 184 insertions(+), 16 deletions(-) diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py index 50e282db5f..fa63002f09 100644 --- a/lib/iris/fileformats/_nc_load_rules/helpers.py +++ b/lib/iris/fileformats/_nc_load_rules/helpers.py @@ -1643,9 +1643,13 @@ def _add_auxiliary_coordinate( # Determine the name of the dimension/s shared between the CF-netCDF data variable # and the coordinate being built. - common_dims = [ - dim for dim in cf_coord_var.dimensions if dim in engine.cf_var.dimensions - ] + coord_dims = cf_coord_var.dimensions + if cf._is_str_dtype(cf_coord_var): + coord_dims = coord_dims[:-1] + datavar_dims = engine.cf_var.dimensions + if cf._is_str_dtype(engine.cf_var): + datavar_dims = datavar_dims[:-1] + common_dims = [dim for dim in coord_dims if dim in datavar_dims] data_dims = None if common_dims: # Calculate the offset of each common dimension. diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py index b65ab70792..5abc525109 100644 --- a/lib/iris/fileformats/cf.py +++ b/lib/iris/fileformats/cf.py @@ -790,15 +790,27 @@ def cf_label_data(self, cf_data_var): # Determine the name of the label string (or length) dimension by # finding the dimension name that doesn't exist within the data dimensions. - str_dim_name = list(set(self.dimensions) - set(cf_data_var.dimensions)) + str_dim_names = list(set(self.dimensions) - set(cf_data_var.dimensions)) + n_nondata_dims = len(str_dim_names) + + if n_nondata_dims == 0: + # *All* dims are shared with the data-variable. + # This is only ok if the data-var is *also* a string type. + dim_ok = _is_str_dtype(cf_data_var) + # In this case, we must just *assume* that the last dimension is "the" + # string dimension + str_dim_name = self.dimensions[-1] + else: + # If there is exactly one non-data dim, that is the one we want + dim_ok = len(str_dim_names) == 1 + (str_dim_name,) = str_dim_names - if len(str_dim_name) != 1: + if not dim_ok: raise ValueError( "Invalid string dimensions for CF-netCDF label variable %r" % self.cf_name ) - str_dim_name = str_dim_name[0] label_data = self[:] if ma.isMaskedArray(label_data): diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 35588eb2c4..9e18be4395 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -310,14 +310,39 @@ def fromcdl(cls, *args, **kwargs): class NetCDFDataProxy: """A reference to the data payload of a single NetCDF file variable.""" - __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value") - - def __init__(self, shape, dtype, path, variable_name, fill_value): + __slots__ = ( + "shape", + "dtype", + "path", + "variable_name", + "fill_value", + "is_bytes", + "encoding", + "string_length", + ) + + def __init__( + self, + shape, + dtype, + path, + variable_name, + fill_value, + encoding: str | None = None, + string_length: int = 0, + ): self.shape = shape self.dtype = dtype self.path = path self.variable_name = variable_name self.fill_value = fill_value + self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1 + if self.is_bytes: + # We will be returning a different shape : the last dim is the byte-length + self.shape = self.shape[:-1] + self.dtype = np.dtype(f"U{string_length}") + self.encoding = encoding + self.string_length = string_length @property def ndim(self): @@ -337,10 +362,20 @@ def __getitem__(self, keys): try: variable = dataset.variables[self.variable_name] # Get the NetCDF variable data and slice. - var = variable[keys] + data = variable[keys] + + # If bytes, decode to strings + if self.is_bytes: + from iris.util import convert_bytesarray_to_strings + + data = convert_bytesarray_to_strings( + data, + encoding=self.encoding, + string_length=self.string_length, + ) finally: dataset.close() - return np.asanyarray(var) + return np.asanyarray(data) def __repr__(self): fmt = ( diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index e8d283beb8..6c0b599a4d 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -11,6 +11,7 @@ """ +import codecs from collections.abc import Iterable, Iterator, Mapping from contextlib import contextmanager from copy import deepcopy @@ -269,10 +270,36 @@ def _get_cf_var_data(cf_var): # Normal NCVariable type: total_bytes = cf_var.size * cf_var.dtype.itemsize + default_encoding = "utf-8" + encoding = getattr(cf_var, "_Encoding", None) + if encoding is None: + # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data + encoding = default_encoding + else: + try: + # Accept + normalise naming of encodings + encoding = codecs.lookup(encoding).name + # NOTE: if encoding does not suit data, errors can occur. + # For example, _Encoding = "ascii", with non-ascii content. + except LookupError: + # Replace some invalid setting with "safe"(ish) fallback. + encoding = default_encoding + + string_length = getattr(cf_var, "iris_string_length", None) + if total_bytes < _LAZYVAR_MIN_BYTES: # Don't make a lazy array, as it will cost more memory AND more time to access. result = cf_var[:] + if result.dtype.kind == "S": + from iris.util import convert_bytesarray_to_strings + + result = convert_bytesarray_to_strings( + result, + encoding=encoding, + string_length=string_length, + ) + # Special handling of masked scalar value; this will be returned as # an `np.ma.masked` instance which will lose the original dtype. # Workaround for this it return a 1-element masked array of the @@ -295,8 +322,17 @@ def _get_cf_var_data(cf_var): "_FillValue", _thread_safe_nc.default_fillvals[fill_dtype], ) + + # NOTE: if the data is bytes which need to be converted to strings on read, + # the data-proxy will do that (and it modifies its shape + dtype). proxy = NetCDFDataProxy( - cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value + cf_var.shape, + dtype, + cf_var.filename, + cf_var.cf_name, + fill_value, + encoding=encoding, + string_length=string_length, ) # Get the chunking specified for the variable : this is either a shape, or # maybe the string "contiguous". diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index bd4e87471f..d885387a7f 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1578,8 +1578,8 @@ def _get_element_variable_name(self, cube_or_mesh, element): name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.elements(element): - # It is a regular cube elementinate. + if cube is not None and cube.coords(element): + # It is a regular cube coordinate. # Auto-generate a name based on the dims. name = "" for dim in cube.coord_dims(element): diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index a3ce9f9128..8f29fcdcd5 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -4,10 +4,18 @@ import iris +iris.FUTURE.save_split_attrs = True + + NX, N_STRLEN = 3, 64 TEST_STRINGS = ["Münster", "London", "Amsterdam"] TEST_COORD_VALS = ["bun", "éclair", "sandwich"] +# VARS_COORDS_SHARE_STRING_DIM = True +VARS_COORDS_SHARE_STRING_DIM = False +if VARS_COORDS_SHARE_STRING_DIM: + TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one + def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): bbytes = [text.encode(encoding) for text in string_array_1d] @@ -17,9 +25,33 @@ def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): return chararray +def convert_bytesarray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. for now at least, we assume the string dim is **always the last one**. + """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result + + INCLUDE_COORD = True # INCLUDE_COORD = False +INCLUDE_NUMERIC_AUXCOORD = True +# INCLUDE_NUMERIC_AUXCOORD = False + def make_testfile(filepath, chararray, coordarray, encoding_str=None): with nc.Dataset(filepath, "w") as ds: @@ -40,6 +72,13 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): v_co[:] = coordarray if encoding_str is not None: v_co._Encoding = encoding_str + if INCLUDE_NUMERIC_AUXCOORD: + v_num = ds.createVariable( + "v_num", + float, + dimensions=("x",), + ) + v_num[:] = np.arange(NX) v = ds.createVariable( "v", "S1", @@ -52,7 +91,10 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): if encoding_str is not None: v._Encoding = encoding_str if INCLUDE_COORD: - v.coordinates = "v_co" + coords_str = "v_co" + if INCLUDE_NUMERIC_AUXCOORD: + coords_str += " v_num" + v.coordinates = coords_str def show_result(filepath): @@ -82,8 +124,10 @@ def show_result(filepath): ) print("-data-") print(repr(cube.data)) + print("-numeric auxcoord data-") + print(repr(cube.coord("x").points)) if INCLUDE_COORD: - print("-coord data-") + print("-string auxcoord data-") try: print(repr(cube.coord("v_co").points)) except Exception as err2: @@ -111,3 +155,19 @@ def test_encodings(encoding): TEST_COORDARRAY = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as) make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) show_result(filepath) + + +# @pytest.mark.parametrize("ndim", [1, 2]) +# def test_convert_bytes_to_strings(ndim: int): +# if ndim == 1: +# source = convert_strings_to_chararray(TEST_STRINGS, 16) +# elif ndim == 2: +# source = np.stack([ +# convert_strings_to_chararray(TEST_STRINGS, 16), +# convert_strings_to_chararray(TEST_COORD_VALS, 16), +# ]) +# else: +# raise ValueError(f"Unexpected param ndim={ndim}.") +# # convert the strings to bytes +# result = convert_bytesarray_to_strings(source) +# print(result) diff --git a/lib/iris/util.py b/lib/iris/util.py index b3ce7941c5..d154d42e16 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -2999,3 +2999,24 @@ def set( # Global CML settings object for use as context manager CML_SETTINGS: CMLSettings = CMLSettings() + + +def convert_bytesarray_to_strings( + byte_array, encoding="utf-8", string_length: int | None = None +): + """Convert bytes to strings. + + N.B. for now at least, we assume the string dim is **always the last one**. + """ + bytes_shape = byte_array.shape + var_shape = bytes_shape[:-1] + if string_length is None: + string_length = bytes_shape[-1] + string_dtype = f"U{string_length}" + result = np.empty(var_shape, dtype=string_dtype) + for ndindex in np.ndindex(var_shape): + element_bytes = byte_array[ndindex] + bytes = b"".join([b if b else b"\0" for b in element_bytes]) + string = bytes.decode(encoding) + result[ndindex] = string + return result From 29142dc80434e01662751f386d6f7cae7c8c57e6 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sun, 7 Dec 2025 00:34:53 +0000 Subject: [PATCH 04/10] Revert and amend. --- .../fileformats/netcdf/_thread_safe_nc.py | 45 +++---------------- lib/iris/fileformats/netcdf/loader.py | 38 +--------------- lib/iris/fileformats/netcdf/saver.py | 4 +- lib/iris/util.py | 21 --------- 4 files changed, 8 insertions(+), 100 deletions(-) diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 9e18be4395..35588eb2c4 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -310,39 +310,14 @@ def fromcdl(cls, *args, **kwargs): class NetCDFDataProxy: """A reference to the data payload of a single NetCDF file variable.""" - __slots__ = ( - "shape", - "dtype", - "path", - "variable_name", - "fill_value", - "is_bytes", - "encoding", - "string_length", - ) - - def __init__( - self, - shape, - dtype, - path, - variable_name, - fill_value, - encoding: str | None = None, - string_length: int = 0, - ): + __slots__ = ("shape", "dtype", "path", "variable_name", "fill_value") + + def __init__(self, shape, dtype, path, variable_name, fill_value): self.shape = shape self.dtype = dtype self.path = path self.variable_name = variable_name self.fill_value = fill_value - self.is_bytes = dtype.kind == "S" and dtype.itemsize == 1 - if self.is_bytes: - # We will be returning a different shape : the last dim is the byte-length - self.shape = self.shape[:-1] - self.dtype = np.dtype(f"U{string_length}") - self.encoding = encoding - self.string_length = string_length @property def ndim(self): @@ -362,20 +337,10 @@ def __getitem__(self, keys): try: variable = dataset.variables[self.variable_name] # Get the NetCDF variable data and slice. - data = variable[keys] - - # If bytes, decode to strings - if self.is_bytes: - from iris.util import convert_bytesarray_to_strings - - data = convert_bytesarray_to_strings( - data, - encoding=self.encoding, - string_length=self.string_length, - ) + var = variable[keys] finally: dataset.close() - return np.asanyarray(data) + return np.asanyarray(var) def __repr__(self): fmt = ( diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index 6c0b599a4d..e8d283beb8 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -11,7 +11,6 @@ """ -import codecs from collections.abc import Iterable, Iterator, Mapping from contextlib import contextmanager from copy import deepcopy @@ -270,36 +269,10 @@ def _get_cf_var_data(cf_var): # Normal NCVariable type: total_bytes = cf_var.size * cf_var.dtype.itemsize - default_encoding = "utf-8" - encoding = getattr(cf_var, "_Encoding", None) - if encoding is None: - # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data - encoding = default_encoding - else: - try: - # Accept + normalise naming of encodings - encoding = codecs.lookup(encoding).name - # NOTE: if encoding does not suit data, errors can occur. - # For example, _Encoding = "ascii", with non-ascii content. - except LookupError: - # Replace some invalid setting with "safe"(ish) fallback. - encoding = default_encoding - - string_length = getattr(cf_var, "iris_string_length", None) - if total_bytes < _LAZYVAR_MIN_BYTES: # Don't make a lazy array, as it will cost more memory AND more time to access. result = cf_var[:] - if result.dtype.kind == "S": - from iris.util import convert_bytesarray_to_strings - - result = convert_bytesarray_to_strings( - result, - encoding=encoding, - string_length=string_length, - ) - # Special handling of masked scalar value; this will be returned as # an `np.ma.masked` instance which will lose the original dtype. # Workaround for this it return a 1-element masked array of the @@ -322,17 +295,8 @@ def _get_cf_var_data(cf_var): "_FillValue", _thread_safe_nc.default_fillvals[fill_dtype], ) - - # NOTE: if the data is bytes which need to be converted to strings on read, - # the data-proxy will do that (and it modifies its shape + dtype). proxy = NetCDFDataProxy( - cf_var.shape, - dtype, - cf_var.filename, - cf_var.cf_name, - fill_value, - encoding=encoding, - string_length=string_length, + cf_var.shape, dtype, cf_var.filename, cf_var.cf_name, fill_value ) # Get the chunking specified for the variable : this is either a shape, or # maybe the string "contiguous". diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index d885387a7f..bd4e87471f 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1578,8 +1578,8 @@ def _get_element_variable_name(self, cube_or_mesh, element): name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.coords(element): - # It is a regular cube coordinate. + if cube is not None and cube.elements(element): + # It is a regular cube elementinate. # Auto-generate a name based on the dims. name = "" for dim in cube.coord_dims(element): diff --git a/lib/iris/util.py b/lib/iris/util.py index d154d42e16..b3ce7941c5 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -2999,24 +2999,3 @@ def set( # Global CML settings object for use as context manager CML_SETTINGS: CMLSettings = CMLSettings() - - -def convert_bytesarray_to_strings( - byte_array, encoding="utf-8", string_length: int | None = None -): - """Convert bytes to strings. - - N.B. for now at least, we assume the string dim is **always the last one**. - """ - bytes_shape = byte_array.shape - var_shape = bytes_shape[:-1] - if string_length is None: - string_length = bytes_shape[-1] - string_dtype = f"U{string_length}" - result = np.empty(var_shape, dtype=string_dtype) - for ndindex in np.ndindex(var_shape): - element_bytes = byte_array[ndindex] - bytes = b"".join([b if b else b"\0" for b in element_bytes]) - string = bytes.decode(encoding) - result[ndindex] = string - return result From 1e10dc8a164424a7cf4e4bf22f3df94641eb6bff Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 12:23:07 +0000 Subject: [PATCH 05/10] Hack to preserve the existing order of attributes on saved Coords and Cubes. --- lib/iris/fileformats/netcdf/saver.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index bd4e87471f..8fb0fec377 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1682,12 +1682,26 @@ def _set_cf_var_attributes(self, cf_var, element): else: units_str = str(element.units) - if cf_units.as_unit(units_str).is_udunits(): - _setncattr(cf_var, "units", units_str) + # NB this bit is a nasty hack to preserve existing behaviour through a refactor: + # The attributes for Coords are created in the order units, standard_name, + # whereas for data-variables (aka Cubes) it is the other way around. + # Needed now that this routine is also called from _create_cf_data_variable. + # TODO: when we can break things, rationalise these to be the same. + def add_units(): + if cf_units.as_unit(units_str).is_udunits(): + _setncattr(cf_var, "units", units_str) + + def add_stdname(): + standard_name = element.standard_name + if standard_name is not None: + _setncattr(cf_var, "standard_name", standard_name) - standard_name = element.standard_name - if standard_name is not None: - _setncattr(cf_var, "standard_name", standard_name) + if isinstance(element, Cube): + add_stdname() + add_units() + else: + add_units() + add_stdname() long_name = element.long_name if long_name is not None: From d853a800ac7b546a3db71a2e348cee0ac15ed8c5 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 14:54:33 +0000 Subject: [PATCH 06/10] Fix for dataless; avoid FUTURE global state change from temporary tests. --- lib/iris/fileformats/netcdf/saver.py | 30 ++++---- .../integration/netcdf/test_chararrays.py | 72 ++++++++++++++++--- 2 files changed, 75 insertions(+), 27 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 8fb0fec377..c2522d8867 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1812,7 +1812,7 @@ def _create_generic_cf_array_var( if cube is not None and data is not None and cube.shape != data.shape: compression_kwargs = {} - if np.issubdtype(data.dtype, np.str_): + if not is_dataless and np.issubdtype(data.dtype, np.str_): # Deal with string-type variables. # Typically CF label variables, but also possibly ancil-vars ? string_dimension_depth = data.dtype.itemsize @@ -1858,8 +1858,13 @@ def _create_generic_cf_array_var( else: # A normal (numeric) variable. # ensure a valid datatype for the file format. - element_type = type(element).__name__ - data = self._ensure_valid_dtype(data, element_type, element) + if is_dataless: + dtype = self._DATALESS_DTYPE + fill_value = self._DATALESS_FILLVALUE + else: + element_type = type(element).__name__ + data = self._ensure_valid_dtype(data, element_type, element) + dtype = data.dtype.newbyteorder("=") # Check if this is a dim-coord. is_dimcoord = cube is not None and element in cube.dim_coords @@ -1873,7 +1878,7 @@ def _create_generic_cf_array_var( # Create the CF-netCDF variable. cf_var = self._dataset.createVariable( cf_name, - data.dtype.newbyteorder("="), + dtype, element_dims, fill_value=fill_value, **compression_kwargs, @@ -2325,19 +2330,12 @@ def _create_cf_data_variable( # be removed. # Get the values in a form which is valid for the file format. is_dataless = cube.is_dataless() - if is_dataless: - data = None - else: - data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) - if is_dataless: - # The variable must have *some* dtype, and it must be maskable - dtype = self._DATALESS_DTYPE - fill_value = self._DATALESS_FILLVALUE - elif not packing: - dtype = data.dtype.newbyteorder("=") - else: - if isinstance(packing, dict): + if not is_dataless: + data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) + if not packing: + dtype = data.dtype.newbyteorder("=") + elif isinstance(packing, dict): if "dtype" not in packing: msg = "The dtype attribute is required for packing." raise ValueError(msg) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index 8f29fcdcd5..c8bba94671 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -3,9 +3,8 @@ import pytest import iris - -iris.FUTURE.save_split_attrs = True - +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube NX, N_STRLEN = 3, 64 TEST_STRINGS = ["Münster", "London", "Amsterdam"] @@ -17,7 +16,13 @@ TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one -def convert_chararray(string_array_1d, maxlen, encoding="utf-8"): +@pytest.fixture(scope="module", autouse=True) +def enable_split_attrs(): + with iris.FUTURE.context(save_split_attrs=True): + yield + + +def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"): bbytes = [text.encode(encoding) for text in string_array_1d] pad = b"\0" * maxlen bbytes = [(x + pad)[:maxlen] for x in bbytes] @@ -97,6 +102,23 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): v.coordinates = coords_str +def make_testcube( + dataarray, + coordarray, # for now, these are always *string* arrays + encoding_str: str | None = None, +): + cube = Cube(dataarray, var_name="v") + cube.add_dim_coord(DimCoord(np.arange(NX), var_name="x"), 0) + if encoding_str is not None: + cube.attributes["_Encoding"] = encoding_str + if INCLUDE_COORD: + co_x = AuxCoord(coordarray, var_name="v_co") + if encoding_str is not None: + co_x.attributes["_Encoding"] = encoding_str + cube.add_aux_coord(co_x, 0) + return cube + + def show_result(filepath): from pp_utils import ncdump @@ -115,12 +137,13 @@ def show_result(filepath): # print(repr(v[:])) print("\nAs iris cube..") try: + iris.loading.LOAD_PROBLEMS.reset() cube = iris.load_cube(filepath) print(cube) - if iris.loading.LOAD_PROBLEMS._problems: + if iris.loading.LOAD_PROBLEMS.problems: print(iris.loading.LOAD_PROBLEMS) print( - "\n".join(iris.loading.LOAD_PROBLEMS._problems[0].stack_trace.format()) + "\n".join(iris.loading.LOAD_PROBLEMS.problems[0].stack_trace.format()) ) print("-data-") print(repr(cube.data)) @@ -136,27 +159,54 @@ def show_result(filepath): print(repr(err)) -# tsts = (None, "ascii", "utf-8", "utf-32",) +tsts = ( + None, + "ascii", + "utf-8", + "utf-32", +) # tsts = ("utf-8",) # tsts = ("utf-8", "utf-32",) # tsts = ("utf-32",) -tsts = ("utf-8", "ascii", "utf-8") +# tsts = ("utf-8", "ascii", "utf-8") @pytest.mark.parametrize("encoding", tsts) -def test_encodings(encoding): +def test_load_encodings(encoding): # small change print(f"\n=========\nTesting encoding: {encoding}") filepath = f"tmp_{str(encoding)}.nc" do_as = encoding if encoding != "utf-32": do_as = "utf-8" - TEST_CHARARRAY = convert_chararray(TEST_STRINGS, N_STRLEN, encoding=do_as) - TEST_COORDARRAY = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as) + TEST_CHARARRAY = convert_strings_to_chararray( + TEST_STRINGS, N_STRLEN, encoding=do_as + ) + TEST_COORDARRAY = convert_strings_to_chararray( + TEST_COORD_VALS, N_STRLEN, encoding=do_as + ) make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding) show_result(filepath) +@pytest.mark.parametrize("encoding", tsts) +def test_save_encodings(encoding): + cube = make_testcube( + dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding + ) + print(cube) + filepath = f"tmp_save_{str(encoding)}.nc" + if encoding == "ascii": + with pytest.raises( + UnicodeEncodeError, + match="'ascii' codec can't encode character.*not in range", + ): + iris.save(cube, filepath) + else: + iris.save(cube, filepath) + show_result(filepath) + + # @pytest.mark.parametrize("ndim", [1, 2]) # def test_convert_bytes_to_strings(ndim: int): # if ndim == 1: From 1bcfb19d6d6990e7aeb8ff372d311b847577cc45 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 15:21:31 +0000 Subject: [PATCH 07/10] Further fix to attribute ordering. --- lib/iris/fileformats/netcdf/saver.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index c2522d8867..f80cf154c3 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1687,25 +1687,25 @@ def _set_cf_var_attributes(self, cf_var, element): # whereas for data-variables (aka Cubes) it is the other way around. # Needed now that this routine is also called from _create_cf_data_variable. # TODO: when we can break things, rationalise these to be the same. - def add_units(): + def add_units_attr(): if cf_units.as_unit(units_str).is_udunits(): _setncattr(cf_var, "units", units_str) - def add_stdname(): + def add_names_attrs(): standard_name = element.standard_name if standard_name is not None: _setncattr(cf_var, "standard_name", standard_name) + long_name = element.long_name + if long_name is not None: + _setncattr(cf_var, "long_name", long_name) + if isinstance(element, Cube): - add_stdname() - add_units() + add_names_attrs() + add_units_attr() else: - add_units() - add_stdname() - - long_name = element.long_name - if long_name is not None: - _setncattr(cf_var, "long_name", long_name) + add_units_attr() + add_names_attrs() # Add the CF-netCDF calendar attribute. if element.units.calendar: From ad4960e86a16609ba5b8e0a77e8a0c2537fe7a52 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 29 Oct 2025 18:08:35 +0000 Subject: [PATCH 08/10] Fixes for data packing. --- lib/iris/fileformats/netcdf/saver.py | 64 ++++++++++------------------ 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index f80cf154c3..8d66557cab 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1737,6 +1737,7 @@ def _create_generic_cf_array_var( element_dims=None, fill_value=None, compression_kwargs=None, + packing_controls: dict | None = None, is_dataless=False, ): """Create theCF-netCDF variable given dimensional_metadata. @@ -1864,7 +1865,10 @@ def _create_generic_cf_array_var( else: element_type = type(element).__name__ data = self._ensure_valid_dtype(data, element_type, element) - dtype = data.dtype.newbyteorder("=") + if not packing_controls: + dtype = data.dtype.newbyteorder("=") + else: + dtype = packing_controls["dtype"] # Check if this is a dim-coord. is_dimcoord = cube is not None and element in cube.dim_coords @@ -1897,6 +1901,10 @@ def _create_generic_cf_array_var( # Add the data to the CF-netCDF variable. if not is_dataless: + if packing_controls: + # We must set packing attributes (if any), before assigning values. + for key, value in packing_controls["attributes"]: + _setncattr(cf_var, key, value) self._lazy_stream_data(data=data, cf_var=cf_var) # Add names + units @@ -2331,11 +2339,10 @@ def _create_cf_data_variable( # Get the values in a form which is valid for the file format. is_dataless = cube.is_dataless() - if not is_dataless: + packing_controls = None + if packing and not is_dataless: data = self._ensure_valid_dtype(cube.core_data(), "cube", cube) - if not packing: - dtype = data.dtype.newbyteorder("=") - elif isinstance(packing, dict): + if isinstance(packing, dict): if "dtype" not in packing: msg = "The dtype attribute is required for packing." raise ValueError(msg) @@ -2373,26 +2380,14 @@ def _create_cf_data_variable( else: add_offset = cmin + 2 ** (n - 1) * scale_factor - def set_packing_ncattrs(cfvar): - """Set netCDF packing attributes. - - NOTE: cfvar needs to be a _thread_safe_nc._ThreadSafeWrapper subclass. + packing_controls = { + "dtype": dtype, + "attributes": [ + ("scale_factor", scale_factor), + ("add_offset", add_offset), + ], + } - """ - assert hasattr(cfvar, "THREAD_SAFE_FLAG") - if packing: - if scale_factor: - _setncattr(cfvar, "scale_factor", scale_factor) - if add_offset: - _setncattr(cfvar, "add_offset", add_offset) - - # cf_name = self._get_element_variable_name(cube_or_mesh=None, element=cube) - # while cf_name in self._dataset.variables: - # cf_name = self._increment_name(cf_name) - # - # cf_var = self._dataset.createVariable( - # cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs - # ) # Create the cube CF-netCDF data variable with data payload. cf_name = self._create_generic_cf_array_var( cube, @@ -2401,28 +2396,13 @@ def set_packing_ncattrs(cfvar): element_dims=dimension_names, fill_value=fill_value, compression_kwargs=kwargs, + packing_controls=packing_controls, is_dataless=is_dataless, ) cf_var = self._dataset.variables[cf_name] - if not is_dataless: - set_packing_ncattrs(cf_var) - - # if cube.standard_name: - # _setncattr(cf_var, "standard_name", cube.standard_name) - # - # if cube.long_name: - # _setncattr(cf_var, "long_name", cube.long_name) - # - # if cube.units.is_udunits(): - # _setncattr(cf_var, "units", str(cube.units)) - # - # # Add the CF-netCDF calendar attribute. - # if cube.units.calendar: - # _setncattr(cf_var, "calendar", cube.units.calendar) - - # Set attributes: NB this part is cube-specific (not the same for components) - # - therefore 'set_cf_var_attributes' doesn't set attributes if element is a Cube + # Set general attrs: NB this part is cube-specific (not the same for components) + # - so 'set_cf_var_attributes' *doesn't* set these, if element is a Cube if iris.FUTURE.save_split_attrs: attr_names = cube.attributes.locals.keys() else: From ca5872a7a10da9df9258d29ca17f4692ffeb319f Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sun, 7 Dec 2025 00:42:34 +0000 Subject: [PATCH 09/10] Latest test-chararrays. --- .../integration/netcdf/test_chararrays.py | 61 +++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py index c8bba94671..0eb211c8b0 100644 --- a/lib/iris/tests/integration/netcdf/test_chararrays.py +++ b/lib/iris/tests/integration/netcdf/test_chararrays.py @@ -1,10 +1,19 @@ -import netCDF4 as nc +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. +"""Integration tests for string data handling.""" + +import subprocess + import numpy as np import pytest import iris from iris.coords import AuxCoord, DimCoord from iris.cube import Cube +from iris.fileformats.netcdf import _thread_safe_nc +from iris.tests import env_bin_path NX, N_STRLEN = 3, 64 TEST_STRINGS = ["Münster", "London", "Amsterdam"] @@ -16,6 +25,7 @@ TEST_COORD_VALS[-1] = "Xsandwich" # makes the max coord strlen same as data one +# Ensure all tests run with "split attrs" turned on. @pytest.fixture(scope="module", autouse=True) def enable_split_attrs(): with iris.FUTURE.context(save_split_attrs=True): @@ -59,7 +69,8 @@ def convert_bytesarray_to_strings( def make_testfile(filepath, chararray, coordarray, encoding_str=None): - with nc.Dataset(filepath, "w") as ds: + ds = _thread_safe_nc.DatasetWrapper(filepath, "w") + try: ds.createDimension("x", NX) ds.createDimension("nstr", N_STRLEN) vx = ds.createVariable("x", int, dimensions=("x")) @@ -100,6 +111,8 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None): if INCLUDE_NUMERIC_AUXCOORD: coords_str += " v_num" v.coordinates = coords_str + finally: + ds.close() def make_testcube( @@ -119,12 +132,19 @@ def make_testcube( return cube -def show_result(filepath): - from pp_utils import ncdump +NCDUMP_PATHSTR = str(env_bin_path("ncdump")) + + +def ncdump(nc_path: str, *args): + """Call ncdump to print a dump of a file.""" + call_args = [NCDUMP_PATHSTR, nc_path] + list(*args) + subprocess.run(call_args, check=True) + +def show_result(filepath): print(f"File {filepath}") print("NCDUMP:") - ncdump(filepath, "") + ncdump(filepath) # with nc.Dataset(filepath, "r") as ds: # v = ds.variables["v"] # print("\n----\nNetcdf data readback (basic)") @@ -159,6 +179,13 @@ def show_result(filepath): print(repr(err)) +@pytest.fixture(scope="session") +def save_dir(tmp_path_factory): + return tmp_path_factory.mktemp("save_files") + + +# TODO: the tests don't test things properly yet, they just exercise the code and print +# things for manual debugging. tsts = ( None, "ascii", @@ -172,10 +199,10 @@ def show_result(filepath): @pytest.mark.parametrize("encoding", tsts) -def test_load_encodings(encoding): +def test_load_encodings(encoding, save_dir): # small change print(f"\n=========\nTesting encoding: {encoding}") - filepath = f"tmp_{str(encoding)}.nc" + filepath = save_dir / f"tmp_load_{str(encoding)}.nc" do_as = encoding if encoding != "utf-32": do_as = "utf-8" @@ -190,12 +217,12 @@ def test_load_encodings(encoding): @pytest.mark.parametrize("encoding", tsts) -def test_save_encodings(encoding): +def test_save_encodings(encoding, save_dir): cube = make_testcube( dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding ) print(cube) - filepath = f"tmp_save_{str(encoding)}.nc" + filepath = save_dir / f"tmp_save_{str(encoding)}.nc" if encoding == "ascii": with pytest.raises( UnicodeEncodeError, @@ -205,19 +232,3 @@ def test_save_encodings(encoding): else: iris.save(cube, filepath) show_result(filepath) - - -# @pytest.mark.parametrize("ndim", [1, 2]) -# def test_convert_bytes_to_strings(ndim: int): -# if ndim == 1: -# source = convert_strings_to_chararray(TEST_STRINGS, 16) -# elif ndim == 2: -# source = np.stack([ -# convert_strings_to_chararray(TEST_STRINGS, 16), -# convert_strings_to_chararray(TEST_COORD_VALS, 16), -# ]) -# else: -# raise ValueError(f"Unexpected param ndim={ndim}.") -# # convert the strings to bytes -# result = convert_bytesarray_to_strings(source) -# print(result) From a8d022e4613e8f87b6fbb9f22dd0695dcefa63ee Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Sun, 7 Dec 2025 10:43:18 +0000 Subject: [PATCH 10/10] Fix search+replace error. --- lib/iris/fileformats/netcdf/saver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 8d66557cab..4766054142 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -1578,8 +1578,8 @@ def _get_element_variable_name(self, cube_or_mesh, element): name = element.standard_name or element.long_name if not name or set(name).intersection(string.whitespace): # We need to invent a name, based on its associated dimensions. - if cube is not None and cube.elements(element): - # It is a regular cube elementinate. + if cube is not None and cube.coords(element): + # It is a regular cube coordinate. # Auto-generate a name based on the dims. name = "" for dim in cube.coord_dims(element):