From ea536e60c3bd8d5dde27465d064940b302b5c58f Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Fri, 31 Oct 2025 15:38:04 +0000
Subject: [PATCH 1/7] Initial dataset wrappers.

Rename; add in parts of old investigation; add temporary notes.
---
 .../netcdf/_bytecoding_datasets.py            | 182 ++++++++++++++
 .../fileformats/netcdf/_thread_safe_nc.py     |  15 +-
 .../integration/netcdf/test_chararrays.py     | 234 ++++++++++++++++++
 .../fileformats/netcdf/encoding_tests.txt     |  18 ++
 .../netcdf/test_bytecoding_datasets.py        |  14 ++
 5 files changed, 457 insertions(+), 6 deletions(-)
 create mode 100644 lib/iris/fileformats/netcdf/_bytecoding_datasets.py
 create mode 100644 lib/iris/tests/integration/netcdf/test_chararrays.py
 create mode 100644 lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
 create mode 100644 lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
new file mode 100644
index 0000000000..41e801d103
--- /dev/null
+++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
@@ -0,0 +1,182 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the BSD license.
+# See LICENSE in the root of the repository for full licensing details.
+"""Module providing netcdf datasets with automatic character encoding.
+
+The requirement is to convert numpy fixed-width unicode arrays on writing to a variable
+which is declared as a byte (character) array with a fixed-length string dimension.
+
+Numpy unicode string arrays are ones with dtypes of the form "U<character-width>".
+Numpy character variables have the dtype "S1", and map to a fixed-length "string
+dimension".
+
+In principle, netCDF4 already performs these translations, but in practice current
+releases are not functional for anything other than "ascii" encoding -- including UTF-8,
+which is the most obvious and desirable "general" solution.
+
+There is also the question of whether we should like to implement UTF-8 as our default.
+Current discussions on this are inconclusive and neither CF conventions nor the NetCDF
+User Guide are definite on what possible values of "_Encoding" are, or what the effective
+default is, even though they do both mention the "_Encoding" attribute as a potential
+way to handle the issue.
+
+Because of this, we interpret as follows:
+  * when reading bytes : in the absence of an "_Encoding" attribute, we will attempt to
+    decode bytes as UTF-8
+  * when writing strings : in the absence of an "_Encoding" attribute (on the Iris
+    cube or coord object), we will attempt to encode data with "ascii" : If this fails,
+    it raises an error prompting the user to supply an "_Encoding" attribute.
+
+Where an "_Encoding" attribute is provided to Iris, we will honour it where possible,
+identifying with "codecs.lookup" :  This means we support the encodings in the Python
+Standard Library, and the name aliases which it recognises.
+
+See:
+
+* known problems https://github.com/Unidata/netcdf4-python/issues/1440
+* suggestions for how this "ought" to work, discussed in the netcdf-c library
+   * https://github.com/Unidata/netcdf-c/issues/402
+
+"""
+
+import codecs
+import warnings
+
+import numpy as np
+
+from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper, VariableWrapper
+
+
+def decode_bytesarray_to_stringarray(
+    byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None
+) -> np.ndarray:
+    """Convert an array of bytes to an array of strings, with one less dimension.
+
+    N.B. for now at least, we assume the string dim is **always the last one**.
+    If 'string_width' is not given, it is set to the final dimension of 'byte_array'.
+    """
+    bytes_shape = byte_array.shape
+    var_shape = bytes_shape[:-1]
+    if string_width is None:
+        string_width = bytes_shape[-1]
+    string_dtype = f"U{string_width}"
+    result = np.empty(var_shape, dtype=string_dtype)
+    for ndindex in np.ndindex(var_shape):
+        element_bytes = byte_array[ndindex]
+        bytes = b"".join([b if b else b"\0" for b in element_bytes])
+        string = bytes.decode(encoding)
+        result[ndindex] = string
+    return result
+
+
+def encode_stringarray_as_bytearray(
+    data: np.ndarray, encoding=None, string_dimension_length: int | None = None
+) -> np.ndarray:
+    """Encode strings as bytearray.
+
+    Note: if 'string_dimension_length' is not given (None), it is set to the longest
+    encoded bytes element.  If 'string_dimension_length' is specified, the last array
+    dimension is set to this and content strings are truncated or extended as required.
+    """
+    element_shape = data.shape
+    max_length = 1  # this is a MINIMUM - i.e. not zero!
+    data_elements = np.zeros(element_shape, dtype=object)
+    for index in np.ndindex(element_shape):
+        data_element = data[index].encode(encoding=encoding)
+        element_length = len(data_element)
+        data_elements[index] = data_element
+        if element_length > max_length:
+            max_length = element_length
+
+    if string_dimension_length is None:
+        string_dimension_length = max_length
+
+    # We already encoded all the strings, but stored them in an object-array as
+    #  we didn't yet know the fixed byte-length to convert to.
+    # Now convert to a fixed-width byte array with an extra string-length dimension
+    result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
+    right_pad = b"\0" * string_dimension_length
+    for index in np.ndindex(element_shape):
+        bytes = data_elements[index]
+        bytes = (bytes + right_pad)[:string_dimension_length]
+        result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)]
+
+    return result
+
+
+DEFAULT_ENCODING = "utf-8"
+
+
+class EncodedVariable(VariableWrapper):
+    """A variable wrapper that translates variable data according to byte encodings."""
+
+    def __getitem__(self, keys):
+        if self.is_chardata():
+            super().set_auto_chartostring(False)
+
+        data = super().__getitem__(keys)
+
+        if self.is_chardata():
+            encoding = self.get_byte_encoding()
+            strlen = self.get_string_length()
+            data = decode_bytesarray_to_stringarray(data, encoding, strlen)
+
+        return data
+
+    def __setitem__(self, keys, data):
+        if self.is_chardata():
+            encoding = self.get_byte_encoding()
+            strlen = self.get_string_length()
+            if encoding is not None:
+                data = encode_stringarray_as_bytearray(data, encoding, strlen)
+            else:
+                try:
+                    # Check if all characters are valid ascii
+                    data = encode_stringarray_as_bytearray(data, "ascii", strlen)
+                except UnicodeEncodeError:
+                    data = encode_stringarray_as_bytearray(
+                        data, DEFAULT_ENCODING, strlen
+                    )
+                    # As this was necessary, record the new encoding on the variable
+                    self.set_ncattr("_Encoding", DEFAULT_ENCODING)
+                    msg = (
+                        f"Non-ascii data written to label variable {self.name}. "
+                        f"Applied {DEFAULT_ENCODING!r} encoding, "
+                        f"and set attribute _Encoding={DEFAULT_ENCODING!r}."
+                    )
+                    warnings.warn(msg, UserWarning)
+
+            super().set_auto_chartostring(False)
+
+        super().__setitem__(keys, data)
+
+    def is_chardata(self):
+        return np.issubdtype(self.dtype, np.bytes_)
+
+    def get_encoding(self) -> str | None:
+        """Get the effective byte encoding to be used for this variable."""
+        # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
+        result = getattr(self, "_Encoding", None)
+        if result is not None:
+            try:
+                # Accept + normalise naming of encodings
+                result = codecs.lookup(result).name
+                # NOTE: if encoding does not suit data, errors can occur.
+                # For example, _Encoding = "ascii", with non-ascii content.
+            except LookupError:
+                # Replace some invalid setting with "safe"(ish) fallback.
+                msg = f"Unknown encoding for variable {self.name!r}: {result!r}"
+                warnings.warn(msg, UserWarning)
+
+        return result
+
+    def get_string_length(self):
+        """Return the string-length defined for this variable (or None)."""
+        return getattr(self, "iris_string_length", None)
+
+
+class EncodedDataset(DatasetWrapper):
+    """A specialised DatasetWrapper whose variables perform byte encoding."""
+
+    VAR_WRAPPER_CLS = EncodedVariable
diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py
index 33183ef0fa..46b8609bb7 100644
--- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py
+++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py
@@ -159,6 +159,9 @@ class GroupWrapper(_ThreadSafeWrapper):
     CONTAINED_CLASS = netCDF4.Group
     # Note: will also accept a whole Dataset object, but that is OK.
     _DUCKTYPE_CHECK_PROPERTIES = ["createVariable"]
+    # Class to use when creating variable wrappers (default=VariableWrapper).
+    # - needed to support _bytecoding_datasets.EncodedDataset.
+    VAR_WRAPPER_CLS = VariableWrapper
 
     # All Group API that returns Dimension(s) is wrapped to instead return
     #  DimensionWrapper(s).
@@ -203,7 +206,7 @@ def variables(self) -> typing.Dict[str, VariableWrapper]:
         """
         with _GLOBAL_NETCDF4_LOCK:
             variables_ = self._contained_instance.variables
-        return {k: VariableWrapper.from_existing(v) for k, v in variables_.items()}
+        return {k: self.VAR_WRAPPER_CLS.from_existing(v) for k, v in variables_.items()}
 
     def createVariable(self, *args, **kwargs) -> VariableWrapper:
         """Call createVariable() from netCDF4.Group/Dataset within _GLOBAL_NETCDF4_LOCK.
@@ -216,7 +219,7 @@ def createVariable(self, *args, **kwargs) -> VariableWrapper:
         """
         with _GLOBAL_NETCDF4_LOCK:
             new_variable = self._contained_instance.createVariable(*args, **kwargs)
-        return VariableWrapper.from_existing(new_variable)
+        return self.VAR_WRAPPER_CLS.from_existing(new_variable)
 
     def get_variables_by_attributes(
         self, *args, **kwargs
@@ -234,7 +237,7 @@ def get_variables_by_attributes(
             variables_ = list(
                 self._contained_instance.get_variables_by_attributes(*args, **kwargs)
             )
-        return [VariableWrapper.from_existing(v) for v in variables_]
+        return [self.VAR_WRAPPER_CLS.from_existing(v) for v in variables_]
 
     # All Group API that returns Group(s) is wrapped to instead return
     #  GroupWrapper(s).
@@ -252,7 +255,7 @@ def groups(self):
         """
         with _GLOBAL_NETCDF4_LOCK:
             groups_ = self._contained_instance.groups
-        return {k: GroupWrapper.from_existing(v) for k, v in groups_.items()}
+        return {k: self.__class__.from_existing(v) for k, v in groups_.items()}
 
     @property
     def parent(self):
@@ -268,7 +271,7 @@ def parent(self):
         """
         with _GLOBAL_NETCDF4_LOCK:
             parent_ = self._contained_instance.parent
-        return GroupWrapper.from_existing(parent_)
+        return self.__class__.from_existing(parent_)
 
     def createGroup(self, *args, **kwargs):
         """Call createGroup() from netCDF4.Group/Dataset.
@@ -281,7 +284,7 @@ def createGroup(self, *args, **kwargs):
         """
         with _GLOBAL_NETCDF4_LOCK:
             new_group = self._contained_instance.createGroup(*args, **kwargs)
-        return GroupWrapper.from_existing(new_group)
+        return self.__class__.from_existing(new_group)
 
 
 class DatasetWrapper(GroupWrapper):
diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py
new file mode 100644
index 0000000000..0eb211c8b0
--- /dev/null
+++ b/lib/iris/tests/integration/netcdf/test_chararrays.py
@@ -0,0 +1,234 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the BSD license.
+# See LICENSE in the root of the repository for full licensing details.
+"""Integration tests for string data handling."""
+
+import subprocess
+
+import numpy as np
+import pytest
+
+import iris
+from iris.coords import AuxCoord, DimCoord
+from iris.cube import Cube
+from iris.fileformats.netcdf import _thread_safe_nc
+from iris.tests import env_bin_path
+
+NX, N_STRLEN = 3, 64
+TEST_STRINGS = ["Münster", "London", "Amsterdam"]
+TEST_COORD_VALS = ["bun", "éclair", "sandwich"]
+
+# VARS_COORDS_SHARE_STRING_DIM = True
+VARS_COORDS_SHARE_STRING_DIM = False
+if VARS_COORDS_SHARE_STRING_DIM:
+    TEST_COORD_VALS[-1] = "Xsandwich"  # makes the max coord strlen same as data one
+
+
+# Ensure all tests run with "split attrs" turned on.
+@pytest.fixture(scope="module", autouse=True)
+def enable_split_attrs():
+    with iris.FUTURE.context(save_split_attrs=True):
+        yield
+
+
+def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"):
+    bbytes = [text.encode(encoding) for text in string_array_1d]
+    pad = b"\0" * maxlen
+    bbytes = [(x + pad)[:maxlen] for x in bbytes]
+    chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes])
+    return chararray
+
+
+def convert_bytesarray_to_strings(
+    byte_array, encoding="utf-8", string_length: int | None = None
+):
+    """Convert bytes to strings.
+
+    N.B. for now at least, we assume the string dim is **always the last one**.
+    """
+    bytes_shape = byte_array.shape
+    var_shape = bytes_shape[:-1]
+    if string_length is None:
+        string_length = bytes_shape[-1]
+    string_dtype = f"U{string_length}"
+    result = np.empty(var_shape, dtype=string_dtype)
+    for ndindex in np.ndindex(var_shape):
+        element_bytes = byte_array[ndindex]
+        bytes = b"".join([b if b else b"\0" for b in element_bytes])
+        string = bytes.decode(encoding)
+        result[ndindex] = string
+    return result
+
+
+INCLUDE_COORD = True
+# INCLUDE_COORD = False
+
+INCLUDE_NUMERIC_AUXCOORD = True
+# INCLUDE_NUMERIC_AUXCOORD = False
+
+
+def make_testfile(filepath, chararray, coordarray, encoding_str=None):
+    ds = _thread_safe_nc.DatasetWrapper(filepath, "w")
+    try:
+        ds.createDimension("x", NX)
+        ds.createDimension("nstr", N_STRLEN)
+        vx = ds.createVariable("x", int, dimensions=("x"))
+        vx[:] = np.arange(NX)
+        if INCLUDE_COORD:
+            ds.createDimension("nstr2", N_STRLEN)
+            v_co = ds.createVariable(
+                "v_co",
+                "S1",
+                dimensions=(
+                    "x",
+                    "nstr2",
+                ),
+            )
+            v_co[:] = coordarray
+            if encoding_str is not None:
+                v_co._Encoding = encoding_str
+            if INCLUDE_NUMERIC_AUXCOORD:
+                v_num = ds.createVariable(
+                    "v_num",
+                    float,
+                    dimensions=("x",),
+                )
+                v_num[:] = np.arange(NX)
+        v = ds.createVariable(
+            "v",
+            "S1",
+            dimensions=(
+                "x",
+                "nstr",
+            ),
+        )
+        v[:] = chararray
+        if encoding_str is not None:
+            v._Encoding = encoding_str
+        if INCLUDE_COORD:
+            coords_str = "v_co"
+            if INCLUDE_NUMERIC_AUXCOORD:
+                coords_str += " v_num"
+            v.coordinates = coords_str
+    finally:
+        ds.close()
+
+
+def make_testcube(
+    dataarray,
+    coordarray,  # for now, these are always *string* arrays
+    encoding_str: str | None = None,
+):
+    cube = Cube(dataarray, var_name="v")
+    cube.add_dim_coord(DimCoord(np.arange(NX), var_name="x"), 0)
+    if encoding_str is not None:
+        cube.attributes["_Encoding"] = encoding_str
+    if INCLUDE_COORD:
+        co_x = AuxCoord(coordarray, var_name="v_co")
+        if encoding_str is not None:
+            co_x.attributes["_Encoding"] = encoding_str
+        cube.add_aux_coord(co_x, 0)
+    return cube
+
+
+NCDUMP_PATHSTR = str(env_bin_path("ncdump"))
+
+
+def ncdump(nc_path: str, *args):
+    """Call ncdump to print a dump of a file."""
+    call_args = [NCDUMP_PATHSTR, nc_path] + list(*args)
+    subprocess.run(call_args, check=True)
+
+
+def show_result(filepath):
+    print(f"File {filepath}")
+    print("NCDUMP:")
+    ncdump(filepath)
+    # with nc.Dataset(filepath, "r") as ds:
+    #     v = ds.variables["v"]
+    #     print("\n----\nNetcdf data readback (basic)")
+    #     try:
+    #         print(repr(v[:]))
+    #     except UnicodeDecodeError as err:
+    #         print(repr(err))
+    #     print("..raw:")
+    #     v.set_auto_chartostring(False)
+    #     print(repr(v[:]))
+    print("\nAs iris cube..")
+    try:
+        iris.loading.LOAD_PROBLEMS.reset()
+        cube = iris.load_cube(filepath)
+        print(cube)
+        if iris.loading.LOAD_PROBLEMS.problems:
+            print(iris.loading.LOAD_PROBLEMS)
+            print(
+                "\n".join(iris.loading.LOAD_PROBLEMS.problems[0].stack_trace.format())
+            )
+        print("-data-")
+        print(repr(cube.data))
+        print("-numeric auxcoord data-")
+        print(repr(cube.coord("x").points))
+        if INCLUDE_COORD:
+            print("-string auxcoord data-")
+            try:
+                print(repr(cube.coord("v_co").points))
+            except Exception as err2:
+                print(repr(err2))
+    except UnicodeDecodeError as err:
+        print(repr(err))
+
+
+@pytest.fixture(scope="session")
+def save_dir(tmp_path_factory):
+    return tmp_path_factory.mktemp("save_files")
+
+
+# TODO: the tests don't test things properly yet, they just exercise the code and print
+#  things for manual debugging.
+tsts = (
+    None,
+    "ascii",
+    "utf-8",
+    "utf-32",
+)
+# tsts = ("utf-8",)
+# tsts = ("utf-8", "utf-32",)
+# tsts = ("utf-32",)
+# tsts = ("utf-8", "ascii", "utf-8")
+
+
+@pytest.mark.parametrize("encoding", tsts)
+def test_load_encodings(encoding, save_dir):
+    # small change
+    print(f"\n=========\nTesting encoding: {encoding}")
+    filepath = save_dir / f"tmp_load_{str(encoding)}.nc"
+    do_as = encoding
+    if encoding != "utf-32":
+        do_as = "utf-8"
+    TEST_CHARARRAY = convert_strings_to_chararray(
+        TEST_STRINGS, N_STRLEN, encoding=do_as
+    )
+    TEST_COORDARRAY = convert_strings_to_chararray(
+        TEST_COORD_VALS, N_STRLEN, encoding=do_as
+    )
+    make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding)
+    show_result(filepath)
+
+
+@pytest.mark.parametrize("encoding", tsts)
+def test_save_encodings(encoding, save_dir):
+    cube = make_testcube(
+        dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding
+    )
+    print(cube)
+    filepath = save_dir / f"tmp_save_{str(encoding)}.nc"
+    if encoding == "ascii":
+        with pytest.raises(
+            UnicodeEncodeError,
+            match="'ascii' codec can't encode character.*not in range",
+        ):
+            iris.save(cube, filepath)
+    else:
+        iris.save(cube, filepath)
+        show_result(filepath)
diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
new file mode 100644
index 0000000000..bab04aa0c4
--- /dev/null
+++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
@@ -0,0 +1,18 @@
+
+forms in files:
+    * char chardata(dim1, dim2, strlen_xx);    # char data
+    * string data(dim1, dim2);
+
+forms in numpy:
+    * np.ndarray(dtype="S1")  # char data
+    * np.ndarray(dtype="Snn")  # char data
+    * np.ndarray(dtype="Unn")  # strings
+    * np.ndarray(dtype="")
+
+possibilities in createVariable:
+"""
+    The datatype can be a numpy datatype object, or a string that describes a numpy dtype object ...
+    datatype can also be a CompoundType instance (for a structured, or compound array), a VLType instance (for a variable-length array),
+**  or the python str builtin (for a variable-length string array).
+**  Numpy string and unicode datatypes with length greater than one are aliases for str.
+"""
diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
new file mode 100644
index 0000000000..8b449c5912
--- /dev/null
+++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
@@ -0,0 +1,14 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the BSD license.
+# See LICENSE in the root of the repository for full licensing details.
+"""Unit tests for :class:`iris.fileformats.netcdf._bytecoding_datasets` module."""
+
+# import numpy as np
+# import pytest
+#
+# from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset
+
+
+class TestEncodedDataset:
+    """Test how GRIB_PARAM attributes convert to strings for storage in netcdf files."""

From fe1e22d586be798ad442fb793b9ac06ba1049627 Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Wed, 3 Dec 2025 18:59:43 +0000
Subject: [PATCH 2/7] Various notes, choices + changes: Beginnings of
 encoded-dataset testing.

---
 .../netcdf/_bytecoding_datasets.py            | 155 ++++++++----
 .../integration/netcdf/test_chararrays.py     |   7 +-
 .../fileformats/netcdf/encoding_tests.txt     | 164 +++++++++++++
 .../netcdf/test_bytecoding_datasets.py        | 223 +++++++++++++++++-
 .../unit/fileformats/netcdf/test_nc_dtypes.py |  96 ++++++++
 5 files changed, 595 insertions(+), 50 deletions(-)
 create mode 100644 lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py

diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
index 41e801d103..353f14d538 100644
--- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
+++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
@@ -41,6 +41,8 @@
 """
 
 import codecs
+import contextlib
+import threading
 import warnings
 
 import numpy as np
@@ -49,17 +51,18 @@
 
 
 def decode_bytesarray_to_stringarray(
-    byte_array: np.ndarray, encoding="utf-8", string_width: int | None = None
+    byte_array: np.ndarray, encoding: str, string_width: int
 ) -> np.ndarray:
     """Convert an array of bytes to an array of strings, with one less dimension.
 
     N.B. for now at least, we assume the string dim is **always the last one**.
     If 'string_width' is not given, it is set to the final dimension of 'byte_array'.
     """
+    if np.ma.isMaskedArray(byte_array):
+        # netCDF4-python sees zeros as "missing" -- we don't need or want that
+        byte_array = byte_array.data
     bytes_shape = byte_array.shape
     var_shape = bytes_shape[:-1]
-    if string_width is None:
-        string_width = bytes_shape[-1]
     string_dtype = f"U{string_width}"
     result = np.empty(var_shape, dtype=string_dtype)
     for ndindex in np.ndindex(var_shape):
@@ -70,16 +73,25 @@ def decode_bytesarray_to_stringarray(
     return result
 
 
-def encode_stringarray_as_bytearray(
+#
+# TODO: remove?
+# this older version is "overly flexible", less efficient and not needed here.
+#
+def flexi_encode_stringarray_as_bytearray(
     data: np.ndarray, encoding=None, string_dimension_length: int | None = None
 ) -> np.ndarray:
     """Encode strings as bytearray.
 
     Note: if 'string_dimension_length' is not given (None), it is set to the longest
-    encoded bytes element.  If 'string_dimension_length' is specified, the last array
+    encoded bytes element, **OR** the dtype size, if that is greater.
+    If 'string_dimension_length' is specified, the last array
     dimension is set to this and content strings are truncated or extended as required.
     """
+    if np.ma.isMaskedArray(data):
+        # netCDF4-python sees zeros as "missing" -- we don't need or want that
+        data = data.data
     element_shape = data.shape
+    # Encode all the strings + see which is longest
     max_length = 1  # this is a MINIMUM - i.e. not zero!
     data_elements = np.zeros(element_shape, dtype=object)
     for index in np.ndindex(element_shape):
@@ -90,10 +102,15 @@ def encode_stringarray_as_bytearray(
             max_length = element_length
 
     if string_dimension_length is None:
+        # If the string length was not specified, it is the maximum encoded length
+        # (n-bytes), **or** the dtype string-length, if greater.
         string_dimension_length = max_length
+        array_string_length = int(str(data.dtype)[2:])  # Yuck. No better public way?
+        if array_string_length > string_dimension_length:
+            string_dimension_length = array_string_length
 
-    # We already encoded all the strings, but stored them in an object-array as
-    #  we didn't yet know the fixed byte-length to convert to.
+    # We maybe *already* encoded all the strings above, but stored them in an
+    #  object-array as we didn't yet know the fixed byte-length to convert to.
     # Now convert to a fixed-width byte array with an extra string-length dimension
     result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
     right_pad = b"\0" * string_dimension_length
@@ -105,58 +122,98 @@ def encode_stringarray_as_bytearray(
     return result
 
 
-DEFAULT_ENCODING = "utf-8"
+def encode_stringarray_as_bytearray(
+    data: np.ndarray, encoding: str, string_dimension_length: int
+) -> np.ndarray:
+    """Encode strings as a bytes array."""
+    element_shape = data.shape
+    result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
+    right_pad = b"\0" * string_dimension_length
+    for index in np.ndindex(element_shape):
+        bytes = data[index].encode(encoding=encoding)
+        # It's all a bit nasty ...
+        bytes = (bytes + right_pad)[:string_dimension_length]
+        result[index] = [bytes[i : i + 1] for i in range(string_dimension_length)]
+
+    return result
+
+
+class NetcdfStringDecodeSetting(threading.local):
+    def __init__(self, perform_encoding: bool = True):
+        self.set(perform_encoding)
+
+    def set(self, perform_encoding: bool):
+        self.perform_encoding = perform_encoding
+
+    def __bool__(self):
+        return self.perform_encoding
+
+    @contextlib.contextmanager
+    def context(self, perform_encoding: bool):
+        old_setting = self.perform_encoding
+        self.perform_encoding = perform_encoding
+        yield
+        self.perform_encoding = old_setting
+
+
+DECODE_TO_STRINGS_ON_READ = NetcdfStringDecodeSetting()
+DEFAULT_READ_ENCODING = "utf-8"
+DEFAULT_WRITE_ENCODING = "ascii"
 
 
 class EncodedVariable(VariableWrapper):
     """A variable wrapper that translates variable data according to byte encodings."""
 
     def __getitem__(self, keys):
-        if self.is_chardata():
-            super().set_auto_chartostring(False)
+        if self._is_chardata():
+            # N.B. we never need to UNset this, as we totally control it
+            self._contained_instance.set_auto_chartostring(False)
 
         data = super().__getitem__(keys)
 
-        if self.is_chardata():
-            encoding = self.get_byte_encoding()
-            strlen = self.get_string_length()
-            data = decode_bytesarray_to_stringarray(data, encoding, strlen)
+        if DECODE_TO_STRINGS_ON_READ and self._is_chardata():
+            encoding = self._get_encoding() or DEFAULT_READ_ENCODING
+            # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice
+            strlen = self._get_string_length()
+            try:
+                data = decode_bytesarray_to_stringarray(data, encoding, strlen)
+            except UnicodeDecodeError as err:
+                msg = (
+                    f"Character data in variable {self.name!r} could not be decoded"
+                    f"with the {encoding!r} encoding.  This can be fixed by setting the "
+                    "variable '_Encoding' attribute to suit the content."
+                )
+                raise ValueError(msg) from err
 
         return data
 
     def __setitem__(self, keys, data):
-        if self.is_chardata():
-            encoding = self.get_byte_encoding()
-            strlen = self.get_string_length()
-            if encoding is not None:
-                data = encode_stringarray_as_bytearray(data, encoding, strlen)
-            else:
+        if self._is_chardata():
+            # N.B. we never need to UNset this, as we totally control it
+            self._contained_instance.set_auto_chartostring(False)
+
+            encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING
+            # N.B. typically, write encoding default is "ascii" --> fails bad content
+            if data.dtype.kind == "U":
                 try:
-                    # Check if all characters are valid ascii
-                    data = encode_stringarray_as_bytearray(data, "ascii", strlen)
-                except UnicodeEncodeError:
-                    data = encode_stringarray_as_bytearray(
-                        data, DEFAULT_ENCODING, strlen
-                    )
-                    # As this was necessary, record the new encoding on the variable
-                    self.set_ncattr("_Encoding", DEFAULT_ENCODING)
+                    strlen = self._get_string_length()
+                    data = encode_stringarray_as_bytearray(data, encoding, strlen)
+                except UnicodeEncodeError as err:
                     msg = (
-                        f"Non-ascii data written to label variable {self.name}. "
-                        f"Applied {DEFAULT_ENCODING!r} encoding, "
-                        f"and set attribute _Encoding={DEFAULT_ENCODING!r}."
+                        f"String data written to netcdf character variable {self.name!r} "
+                        f"could not be represented in encoding {encoding!r}.  This can be "
+                        "fixed by setting a suitable variable '_Encoding' attribute, "
+                        'e.g. <variable>._Encoding="UTF-8".'
                     )
-                    warnings.warn(msg, UserWarning)
-
-            super().set_auto_chartostring(False)
+                    raise ValueError(msg) from err
 
         super().__setitem__(keys, data)
 
-    def is_chardata(self):
+    def _is_chardata(self):
         return np.issubdtype(self.dtype, np.bytes_)
 
-    def get_encoding(self) -> str | None:
-        """Get the effective byte encoding to be used for this variable."""
-        # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
+    def _get_encoding(self) -> str | None:
+        """Get the byte encoding defined for this variable (or None)."""
         result = getattr(self, "_Encoding", None)
         if result is not None:
             try:
@@ -165,18 +222,32 @@ def get_encoding(self) -> str | None:
                 # NOTE: if encoding does not suit data, errors can occur.
                 # For example, _Encoding = "ascii", with non-ascii content.
             except LookupError:
-                # Replace some invalid setting with "safe"(ish) fallback.
+                # Unrecognised encoding name : handle this as just a warning
                 msg = f"Unknown encoding for variable {self.name!r}: {result!r}"
                 warnings.warn(msg, UserWarning)
 
         return result
 
-    def get_string_length(self):
-        """Return the string-length defined for this variable (or None)."""
-        return getattr(self, "iris_string_length", None)
+    def _get_string_length(self):
+        """Return the string-length defined for this variable."""
+        if not hasattr(self, "_strlen"):
+            # Work out the string length from the parent dataset dimensions.
+            strlen = self.group().dimensions[self.dimensions[-1]].size
+            # Cache this on the variable -- but not as a netcdf attribute (!)
+            self.__dict__["_strlen"] = strlen
+
+        return self._strlen
+
+    def set_auto_chartostring(self, onoff: bool):
+        msg = "auto_chartostring is not supported by Iris 'EncodedVariable' type."
+        raise TypeError(msg)
 
 
 class EncodedDataset(DatasetWrapper):
     """A specialised DatasetWrapper whose variables perform byte encoding."""
 
     VAR_WRAPPER_CLS = EncodedVariable
+
+    def set_auto_chartostring(self, onoff: bool):
+        msg = "auto_chartostring is not supported by Iris 'EncodedDataset' type."
+        raise TypeError(msg)
diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py
index 0eb211c8b0..4414444733 100644
--- a/lib/iris/tests/integration/netcdf/test_chararrays.py
+++ b/lib/iris/tests/integration/netcdf/test_chararrays.py
@@ -137,8 +137,11 @@ def make_testcube(
 
 def ncdump(nc_path: str, *args):
     """Call ncdump to print a dump of a file."""
-    call_args = [NCDUMP_PATHSTR, nc_path] + list(*args)
-    subprocess.run(call_args, check=True)
+    call_args = [NCDUMP_PATHSTR, nc_path] + list(args)
+    bytes = subprocess.check_output(call_args)
+    text = bytes.decode("utf-8")
+    print(text)
+    return text
 
 
 def show_result(filepath):
diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
index bab04aa0c4..e77427cd63 100644
--- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
+++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
@@ -1,8 +1,95 @@
+===========
+Outstanding Qs
+* What would we like to do with all this IN IRIS??
+    - generally present as string arrays (Uxx)
+    - existing scheme of naming dims for length + re-using is quite cunning!
+    - choice of seeing actual character arrays as alternative to string conversions?
+
+* string length handling for load/save/roundtrip
+  - on SAVE, we need some control so we can create files which are compatible,
+    irrespective of the data (which currently we are not doing)
+    - ALSO this is wanted to ensure that multiple vars (e.g. string cubes or string coords)
+      will share the string dim -- instead of creating arbitrary different ones
+    - presumably, if encoding blows the max-len, we must get a warning/error
+
+  - on LOAD, we may want to *capture* the actual original string dim length, so it can be
+    re-created on save (by some scheme, as per previous) -- i.e. enable roundtripping.
+    I don't really want to preserve the name of the string dim, but this could be a
+    slightly tender point.  To consider also : the impact of this on the non-equivalence
+    of loaded cubes, if we use actual *attributes* to carry this info (see below).
+    - **if not** : just load data + convert to string arrays as seems best
+        - this will also lead to incompatible cubes.
+
+  - on SAVE, in the absence of strlen-controls, what is a reasonable default choice?
+     - take longest encoded
+     - set nbytes = NEXPAND(encoding) * nchars
+        - sensible values would depend on the encoding...
+            : ascii -> 1
+            : utf-8 -> 1 or 4 ???
+            : utf-16 -> 2 or 4 ???
+            : utf-32 -> 4
+
+  - on LOAD, in absence of strlen controls, how do we choose the result DTYPE (i.e. character length)?
+    - again, may depend on the encoding:
+        : ascii = "U<strlen>"
+        : UTF-8 = "U<strlen>"
+        : UTF-16 = "U<strlen/2>"
+        : UTF-32 = "U<strlen/4>"
+            - N.B. these are all at least "safe" - i.e. won't lose characters
+
+
+separately from these, there is the question of how the controls affect "normal"
+cube operations.
+    - the easiest approach is to define a "special" attribute,
+      which can be set on any cube/component
+    - using the dtype-length of the data would be *possible*, in conjunction with the
+      above-proposed "default rules" for choosing strlen from the dtype.
+      But this might not round-trip in all cases.
+
+within the actual data arrays
+    - we can't really expect any different to what numpy does
+        - that is, the dtype-length of any element <= that of the array  (and not ==)
+          this may be tricky, but we can't easily prevent it.
+                >>> a = np.array(['', 'a', 'bb'])
+                >>> a
+                array(['', 'a', 'bb'], dtype='<U2')
+                >>> a[0].dtype
+                dtype('<U')
+                >>> a[1].dtype
+                dtype('<U1')
+                >>> a[2].dtype
+                dtype('<U2')
+                >>> a.dtype
+                dtype('<U2')
+                >>>
+    - likewise, we can't assign without possible truncation.
+      If you **want** to expand the supported width, can use ".astype()" first ?
+
+
+========================
+=========================
 
 forms in files:
     * char chardata(dim1, dim2, strlen_xx);    # char data
     * string data(dim1, dim2);
 
+netcdf types:
+(netcdf docs terms)
+    NC_BYTE 8-bit signed integer
+    NC_UBYTE 8-bit unsigned integer
+    NC_CHAR 8-bit character
+    NC_STRING variable length character string
+
+***NOTE*** there is no NC_UCHAR or "unsigned char" type
+
+
+relevant numpy base types (scalar dtypes):
+    * "S" bytes             : np.bytes_ == np.int8
+    * "B" unsigned bytes    : np.ubyte == np.uint8
+    * 'i' ints              : np.int_
+    * 'u' unsigned ints     : np.uint
+    * "U" unicode string    : np.str_
+
 forms in numpy:
     * np.ndarray(dtype="S1")  # char data
     * np.ndarray(dtype="Snn")  # char data
@@ -16,3 +103,80 @@ possibilities in createVariable:
 **  or the python str builtin (for a variable-length string array).
 **  Numpy string and unicode datatypes with length greater than one are aliases for str.
 """
+
+test types:
+    "i1" : np.int8
+    "u1" : np.uint8
+    "S1" : np.bytes_
+    "U1" : np.str_
+    "S<n>" :
+    "U<n>" : with/without non-ascii content
+
+save all these to files...
+outputs from "test_nc_dtypes.py" test run:
+  SPEC:i1 SAVED-AS:int8     byte    RELOAD-AS:int8
+  SPEC:u1 SAVED-AS:uint8    ubyte   RELOAD-AS:uint8
+  SPEC:S1 SAVED-AS:|S1      char    RELOAD-AS:<U3
+         **OR*** |S1, if set_auto_chartostring(False)
+         - in which case, dimensions also different : (3,) --> ()
+  SPEC:U1 SAVED-AS:<U1      string  RELOAD-AS:object
+  SPEC:S SAVED-AS:|S5       string  RELOAD-AS:object
+  SPEC:U SAVED-AS:<U6       string  RELOAD-AS:object
+
+
+
+What is relevant/possible :
+* IN netcdf files
+    - variables of type "char"
+
+... investigate the uchar thing...
+  - confirmed there is no such thing
+  - (see commented-out portions of  "test_nc_dtypes.py" -- around "test_uchar")
+
+
+* IN netcdf4-python
+    - reading: variables of type "char" can be presented as EITHER "S1" OR object (=strings)
+    - writing: likewise, but the format switch is "automatic" ??
+        - i.e. you can pass EITHER "arr(dims + strlen):S1" OR "arr(dims):Unn"
+
+Then, as regards the _Encoding ..
+    - Reading: converts if required, use _Encoding or "UTF-8" (== safe)
+    - Writing: converts if required, use  _Encoding or "ascii" (== fail if unsuited)
+
+
+TO TEST...
+==========
+create a dataset + write char data
+  - X assign different encodings: makes no difference
+
+create a dataset + write STRING data
+  - X encoding=(ascii, utf-8, utf-32, None)
+  - X withnonascii=(T, F)
+  - X length=(long, short, none)
+
+read string data
+    - X encoding=(ascii, utf-8, utf-32, None)
+    - X withnonascii=(T, F)
+
+read char data (with control)
+  - X different encodings: make no difference
+
+==rethought==
+write strings
+    - scalar
+    - 1D
+    - multidim
+    - X encodings
+    - check encoding failures + defaults
+    - check length controls + truncations
+
+read strings
+    - X encodings
+    - decoding failures + defaults
+
+write char data
+    - X encodings: don't matter
+
+read char data
+    - X encodings: don't matter
+
diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
index 8b449c5912..092da19a00 100644
--- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
+++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
@@ -4,11 +4,222 @@
 # See LICENSE in the root of the repository for full licensing details.
 """Unit tests for :class:`iris.fileformats.netcdf._bytecoding_datasets` module."""
 
-# import numpy as np
-# import pytest
-#
-# from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+from iris.fileformats.netcdf._bytecoding_datasets import (
+    EncodedDataset,
+    encode_stringarray_as_bytearray,
+    flexi_encode_stringarray_as_bytearray,
+)
+from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper
+
+encoding_options = [None, "ascii", "utf-8", "utf-32"]
+
+samples_3_ascii = np.array(
+    ["one", "", "seven"],  # N.B. include empty!
+)
+samples_3_nonascii = np.array(["two", "", "épéé"])
+
+
+def strings_maxbytes(strings, encoding):
+    return max(len(string.encode(encoding)) for string in strings)
+
+
+@pytest.fixture(params=encoding_options)
+def encoding(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def tempdir(tmp_path_factory):
+    path = tmp_path_factory.mktemp("netcdf")
+    return path
+
+
+def make_encoded_dataset(
+    path: Path, strlen: int, encoding: str | None = None
+) -> EncodedDataset:
+    """Create a test EncodedDataset linked to an actual file.
+
+    * strlen becomes the string dimension (i.e. a number of *bytes*)
+    * a variable "vxs" is created
+    * If 'encoding' is given, the "vxs::_Encoding" attribute is created with this value
+    """
+    ds = EncodedDataset(path, "w")
+    ds.createDimension("x", 3)
+    ds.createDimension("strlen", strlen)
+    v = ds.createVariable("vxs", "S1", ("x", "strlen"))
+    if encoding is not None:
+        v.setncattr("_Encoding", encoding)
+    return ds
+
+
+def fetch_undecoded_var(path, varname):
+    # Open a path as a "normal" dataset, and return a given variable.
+    ds_normal = DatasetWrapper(path)
+    ds_normal._contained_instance.set_auto_chartostring(False)
+    v = ds_normal.variables[varname]
+    # Return a variable, rather than its data, so we can check attributes etc.
+    return v
+
+
+class TestWriteStrings:
+    """Test how string data is saved to a file."""
+
+    def test_write_strings(self, encoding, tempdir):
+        # Create a dataset with the variable
+        path = tempdir / f"test_writestrings_encoding_{encoding!s}.nc"
+
+        if encoding in [None, "ascii"]:
+            writedata = samples_3_ascii
+            write_encoding = "ascii"
+        else:
+            writedata = samples_3_nonascii
+            write_encoding = encoding
+
+        writedata = writedata.copy()  # just for safety?
+        strlen = strings_maxbytes(writedata, write_encoding)
+
+        ds_encoded = make_encoded_dataset(path, strlen, encoding)
+        v = ds_encoded.variables["vxs"]
+
+        # Effectively, checks that we *can* write strings
+        v[:] = writedata
+
+        # Close, re-open as an "ordinary" dataset, and check the raw content.
+        ds_encoded.close()
+        v = fetch_undecoded_var(path, "vxs")
+
+        # Check that the raw result is as expected
+        bytes_result = v[:]
+        expected = encode_stringarray_as_bytearray(writedata, write_encoding, strlen)
+        assert (
+            bytes_result.shape == expected.shape
+            and bytes_result.dtype == expected.dtype
+            and np.all(bytes_result == expected)
+        )
+
+        # Check that the "_Encoding" property is also as expected
+        result_attr = v.getncattr("_Encoding") if "_Encoding" in v.ncattrs() else None
+        assert result_attr == encoding
+
+    def test_scalar(self, tempdir):
+        # Like 'test_write_strings', but the variable has *only* the string dimension.
+        path = tempdir / "test_writestrings_scalar.nc"
+
+        ds_encoded = make_encoded_dataset(path, strlen=5)
+        v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",))
+
+        # Checks that we *can* write a string
+        v[:] = np.array("stuff", dtype=str)
+
+        # Close, re-open as an "ordinary" dataset, and check the raw content.
+        ds_encoded.close()
+        v = fetch_undecoded_var(path, "v0_scalar")
+        result = v[:]
+
+        # Check that the raw result is as expected
+        assert (
+            result.shape == (5,)
+            and result.dtype == "<S1"
+            and np.all(result == [b"s", b"t", b"u", b"f", b"f"])
+        )
+
+    def test_multidim(self, tempdir):
+        # Like 'test_write_strings', but the variable has additional dimensions.
+        path = tempdir / "test_writestrings_multidim.nc"
+
+        ds_encoded = make_encoded_dataset(path, strlen=5)
+        ds_encoded.createDimension("y", 2)
+        v = ds_encoded.createVariable(
+            "vyxn",
+            "S1",
+            (
+                "y",
+                "x",
+                "strlen",
+            ),
+        )
+
+        # Check that we *can* write a multidimensional string array
+        test_data = np.array(
+            [
+                ["one", "n", ""],
+                ["two", "xxxxx", "four"],
+            ],
+            dtype="U5",
+        )
+        v[:] = test_data
+
+        # Close, re-open as an "ordinary" dataset, and check the raw content.
+        ds_encoded.close()
+        v = fetch_undecoded_var(path, "vyxn")
+        result = v[:]
+
+        # Check that the raw result is as expected
+        expected_bytes = encode_stringarray_as_bytearray(
+            test_data, encoding="ascii", string_dimension_length=5
+        )
+        assert (
+            result.shape
+            == (
+                2,
+                3,
+                5,
+            )
+            and result.dtype == "<S1"
+            and np.all(result == expected_bytes)
+        )
+
+    def test_write_encoding_failure(self, tempdir):
+        path = tempdir / "test_writestrings_encoding_failure.nc"
+        ds = make_encoded_dataset(path, strlen=5, encoding="ascii")
+        v = ds.variables["vxs"]
+        msg = (
+            "String data written to netcdf character variable 'vxs'.*"
+            " could not be represented in encoding 'ascii'. "
+        )
+        with pytest.raises(ValueError, match=msg):
+            v[:] = samples_3_nonascii
+
+    def test_overlength_warning(self):
+        pass
+
+
+class TestWriteChars:
+    @pytest.mark.parametrize("write_form", ["strings", "bytes"])
+    def test_write_chars(self, tempdir, write_form):
+        encoding = "utf-8"
+        write_strings = samples_3_nonascii
+        write_bytes = flexi_encode_stringarray_as_bytearray(
+            write_strings, encoding=encoding
+        )
+        # NOTE: 'flexi' form util decides the width needs to be 7 !!
+        strlen = write_bytes.shape[-1]
+        path = tempdir / f"test_writechars_{write_form}.nc"
+        ds = make_encoded_dataset(path, encoding=encoding, strlen=strlen)
+        v = ds.variables["vxs"]
+
+        # assign in *either* way..
+        if write_form == "strings":
+            v[:] = write_strings
+        else:
+            v[:] = write_bytes
+
+        # .. the result should be the same
+        result = v[:]
+        assert (
+            result.shape == write_strings.shape
+            and result.dtype == f"<U{strlen}"  # NOTE: we fixed the string width
+            and np.all(result == write_strings)
+        )
+
 
+class TestReadStrings:
+    """Test how character data is read and converted to strings."""
 
-class TestEncodedDataset:
-    """Test how GRIB_PARAM attributes convert to strings for storage in netcdf files."""
+    def test_encodings(self, encoding):
+        pass
diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py b/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py
new file mode 100644
index 0000000000..0c5d2b279e
--- /dev/null
+++ b/lib/iris/tests/unit/fileformats/netcdf/test_nc_dtypes.py
@@ -0,0 +1,96 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the BSD license.
+# See LICENSE in the root of the repository for full licensing details.
+"""Temporary code to confirm how various numpy dtypes are stored in a netcdf file."""
+
+import netCDF4 as nc
+import numpy as np
+import pytest
+
+from iris.tests.integration.netcdf.test_chararrays import ncdump
+
+# types = [
+#     "i1",  # np.int8
+#     "u1",  # np.uint8
+#     "S1",  # np.byte_
+#     "U1",  # np.str_
+#     "S",  # multibytes
+#     "U",  # unicode strings, with/without non-ascii content
+# ]
+
+samples = {
+    "i1": [-5, 7, 35],  # np.int8
+    "u1": [65, 67, 90],  # np.uint8
+    "S1": [b"A", b"B", b"Z"],  # np.byte_
+    "U1": ["A", "B", "C"],  # np.str_
+    "S": [b"one21", b"three", b""],  # multibyte
+    "U": ["one", "éclair", "nine"],  # unicode strings
+}
+sample_arrays = {
+    type_code: np.array(values, dtype=type_code)
+    for type_code, values in samples.items()
+}
+
+
+@pytest.fixture(scope="module")
+def tmpdir(tmp_path_factory):
+    return tmp_path_factory.mktemp("netcdf")
+
+
+def create_file(array: np.ndarray, path):
+    with nc.Dataset(str(path), "w") as ds:
+        ds.createDimension("x", 3)
+        v = ds.createVariable("vx", array.dtype, ("x",))
+        # v.set_auto_chartostring(False)
+        v._Encoding = "UTF-8" if array.dtype.kind == "U" else "ascii"
+        v[:] = array
+
+
+def get_loadback_array(path):
+    with nc.Dataset(str(path), "r") as ds:
+        v = ds.variables["vx"]
+        v.set_auto_chartostring(False)
+        result = v[:]
+    return result
+
+
+@pytest.mark.parametrize("dtype", list(samples.keys()))
+def test(tmpdir, dtype):
+    arr = sample_arrays[dtype]
+    print("\n---")
+    print(dtype)
+    path = tmpdir / f"tmp_{dtype}.nc"
+    create_file(arr, path)
+    ncdump(path, "-s")
+    loadback_array = get_loadback_array(path)
+    print(f"  SPEC:{dtype} SAVED-AS:{arr.dtype} RELOAD-AS:{loadback_array.dtype}")
+
+
+# from iris.tests import env_bin_path
+# NCGEN_PATHSTR = str(env_bin_path("ncgen"))
+#
+#
+# def ncgen(cdl_path, nc_path, *args):
+#     """Call ncdump to print a dump of a file."""
+#     args = list(args)
+#     if not any(arg.startswith('-k') for arg in args):
+#         args[:0] = ["-k", "nc4"]  # force netcdf4
+#     call_args = [NCGEN_PATHSTR] + list(args) + [str(cdl_path), '-o', str(nc_path)]
+#     subprocess.check_call(call_args)
+#
+#
+# def test_uchar(tmpdir):
+#     arr = sample_arrays["S1"]
+#     path = tmpdir / f"tmp_ichar.nc"
+#     create_file(arr, path)
+#     text = ncdump(path, "-s")
+#     text_u = text.replace("\t", "   ")
+#     text_u = text_u.replace(" char ", " unsigned char ")
+#     cdl_path = tmpdir / f"tmp_uchar.cdl"
+#     with open(cdl_path, "w") as f_out:
+#         f_out.write(text_u)
+#     nc_path_2 = tmpdir / f"tmp_uchar.nc"
+#     ncgen(cdl_path, nc_path_2)
+#     loadback_array = get_loadback_array(nc_path_2)
+#     print(f"  netcdf type 'uchar' LOADS-AS:{loadback_array.dtype}")

From 7baee941a773373cbe98aa7f14859be54a8481a7 Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Fri, 5 Dec 2025 12:51:04 +0000
Subject: [PATCH 3/7] Replace use of encoding functions with test-specific
 function: Test for overlength writes.

---
 .../netcdf/_bytecoding_datasets.py            |   6 +-
 .../fileformats/netcdf/encoding_tests.txt     |  15 +-
 .../netcdf/test_bytecoding_datasets.py        | 194 ++++++++++++------
 3 files changed, 147 insertions(+), 68 deletions(-)

diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
index 353f14d538..62e1dd2ab7 100644
--- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
+++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
@@ -123,9 +123,10 @@ def flexi_encode_stringarray_as_bytearray(
 
 
 def encode_stringarray_as_bytearray(
-    data: np.ndarray, encoding: str, string_dimension_length: int
+    data: np.typing.ArrayLike, encoding: str, string_dimension_length: int
 ) -> np.ndarray:
     """Encode strings as a bytes array."""
+    data = np.asanyarray(data)
     element_shape = data.shape
     result = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
     right_pad = b"\0" * string_dimension_length
@@ -179,7 +180,7 @@ def __getitem__(self, keys):
                 data = decode_bytesarray_to_stringarray(data, encoding, strlen)
             except UnicodeDecodeError as err:
                 msg = (
-                    f"Character data in variable {self.name!r} could not be decoded"
+                    f"Character data in variable {self.name!r} could not be decoded "
                     f"with the {encoding!r} encoding.  This can be fixed by setting the "
                     "variable '_Encoding' attribute to suit the content."
                 )
@@ -188,6 +189,7 @@ def __getitem__(self, keys):
         return data
 
     def __setitem__(self, keys, data):
+        data = np.asanyarray(data)
         if self._is_chardata():
             # N.B. we never need to UNset this, as we totally control it
             self._contained_instance.set_auto_chartostring(False)
diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
index e77427cd63..5fa021ccdd 100644
--- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
+++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
@@ -146,12 +146,17 @@ Then, as regards the _Encoding ..
 
 TO TEST...
 ==========
-create a dataset + write char data
-  - X assign different encodings: makes no difference
+NOTE on length control:
+    - not an API thing, it's implicit from when you create a variable
+    - this also applies to how it loads back
+        - BUT here there may be scope for a control attribute :
 
-create a dataset + write STRING data
-  - X encoding=(ascii, utf-8, utf-32, None)
-  - X withnonascii=(T, F)
++++ create a dataset + write char data
++++   - X assign different encodings: makes no difference
+
++++ create a dataset + write STRING data
++++   - X encoding=(ascii, utf-8, utf-32, None)
++++   - X withnonascii=(T, F)
   - X length=(long, short, none)
 
 read string data
diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
index 092da19a00..411212b973 100644
--- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
+++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
@@ -9,11 +9,7 @@
 import numpy as np
 import pytest
 
-from iris.fileformats.netcdf._bytecoding_datasets import (
-    EncodedDataset,
-    encode_stringarray_as_bytearray,
-    flexi_encode_stringarray_as_bytearray,
-)
+from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset
 from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper
 
 encoding_options = [None, "ascii", "utf-8", "utf-32"]
@@ -66,8 +62,92 @@ def fetch_undecoded_var(path, varname):
     return v
 
 
+def check_raw_content(path, varname, expected_byte_array):
+    v = fetch_undecoded_var(path, varname)
+    bytes_result = v[:]
+    assert (
+        bytes_result.shape == expected_byte_array.shape
+        and bytes_result.dtype == expected_byte_array.dtype
+        and np.all(bytes_result == expected_byte_array)
+    )
+
+
+def _make_bytearray_inner(data, encoding):
+    # Convert a (list of [lists of..]) strings or bytes to a
+    #  (list of [lists of..]) length-1 bytes with an extra dimension.
+    if isinstance(data, str):
+        # Convert input strings to bytes
+        data = data.encode(encoding)
+    if isinstance(data, bytes):
+        # iterate over bytes to get a sequence of length-1 bytes (what np.array wants)
+        result = [data[i : i + 1] for i in range(len(data))]
+    else:
+        # If not string/bytes, expect the input to be a list.
+        # N.B. the recursion is inefficient, but we don't care about that here
+        result = [_make_bytearray_inner(part, encoding) for part in data]
+    return result
+
+
+def make_bytearray(data, encoding="ascii"):
+    """Convert bytes or lists of bytes into a numpy byte array.
+
+    This is largely to avoid using "encode_stringarray_as_bytearray", since we don't
+    want to depend on that when we should be testing it.
+    So, it mostly replicates the function of that, but it does also support bytes in the
+    input, and it automatically finds + applies the maximum bytes-lengths in the input.
+    """
+    # First, Convert to a (list of [lists of]..) length-1 bytes objects
+    data = _make_bytearray_inner(data, encoding)
+
+    # Numbers of bytes in the inner dimension are the lengths of bytes/strings input,
+    #  so they aren't all the same.
+    # To enable array conversion, we fix that by expanding all to the max length
+
+    def get_maxlen(data):
+        # Find the maximum number of bytes in the inner dimension.
+        if not isinstance(data, list):
+            # Inner bytes object
+            assert isinstance(data, bytes)
+            longest = len(data)
+        else:
+            # We have a list: either a list of bytes, or a list of lists.
+            if len(data) == 0 or not isinstance(data[0], list):
+                # inner-most list, should contain bytes if anything
+                assert len(data) == 0 or isinstance(data[0], bytes)
+                # return n-bytes
+                longest = len(data)
+            else:
+                # list of lists: return max length of sub-lists
+                longest = max(get_maxlen(part) for part in data)
+        return longest
+
+    maxlen = get_maxlen(data)
+
+    def extend_all_to_maxlen(data, length, filler=b"\0"):
+        # Extend each "innermost" list (of single bytes) to the required length
+        if isinstance(data, list):
+            if len(data) == 0 or not isinstance(data[0], list):
+                # Pad all the inner-most lists to the required number of elements
+                n_extra = length - len(data)
+                if n_extra > 0:
+                    data = data + [filler] * n_extra
+            else:
+                data = [extend_all_to_maxlen(part, length, filler) for part in data]
+        return data
+
+    data = extend_all_to_maxlen(data, maxlen)
+    # We should now be able to create an array of single bytes.
+    result = np.array(data)
+    assert result.dtype == "<S1"
+    return result
+
+
 class TestWriteStrings:
-    """Test how string data is saved to a file."""
+    """Test how string data is saved to a file.
+
+    Mostly, we read back data as a "normal" dataset to avoid relying on the read code,
+    which is separately tested -- see 'TestReadStrings'.
+    """
 
     def test_write_strings(self, encoding, tempdir):
         # Create a dataset with the variable
@@ -91,18 +171,11 @@ def test_write_strings(self, encoding, tempdir):
 
         # Close, re-open as an "ordinary" dataset, and check the raw content.
         ds_encoded.close()
-        v = fetch_undecoded_var(path, "vxs")
-
-        # Check that the raw result is as expected
-        bytes_result = v[:]
-        expected = encode_stringarray_as_bytearray(writedata, write_encoding, strlen)
-        assert (
-            bytes_result.shape == expected.shape
-            and bytes_result.dtype == expected.dtype
-            and np.all(bytes_result == expected)
-        )
+        expected_bytes = make_bytearray(writedata, write_encoding)
+        check_raw_content(path, "vxs", expected_bytes)
 
-        # Check that the "_Encoding" property is also as expected
+        # Check also that the "_Encoding" property is as expected
+        v = fetch_undecoded_var(path, "vxs")
         result_attr = v.getncattr("_Encoding") if "_Encoding" in v.ncattrs() else None
         assert result_attr == encoding
 
@@ -118,15 +191,8 @@ def test_scalar(self, tempdir):
 
         # Close, re-open as an "ordinary" dataset, and check the raw content.
         ds_encoded.close()
-        v = fetch_undecoded_var(path, "v0_scalar")
-        result = v[:]
-
-        # Check that the raw result is as expected
-        assert (
-            result.shape == (5,)
-            and result.dtype == "<S1"
-            and np.all(result == [b"s", b"t", b"u", b"f", b"f"])
-        )
+        expected_bytes = make_bytearray(b"stuff")
+        check_raw_content(path, "v0_scalar", expected_bytes)
 
     def test_multidim(self, tempdir):
         # Like 'test_write_strings', but the variable has additional dimensions.
@@ -145,34 +211,16 @@ def test_multidim(self, tempdir):
         )
 
         # Check that we *can* write a multidimensional string array
-        test_data = np.array(
-            [
-                ["one", "n", ""],
-                ["two", "xxxxx", "four"],
-            ],
-            dtype="U5",
-        )
+        test_data = [
+            ["one", "n", ""],
+            ["two", "xxxxx", "four"],
+        ]
         v[:] = test_data
 
         # Close, re-open as an "ordinary" dataset, and check the raw content.
         ds_encoded.close()
-        v = fetch_undecoded_var(path, "vyxn")
-        result = v[:]
-
-        # Check that the raw result is as expected
-        expected_bytes = encode_stringarray_as_bytearray(
-            test_data, encoding="ascii", string_dimension_length=5
-        )
-        assert (
-            result.shape
-            == (
-                2,
-                3,
-                5,
-            )
-            and result.dtype == "<S1"
-            and np.all(result == expected_bytes)
-        )
+        expected_bytes = make_bytearray(test_data)
+        check_raw_content(path, "vyxn", expected_bytes)
 
     def test_write_encoding_failure(self, tempdir):
         path = tempdir / "test_writestrings_encoding_failure.nc"
@@ -185,8 +233,38 @@ def test_write_encoding_failure(self, tempdir):
         with pytest.raises(ValueError, match=msg):
             v[:] = samples_3_nonascii
 
-    def test_overlength_warning(self):
-        pass
+    def test_overlength(self, tempdir):
+        # Check expected behaviour with over-length data
+        path = tempdir / "test_writestrings_overlength.nc"
+        ds = make_encoded_dataset(path, strlen=5, encoding="ascii")
+        v = ds.variables["vxs"]
+        v[:] = ["1", "123456789", "two"]
+        expected_bytes = make_bytearray(["1", "12345", "two"])
+        check_raw_content(path, "vxs", expected_bytes)
+
+    def test_overlength_splitcoding(self, tempdir):
+        # Check expected behaviour when non-ascii multibyte coding gets truncated
+        path = tempdir / "test_writestrings_overlength_splitcoding.nc"
+        ds = make_encoded_dataset(path, strlen=5, encoding="utf-8")
+        v = ds.variables["vxs"]
+        v[:] = ["1", "1234ü", "two"]
+        # This creates a problem: it won't read back
+        msg = (
+            "Character data in variable 'vxs' could not be decoded "
+            "with the 'utf-8' encoding."
+        )
+        with pytest.raises(ValueError, match=msg):
+            v[:]
+
+        # Check also that we *can* read the raw content.
+        ds.close()
+        expected_bytes = [
+            b"1",
+            b"1234\xc3",  # NOTE: truncated encoding
+            b"two",
+        ]
+        expected_bytearray = make_bytearray(expected_bytes)
+        check_raw_content(path, "vxs", expected_bytearray)
 
 
 class TestWriteChars:
@@ -194,9 +272,7 @@ class TestWriteChars:
     def test_write_chars(self, tempdir, write_form):
         encoding = "utf-8"
         write_strings = samples_3_nonascii
-        write_bytes = flexi_encode_stringarray_as_bytearray(
-            write_strings, encoding=encoding
-        )
+        write_bytes = make_bytearray(write_strings, encoding=encoding)
         # NOTE: 'flexi' form util decides the width needs to be 7 !!
         strlen = write_bytes.shape[-1]
         path = tempdir / f"test_writechars_{write_form}.nc"
@@ -210,12 +286,8 @@ def test_write_chars(self, tempdir, write_form):
             v[:] = write_bytes
 
         # .. the result should be the same
-        result = v[:]
-        assert (
-            result.shape == write_strings.shape
-            and result.dtype == f"<U{strlen}"  # NOTE: we fixed the string width
-            and np.all(result == write_strings)
-        )
+        ds.close()
+        check_raw_content(path, "vxs", write_bytes)
 
 
 class TestReadStrings:

From 6d6c6fad3a3df2d31f917ff3bd2959c3ecf6e977 Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Fri, 5 Dec 2025 14:47:54 +0000
Subject: [PATCH 4/7] Radically simplify 'make_bytearray', by using a known
 specified bytewidth.

---
 .../netcdf/test_bytecoding_datasets.py        | 76 ++++++-------------
 1 file changed, 22 insertions(+), 54 deletions(-)

diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
index 411212b973..9ef354f850 100644
--- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
+++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
@@ -72,7 +72,7 @@ def check_raw_content(path, varname, expected_byte_array):
     )
 
 
-def _make_bytearray_inner(data, encoding):
+def _make_bytearray_inner(data, bytewidth, encoding):
     # Convert to a (list of [lists of..]) strings or bytes to a
     #  (list of [lists of..]) length-1 bytes with an extra dimension.
     if isinstance(data, str):
@@ -81,61 +81,25 @@ def _make_bytearray_inner(data, encoding):
     if isinstance(data, bytes):
         # iterate over bytes to get a sequence of length-1 bytes (what np.array wants)
         result = [data[i : i + 1] for i in range(len(data))]
+        # pad or truncate everything to the required bytewidth
+        result = (result + [b"\0"] * bytewidth)[:bytewidth]
     else:
         # If not string/bytes, expect the input to be a list.
         # N.B. the recursion is inefficient, but we don't care about that here
-        result = [_make_bytearray_inner(part, encoding) for part in data]
+        result = [_make_bytearray_inner(part, bytewidth, encoding) for part in data]
     return result
 
 
-def make_bytearray(data, encoding="ascii"):
+def make_bytearray(data, bytewidth, encoding="ascii"):
     """Convert bytes or lists of bytes into a numpy byte array.
 
     This is largely to avoid using "encode_stringarray_as_bytearray", since we don't
     want to depend on that when we should be testing it.
     So, it mostly replicates the function of that, but it does also support bytes in the
-    input, and it automatically finds + applies the maximum bytes-lengths in the input.
+    input.
     """
     # First, Convert to a (list of [lists of]..) length-1 bytes objects
-    data = _make_bytearray_inner(data, encoding)
-
-    # Numbers of bytes in the inner dimension are the lengths of bytes/strings input,
-    #  so they aren't all the same.
-    # To enable array conversion, we fix that by expanding all to the max length
-
-    def get_maxlen(data):
-        # Find the maximum number of bytes in the inner dimension.
-        if not isinstance(data, list):
-            # Inner bytes object
-            assert isinstance(data, bytes)
-            longest = len(data)
-        else:
-            # We have a list: either a list of bytes, or a list of lists.
-            if len(data) == 0 or not isinstance(data[0], list):
-                # inner-most list, should contain bytes if anything
-                assert len(data) == 0 or isinstance(data[0], bytes)
-                # return n-bytes
-                longest = len(data)
-            else:
-                # list of lists: return max length of sub-lists
-                longest = max(get_maxlen(part) for part in data)
-        return longest
-
-    maxlen = get_maxlen(data)
-
-    def extend_all_to_maxlen(data, length, filler=b"\0"):
-        # Extend each "innermost" list (of single bytes) to the required length
-        if isinstance(data, list):
-            if len(data) == 0 or not isinstance(data[0], list):
-                # Pad all the inner-most lists to the required number of elements
-                n_extra = length - len(data)
-                if n_extra > 0:
-                    data = data + [filler] * n_extra
-            else:
-                data = [extend_all_to_maxlen(part, length, filler) for part in data]
-        return data
-
-    data = extend_all_to_maxlen(data, maxlen)
+    data = _make_bytearray_inner(data, bytewidth, encoding)
     # We should now be able to create an array of single bytes.
     result = np.array(data)
     assert result.dtype == "<S1"
@@ -171,7 +135,7 @@ def test_write_strings(self, encoding, tempdir):
 
         # Close, re-open as an "ordinary" dataset, and check the raw content.
         ds_encoded.close()
-        expected_bytes = make_bytearray(writedata, write_encoding)
+        expected_bytes = make_bytearray(writedata, strlen, write_encoding)
         check_raw_content(path, "vxs", expected_bytes)
 
         # Check also that the "_Encoding" property is as expected
@@ -183,7 +147,8 @@ def test_scalar(self, tempdir):
         # Like 'test_write_strings', but the variable has *only* the string dimension.
         path = tempdir / "test_writestrings_scalar.nc"
 
-        ds_encoded = make_encoded_dataset(path, strlen=5)
+        strlen = 5
+        ds_encoded = make_encoded_dataset(path, strlen=strlen)
         v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",))
 
         # Checks that we *can* write a string
@@ -191,14 +156,15 @@ def test_scalar(self, tempdir):
 
         # Close, re-open as an "ordinary" dataset, and check the raw content.
         ds_encoded.close()
-        expected_bytes = make_bytearray(b"stuff")
+        expected_bytes = make_bytearray(b"stuff", strlen)
         check_raw_content(path, "v0_scalar", expected_bytes)
 
     def test_multidim(self, tempdir):
         # Like 'test_write_strings', but the variable has additional dimensions.
         path = tempdir / "test_writestrings_multidim.nc"
 
-        ds_encoded = make_encoded_dataset(path, strlen=5)
+        strlen = 5
+        ds_encoded = make_encoded_dataset(path, strlen=strlen)
         ds_encoded.createDimension("y", 2)
         v = ds_encoded.createVariable(
             "vyxn",
@@ -219,7 +185,7 @@ def test_multidim(self, tempdir):
 
         # Close, re-open as an "ordinary" dataset, and check the raw content.
         ds_encoded.close()
-        expected_bytes = make_bytearray(test_data)
+        expected_bytes = make_bytearray(test_data, strlen)
         check_raw_content(path, "vyxn", expected_bytes)
 
     def test_write_encoding_failure(self, tempdir):
@@ -236,16 +202,18 @@ def test_write_encoding_failure(self, tempdir):
     def test_overlength(self, tempdir):
         # Check expected behaviour with over-length data
         path = tempdir / "test_writestrings_overlength.nc"
-        ds = make_encoded_dataset(path, strlen=5, encoding="ascii")
+        strlen = 5
+        ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii")
         v = ds.variables["vxs"]
         v[:] = ["1", "123456789", "two"]
-        expected_bytes = make_bytearray(["1", "12345", "two"])
+        expected_bytes = make_bytearray(["1", "12345", "two"], strlen)
         check_raw_content(path, "vxs", expected_bytes)
 
     def test_overlength_splitcoding(self, tempdir):
         # Check expected behaviour when non-ascii multibyte coding gets truncated
         path = tempdir / "test_writestrings_overlength_splitcoding.nc"
-        ds = make_encoded_dataset(path, strlen=5, encoding="utf-8")
+        strlen = 5
+        ds = make_encoded_dataset(path, strlen=strlen, encoding="utf-8")
         v = ds.variables["vxs"]
         v[:] = ["1", "1234ü", "two"]
         # This creates a problem: it won't read back
@@ -263,7 +231,7 @@ def test_overlength_splitcoding(self, tempdir):
             b"1234\xc3",  # NOTE: truncated encoding
             b"two",
         ]
-        expected_bytearray = make_bytearray(expected_bytes)
+        expected_bytearray = make_bytearray(expected_bytes, strlen)
         check_raw_content(path, "vxs", expected_bytearray)
 
 
@@ -272,9 +240,9 @@ class TestWriteChars:
     def test_write_chars(self, tempdir, write_form):
         encoding = "utf-8"
         write_strings = samples_3_nonascii
-        write_bytes = make_bytearray(write_strings, encoding=encoding)
+        strlen = strings_maxbytes(write_strings, encoding)
+        write_bytes = make_bytearray(write_strings, strlen, encoding=encoding)
         # NOTE: 'flexi' form util decides the width needs to be 7 !!
-        strlen = write_bytes.shape[-1]
         path = tempdir / f"test_writechars_{write_form}.nc"
         ds = make_encoded_dataset(path, encoding=encoding, strlen=strlen)
         v = ds.variables["vxs"]

From 4b17638429725c3be1b1108ef782e5e69bf13209 Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Fri, 5 Dec 2025 16:23:55 +0000
Subject: [PATCH 5/7] Add read tests.

---
 .../netcdf/_bytecoding_datasets.py            |  38 +++-
 .../netcdf/test_bytecoding_datasets.py        | 165 ++++++++++++++++--
 2 files changed, 184 insertions(+), 19 deletions(-)

diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
index 62e1dd2ab7..3bdc799d7f 100644
--- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
+++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
@@ -175,7 +175,7 @@ def __getitem__(self, keys):
         if DECODE_TO_STRINGS_ON_READ and self._is_chardata():
             encoding = self._get_encoding() or DEFAULT_READ_ENCODING
             # N.B. typically, read encoding default is UTF-8 --> a "usually safe" choice
-            strlen = self._get_string_length()
+            strlen = self._get_string_width()
             try:
                 data = decode_bytesarray_to_stringarray(data, encoding, strlen)
             except UnicodeDecodeError as err:
@@ -194,11 +194,11 @@ def __setitem__(self, keys, data):
             # N.B. we never need to UNset this, as we totally control it
             self._contained_instance.set_auto_chartostring(False)
 
-            encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING
             # N.B. typically, write encoding default is "ascii" --> fails bad content
             if data.dtype.kind == "U":
                 try:
-                    strlen = self._get_string_length()
+                    encoding = self._get_encoding() or DEFAULT_WRITE_ENCODING
+                    strlen = self._get_byte_width()
                     data = encode_stringarray_as_bytearray(data, encoding, strlen)
                 except UnicodeEncodeError as err:
                     msg = (
@@ -230,12 +230,36 @@ def _get_encoding(self) -> str | None:
 
         return result
 
-    def _get_string_length(self):
+    def _get_byte_width(self) -> int | None:
+        if not hasattr(self, "_bytewidth"):
+            n_bytes = self.group().dimensions[self.dimensions[-1]].size
+            # Cache this length control on the variable -- but not as a netcdf attribute
+            self.__dict__["_bytewidth"] = n_bytes
+
+        return self.__dict__["_bytewidth"]
+
+    def _get_string_width(self):
         """Return the string-length defined for this variable."""
         if not hasattr(self, "_strlen"):
-            # Work out the string length from the parent dataset dimensions.
-            strlen = self.group().dimensions[self.dimensions[-1]].size
-            # Cache this on the variable -- but not as a netcdf attribute (!)
+            if hasattr(self, "iris_string_width"):
+                strlen = self.get_ncattr("iris_string_width")
+            else:
+                # Work out the actual byte width from the parent dataset dimensions.
+                strlen = self._get_byte_width()
+                # Convert the string dimension length (i.e. bytes) to a sufficiently-long
+                #  string width, depending on the encoding used.
+                encoding = self._get_encoding() or DEFAULT_READ_ENCODING
+                # regularise the name for comparison with recognised ones
+                encoding = codecs.lookup(encoding).name
+                if "utf-16" in encoding:
+                    # Each char needs at least 2 bytes -- including a terminator char
+                    strlen = (strlen // 2) - 1
+                elif "utf-32" in encoding:
+                    # Each char needs exactly 4 bytes -- including a terminator char
+                    strlen = (strlen // 4) - 1
+                # "ELSE": assume there can be (at most) as many chars as bytes
+
+            # Cache this length control on the variable -- but not as a netcdf attribute
             self.__dict__["_strlen"] = strlen
 
         return self._strlen
diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
index 9ef354f850..5df511103f 100644
--- a/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
+++ b/lib/iris/tests/unit/fileformats/netcdf/test_bytecoding_datasets.py
@@ -9,7 +9,10 @@
 import numpy as np
 import pytest
 
-from iris.fileformats.netcdf._bytecoding_datasets import EncodedDataset
+from iris.fileformats.netcdf._bytecoding_datasets import (
+    DECODE_TO_STRINGS_ON_READ,
+    EncodedDataset,
+)
 from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper
 
 encoding_options = [None, "ascii", "utf-8", "utf-32"]
@@ -62,14 +65,17 @@ def fetch_undecoded_var(path, varname):
     return v
 
 
+def check_array_matching(arr1, arr2):
+    """Check for arrays matching shape, dtype and content."""
+    assert (
+        arr1.shape == arr2.shape and arr1.dtype == arr2.dtype and np.all(arr1 == arr2)
+    )
+
+
 def check_raw_content(path, varname, expected_byte_array):
     v = fetch_undecoded_var(path, varname)
     bytes_result = v[:]
-    assert (
-        bytes_result.shape == expected_byte_array.shape
-        and bytes_result.dtype == expected_byte_array.dtype
-        and np.all(bytes_result == expected_byte_array)
-    )
+    check_array_matching(bytes_result, expected_byte_array)
 
 
 def _make_bytearray_inner(data, bytewidth, encoding):
@@ -102,7 +108,7 @@ def make_bytearray(data, bytewidth, encoding="ascii"):
     data = _make_bytearray_inner(data, bytewidth, encoding)
     # We should now be able to create an array of single bytes.
     result = np.array(data)
-    assert result.dtype == "<S1"
+    assert result.dtype == "S1"
     return result
 
 
@@ -113,7 +119,7 @@ class TestWriteStrings:
     which is separately tested -- see 'TestReadStrings'.
     """
 
-    def test_write_strings(self, encoding, tempdir):
+    def test_encodings(self, encoding, tempdir):
         # Create a dataset with the variable
         path = tempdir / f"test_writestrings_encoding_{encoding!s}.nc"
 
@@ -258,8 +264,143 @@ def test_write_chars(self, tempdir, write_form):
         check_raw_content(path, "vxs", write_bytes)
 
 
-class TestReadStrings:
-    """Test how character data is read and converted to strings."""
+class TestRead:
+    """Test how character data is read and converted to strings.
+
+    N.B. many testcases here parallel the 'TestWriteStrings' : we are creating test
+    datafiles with 'make_dataset' and assigning raw bytes, as-per 'TestWriteChars'.
+
+    We are mostly checking here that reading back produces string arrays as expected.
+    However, it is simple + convenient to also check the 'DECODE_TO_STRINGS_ON_READ'
+    function here, i.e. "raw" bytes reads.  So that is also done in this class.
+    """
+
+    @pytest.fixture(params=["strings", "bytes"])
+    def readmode(self, request):
+        return request.param
+
+    def test_encodings(self, encoding, tempdir, readmode):
+        # Create a dataset with the variable
+        path = tempdir / f"test_read_encodings_{encoding!s}_{readmode}.nc"
+
+        if encoding in [None, "ascii"]:
+            write_strings = samples_3_ascii
+            write_encoding = "ascii"
+        else:
+            write_strings = samples_3_nonascii
+            write_encoding = encoding
+
+        write_strings = write_strings.copy()  # just for safety?
+        strlen = strings_maxbytes(write_strings, write_encoding)
+        write_bytes = make_bytearray(write_strings, strlen, encoding=write_encoding)
+
+        ds_encoded = make_encoded_dataset(path, strlen, encoding)
+        v = ds_encoded.variables["vxs"]
+        v[:] = write_bytes
+
+        if readmode == "strings":
+            # Test "normal" read --> string array
+            result = v[:]
+            expected = write_strings
+            if encoding == "utf-8":
+                # In this case, with the given non-ascii sample data, the
+                #  "default minimum string length" is overestimated.
+                assert strlen == 7 and result.dtype == "U7"
+                # correct the result dtype to pass the write_strings comparison below
+                truncated_result = result.astype("U4")
+                # Also check that content is the same (i.e. not actually truncated)
+                assert np.all(truncated_result == result)
+                result = truncated_result
+        else:
+            # Test "raw" read --> byte array
+            with DECODE_TO_STRINGS_ON_READ.context(False):
+                result = v[:]
+            expected = write_bytes
+
+        check_array_matching(result, expected)
+
+    def test_scalar(self, tempdir, readmode):
+        # Like 'test_encodings', but the variable has *only* the string dimension.
+        path = tempdir / f"test_read_scalar_{readmode}.nc"
+
+        strlen = 5
+        ds_encoded = make_encoded_dataset(path, strlen=strlen)
+        v = ds_encoded.createVariable("v0_scalar", "S1", ("strlen",))
+
+        data_string = "stuff"
+        data_bytes = make_bytearray(data_string, 5)
+
+        # Write pre-encoded bytes: it is the read-back which is under test here
+        v[:] = data_bytes
+
+        if readmode == "strings":
+            # Test "normal" read --> string array
+            result = v[:]
+            expected = np.array(data_string)
+        else:
+            # Test "raw" read --> byte array
+            with DECODE_TO_STRINGS_ON_READ.context(False):
+                result = v[:]
+            expected = data_bytes
+
+        check_array_matching(result, expected)
+
+    def test_multidim(self, tempdir, readmode):
+        # Like 'test_encodings', but the variable has additional dimensions.
+        path = tempdir / f"test_read_multidim_{readmode}.nc"
+
+        strlen = 5
+        ds_encoded = make_encoded_dataset(path, strlen=strlen)
+        ds_encoded.createDimension("y", 2)
+        v = ds_encoded.createVariable(
+            "vyxn",
+            "S1",
+            (
+                "y",
+                "x",
+                "strlen",
+            ),
+        )
+
+        # Define multidimensional test strings, and write them as pre-encoded bytes
+        test_strings = [
+            ["one", "n", ""],
+            ["two", "xxxxx", "four"],
+        ]
+        test_bytes = make_bytearray(test_strings, strlen)
+        v[:] = test_bytes
+
+        if readmode == "strings":
+            # Test "normal" read --> string array
+            result = v[:]
+            expected = np.array(test_strings)
+        else:
+            # Test "raw" read --> byte array
+            with DECODE_TO_STRINGS_ON_READ.context(False):
+                result = v[:]
+            expected = test_bytes
+
+        check_array_matching(result, expected)
+
+    def test_read_encoding_failure(self, tempdir, readmode):
+        path = tempdir / f"test_read_encoding_failure_{readmode}.nc"
+        strlen = 10
+        ds = make_encoded_dataset(path, strlen=strlen, encoding="ascii")
+        v = ds.variables["vxs"]
+        test_utf8_bytes = make_bytearray(
+            samples_3_nonascii, bytewidth=strlen, encoding="utf-8"
+        )
+        v[:] = test_utf8_bytes
+
+        if readmode == "strings":
+            msg = (
+                "Character data in variable 'vxs' could not be decoded "
+                "with the 'ascii' encoding."
+            )
+            with pytest.raises(ValueError, match=msg):
+                v[:]
+        else:
+            with DECODE_TO_STRINGS_ON_READ.context(False):
+                result = v[:]  # this ought to be ok!
 
-    def test_encodings(self, encoding):
-        pass
+            assert np.all(result == test_utf8_bytes)

From 046183bb97e6d037c2ebee74a938f98834fc7753 Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Fri, 5 Dec 2025 16:26:13 +0000
Subject: [PATCH 6/7] Remove iris width control (not in this layer).

---
 .../netcdf/_bytecoding_datasets.py            | 31 +++++++++----------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
index 3bdc799d7f..5ed156f3ee 100644
--- a/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
+++ b/lib/iris/fileformats/netcdf/_bytecoding_datasets.py
@@ -241,23 +241,20 @@ def _get_byte_width(self) -> int | None:
     def _get_string_width(self):
         """Return the string-length defined for this variable."""
         if not hasattr(self, "_strlen"):
-            if hasattr(self, "iris_string_width"):
-                strlen = self.get_ncattr("iris_string_width")
-            else:
-                # Work out the actual byte width from the parent dataset dimensions.
-                strlen = self._get_byte_width()
-                # Convert the string dimension length (i.e. bytes) to a sufficiently-long
-                #  string width, depending on the encoding used.
-                encoding = self._get_encoding() or DEFAULT_READ_ENCODING
-                # regularise the name for comparison with recognised ones
-                encoding = codecs.lookup(encoding).name
-                if "utf-16" in encoding:
-                    # Each char needs at least 2 bytes -- including a terminator char
-                    strlen = (strlen // 2) - 1
-                elif "utf-32" in encoding:
-                    # Each char needs exactly 4 bytes -- including a terminator char
-                    strlen = (strlen // 4) - 1
-                # "ELSE": assume there can be (at most) as many chars as bytes
+            # Work out the actual byte width from the parent dataset dimensions.
+            strlen = self._get_byte_width()
+            # Convert the string dimension length (i.e. bytes) to a sufficiently-long
+            #  string width, depending on the encoding used.
+            encoding = self._get_encoding() or DEFAULT_READ_ENCODING
+            # regularise the name for comparison with recognised ones
+            encoding = codecs.lookup(encoding).name
+            if "utf-16" in encoding:
+                # Each char needs at least 2 bytes -- including a terminator char
+                strlen = (strlen // 2) - 1
+            elif "utf-32" in encoding:
+                # Each char needs exactly 4 bytes -- including a terminator char
+                strlen = (strlen // 4) - 1
+            # "ELSE": assume there can be (at most) as many chars as bytes
 
             # Cache this length control on the variable -- but not as a netcdf attribute
             self.__dict__["_strlen"] = strlen

From 2002c2a29a63751e07cd0e7d061c45c5031d10d4 Mon Sep 17 00:00:00 2001
From: Patrick Peglar <patrick.peglar@metoffice.gov.uk>
Date: Fri, 5 Dec 2025 17:55:12 +0000
Subject: [PATCH 7/7] more notes

---
 .../fileformats/netcdf/encoding_tests.txt     | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
index 5fa021ccdd..07a0bc3bcd 100644
--- a/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
+++ b/lib/iris/tests/unit/fileformats/netcdf/encoding_tests.txt
@@ -146,25 +146,21 @@ Then, as regards the _Encoding ..
 
 TO TEST...
 ==========
-NOTE on length control:
-    - not an API thing, it's implicit from when you create a variable
-    - this also applies to how it loads back
-        - BUT here there may be scope for a control attribute :
-
 +++ create a dataset + write char data
 +++   - X assign different encodings: makes no difference
 
 +++ create a dataset + write STRING data
 +++   - X encoding=(ascii, utf-8, utf-32, None)
 +++   - X withnonascii=(T, F)
-  - X length=(long, short, none)
+XXXX  - X length=(long, short, none)
+        ***deferred*** to layer above only
 
-read string data
-    - X encoding=(ascii, utf-8, utf-32, None)
-    - X withnonascii=(T, F)
++++ read string data
++++     - X encoding=(ascii, utf-8, utf-32, None)
++++     - X withnonascii=(T, F)
 
-read char data (with control)
-  - X different encodings: make no difference
++++ read char data (with control)
++++   - X different encodings: make no difference
 
 ==rethought==
 write strings
@@ -185,3 +181,11 @@ write char data
 read char data
     - X encodings: don't matter
 
+---
+NOTEs on length control:
+not an API thing, it's implicit from when you create a variable
+this also applies to how it loads back
+BUT here there may be scope for a control attribute :
+  "iris_string_dim" - controls width on creation + reading back
+
+