|
| 1 | +# Copyright Iris contributors |
| 2 | +# |
| 3 | +# This file is part of Iris and is released under the BSD license. |
| 4 | +# See LICENSE in the root of the repository for full licensing details. |
| 5 | +"""Integration tests for various uses of character/string arrays in netcdf file variables. |
| 6 | +
|
| 7 | +This covers both the loading and saving of variables which are the content of |
data-variables, auxiliary coordinates, ancillary variables and (potentially) cell measures.
| 9 | +""" |
| 10 | + |
| 11 | +from pathlib import Path |
| 12 | + |
| 13 | +import numpy as np |
| 14 | +import pytest |
| 15 | + |
| 16 | +import iris |
| 17 | +from iris.fileformats.netcdf import _thread_safe_nc |
| 18 | + |
# Size of the "x" (data) dimension in the generated test files.
N_XDIM = 3
# Size of the string-character dimension in the generated test files.
N_CHARS_DIM = 64
# Whether the aux-coord variable gets its own character dimension ("nstr2").
COORD_ON_SEPARATE_DIM = True
# Debug switch: set to a directory path (e.g. "~/chararray_testfiles") to keep
# the generated testfiles for inspection.  Leave as None for normal runs, so
# files are written to pytest's per-test tmp_path and cleaned up automatically.
PERSIST_TESTFILES = None


# Marker meaning "create the file with no _Encoding attribute at all".
NO_ENCODING_STR = "<noencoding>"
# The set of encodings exercised by the parameterised tests.
TEST_ENCODINGS = [
    NO_ENCODING_STR,
    "ascii",
    "utf-8",
    # "iso8859-1", # a common one-byte-per-char "codepage" type
    # "utf-16",
    "utf-32",
]
| 34 | + |
| 35 | + |
| 36 | +# |
| 37 | +# Routines to convert between byte and string arrays. |
| 38 | +# Independently defined here, to avoid relying on any code we are testing. |
| 39 | +# |
| 40 | +def convert_strings_to_chararray( |
| 41 | + string_array_1d: np.ndarray, maxlen: int, encoding: str | None = None |
| 42 | +): |
| 43 | + # Note: this is limited to 1-D arrays of strings. |
| 44 | + # Could generalise that if needed, but for now this makes it simpler. |
| 45 | + if encoding is None: |
| 46 | + encoding = "ascii" |
| 47 | + bbytes = [text.encode(encoding) for text in string_array_1d] |
| 48 | + pad = b"\0" * maxlen |
| 49 | + bbytes = [(x + pad)[:maxlen] for x in bbytes] |
| 50 | + chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes]) |
| 51 | + return chararray |
| 52 | + |
| 53 | + |
| 54 | +def convert_bytearray_to_strings( |
| 55 | + byte_array, encoding="utf-8", string_length: int | None = None |
| 56 | +): |
| 57 | + """Convert bytes to strings. |
| 58 | +
|
| 59 | + N.B. for now at least, we assume the string dim is **always the last one**. |
| 60 | + """ |
| 61 | + bytes_shape = byte_array.shape |
| 62 | + var_shape = bytes_shape[:-1] |
| 63 | + if string_length is None: |
| 64 | + string_length = bytes_shape[-1] |
| 65 | + string_dtype = f"U{string_length}" |
| 66 | + result = np.empty(var_shape, dtype=string_dtype) |
| 67 | + for ndindex in np.ndindex(var_shape): |
| 68 | + element_bytes = byte_array[ndindex] |
| 69 | + bytes = b"".join([b if b else b"\0" for b in element_bytes]) |
| 70 | + string = bytes.decode(encoding) |
| 71 | + result[ndindex] = string |
| 72 | + return result |
| 73 | + |
| 74 | + |
def make_testfile(testfile_path: Path, encoding_str: str):
    """Create a test netcdf file containing character-array variables.

    Writes a character data-variable "v", a character aux-coord "v_co" and a
    numeric aux-coord "v_numeric".  If ``encoding_str`` is not
    NO_ENCODING_STR, the character variables are given an ``_Encoding``
    attribute.

    Returns
    -------
    (testfile_path, coordvar_strings, datavar_strings)
        The file path, plus the expected string content of the coord and data
        variables (unicode or ascii versions, chosen to suit the encoding).
    """
    if encoding_str == NO_ENCODING_STR:
        encoding = None
    else:
        encoding = encoding_str

    data_is_ascii = encoding in (None, "ascii")

    if data_is_ascii:
        coordvar_strings = ["mOnster", "London", "Amsterdam"]
        datavar_strings = ["bun", "Eclair", "sandwich"]
    else:
        # Include non-ascii characters to actually exercise the encoding.
        coordvar_strings = ["Münster", "London", "Amsterdam"]
        datavar_strings = ["bun", "éclair", "sandwich"]

    coordvar_bytearray = convert_strings_to_chararray(
        string_array_1d=coordvar_strings, maxlen=N_CHARS_DIM, encoding=encoding
    )
    datavar_bytearray = convert_strings_to_chararray(
        string_array_1d=datavar_strings, maxlen=N_CHARS_DIM, encoding=encoding
    )

    ds = _thread_safe_nc.DatasetWrapper(testfile_path, "w")
    try:
        ds.createDimension("x", N_XDIM)
        ds.createDimension("nstr", N_CHARS_DIM)
        if COORD_ON_SEPARATE_DIM:
            ds.createDimension("nstr2", N_CHARS_DIM)
        # N.B. a 1-tuple, not a bare parenthesized string, for consistency
        # with the other createVariable calls below.
        v_xdim = ds.createVariable("x", int, dimensions=("x",))
        v_xdim[:] = np.arange(N_XDIM)

        v_co = ds.createVariable(
            "v_co",
            "S1",
            dimensions=(
                "x",
                "nstr2" if COORD_ON_SEPARATE_DIM else "nstr",
            ),
        )
        v_co[:] = coordvar_bytearray

        # _Encoding is set only *after* writing the raw bytes — presumably so
        # the write itself is not auto-converted by netCDF4.  NOTE(review):
        # confirm before reordering these statements.
        if encoding is not None:
            v_co._Encoding = encoding

        v_numeric = ds.createVariable(
            "v_numeric",
            float,
            dimensions=("x",),
        )
        v_numeric[:] = np.arange(N_XDIM)

        v_datavar = ds.createVariable(
            "v",
            "S1",
            dimensions=(
                "x",
                "nstr",
            ),
        )
        v_datavar[:] = datavar_bytearray

        if encoding is not None:
            v_datavar._Encoding = encoding

        # Attach both aux coords to the data variable.
        v_datavar.coordinates = "v_co v_numeric"
    finally:
        ds.close()

    return testfile_path, coordvar_strings, datavar_strings
| 148 | + |
| 149 | + |
@pytest.fixture(params=TEST_ENCODINGS)
def encoding(request):
    """Return each entry of TEST_ENCODINGS in turn (parameterised fixture)."""
    encoding_name = request.param
    return encoding_name
| 153 | + |
| 154 | + |
class TestReadEncodings:
    """Test loading of testfiles with encoded string data."""

    @pytest.fixture()
    def testdata(self, encoding, tmp_path):
        """Create a suitable valid testfile, and return expected string content."""
        if PERSIST_TESTFILES:
            # Debug aid: keep the files in a fixed directory instead of tmp_path.
            tmp_path = Path(PERSIST_TESTFILES).expanduser()
        # Use the shared constant, rather than duplicating the literal.
        if encoding == NO_ENCODING_STR:
            filetag = "noencoding"
        else:
            filetag = encoding
        tempfile_path = tmp_path / f"sample_read_{filetag}.nc"
        testdata = make_testfile(testfile_path=tempfile_path, encoding_str=encoding)
        # Local import — NOTE(review): importing from this same test module's
        # path looks like a debug leftover; confirm where ncdump should live.
        from iris.tests.integration.netcdf.test_chararrays import ncdump

        ncdump(tempfile_path)
        yield testdata

    def assert_no_load_problems(self):
        """Fail, showing the accumulated problems, if any load problems occurred."""
        if len(iris.loading.LOAD_PROBLEMS.problems):
            probs = "\n".join(str(prob) for prob in iris.loading.LOAD_PROBLEMS.problems)
            # Assert against the joined text so pytest's failure output
            # displays the problem details.
            assert probs == ""

    def test_valid_encodings(self, encoding, testdata):
        """Check that a testfile loads as a string cube with expected content."""
        testfile_path, coordvar_strings, datavar_strings = testdata
        cube = iris.load_cube(testfile_path)
        self.assert_no_load_problems()
        assert cube.shape == (N_XDIM,)

        if encoding != "utf-32":
            expected_string_width = N_CHARS_DIM
        else:
            # utf-32 uses 4 bytes per character, and the encoded form starts
            # with a 4-byte BOM — so 64 bytes hold only 15 characters.
            expected_string_width = (N_CHARS_DIM // 4) - 1
        assert cube.dtype == f"<U{expected_string_width}"
        assert np.all(cube.data == datavar_strings)
        coord_var = cube.coord("v_co")
        assert coord_var.dtype == f"<U{expected_string_width}"
        assert np.all(coord_var.points == coordvar_strings)
0 commit comments