
Commit c4b7936

First proper testing (reads working).
1 parent 94b2b21

File tree

2 files changed: +227 -33 lines


lib/iris/fileformats/cf.py

Lines changed: 34 additions & 33 deletions
@@ -811,39 +811,40 @@ def cf_label_data(self, cf_data_var):
                 % self.cf_name
             )

-        label_data = self[:]
-
-        if ma.isMaskedArray(label_data):
-            label_data = label_data.filled(b"\0")
-
-        # Determine whether we have a string-valued scalar label
-        # i.e. a character variable that only has one dimension (the length of the string).
-        if self.ndim == 1:
-            label_string = b"".join(label_data).strip()
-            label_string = label_string.decode("utf8")
-            data = np.array([label_string])
-        else:
-            # Determine the index of the string dimension.
-            str_dim = self.dimensions.index(str_dim_name)
-
-            # Calculate new label data shape (without string dimension) and create payload array.
-            new_shape = tuple(
-                dim_len for i, dim_len in enumerate(self.shape) if i != str_dim
-            )
-            string_basetype = "|U%d"
-            string_dtype = string_basetype % self.shape[str_dim]
-            data = np.empty(new_shape, dtype=string_dtype)
-
-            for index in np.ndindex(new_shape):
-                # Create the slice for the label data.
-                if str_dim == 0:
-                    label_index = (slice(None, None),) + index
-                else:
-                    label_index = index + (slice(None, None),)
-
-                label_string = b"".join(label_data[label_index]).strip()
-                label_string = label_string.decode("utf8")
-                data[index] = label_string
+        data = self[:]
+        # label_data = self[:]
+        #
+        # if ma.isMaskedArray(label_data):
+        #     label_data = label_data.filled(b"\0")
+        #
+        # # Determine whether we have a string-valued scalar label
+        # # i.e. a character variable that only has one dimension (the length of the string).
+        # if self.ndim == 1:
+        #     label_string = b"".join(label_data).strip()
+        #     label_string = label_string.decode("utf8")
+        #     data = np.array([label_string])
+        # else:
+        #     # Determine the index of the string dimension.
+        #     str_dim = self.dimensions.index(str_dim_name)
+        #
+        #     # Calculate new label data shape (without string dimension) and create payload array.
+        #     new_shape = tuple(
+        #         dim_len for i, dim_len in enumerate(self.shape) if i != str_dim
+        #     )
+        #     string_basetype = "|U%d"
+        #     string_dtype = string_basetype % self.shape[str_dim]
+        #     data = np.empty(new_shape, dtype=string_dtype)
+        #
+        #     for index in np.ndindex(new_shape):
+        #         # Create the slice for the label data.
+        #         if str_dim == 0:
+        #             label_index = (slice(None, None),) + index
+        #         else:
+        #             label_index = index + (slice(None, None),)
+        #
+        #         label_string = b"".join(label_data[label_index]).strip()
+        #         label_string = label_string.decode("utf8")
+        #         data[index] = label_string

         return data
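The replacement read path simply returns self[:] and keeps the old manual byte-joining logic as comments. Judging from the new tests added below, the intent appears to be to lean on netCDF4-python's automatic character/string conversion, which is triggered when an "S1" variable carries an _Encoding attribute. A minimal standalone sketch of that behaviour, outside Iris (the file and variable names here are made up, not part of the commit):

    import netCDF4
    import numpy as np

    path = "chars_demo.nc"  # hypothetical scratch file
    with netCDF4.Dataset(path, "w") as ds:
        ds.createDimension("x", 2)
        ds.createDimension("nstr", 8)
        var = ds.createVariable("labels", "S1", ("x", "nstr"))
        # Write raw character data first and set the encoding afterwards (the
        # same ordering the new test file uses), so the write is not converted.
        chars = np.zeros((2, 8), dtype="S1")
        for i, text in enumerate(["abc", "defgh"]):
            for j, byte in enumerate(text.encode("utf-8")):
                chars[i, j] = bytes([byte])
        var[:] = chars
        var._Encoding = "utf-8"

    with netCDF4.Dataset(path) as ds:
        labels = ds.variables["labels"][:]
        # With _Encoding present, this is expected to be a decoded string array
        # (e.g. ['abc' 'defgh'], dtype '<U8'), not the raw (2, 8) byte array.
        print(labels, labels.dtype)
        # The equivalent manual conversion, roughly what the commented-out
        # Iris code above used to do by hand:
        print(netCDF4.chartostring(chars, encoding="utf-8"))

If that automatic conversion is indeed what the loader now relies on, the removed block becomes redundant for reads, which matches the commit message's "reads working".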

Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
# Copyright Iris contributors
#
# This file is part of Iris and is released under the BSD license.
# See LICENSE in the root of the repository for full licensing details.
"""Integration tests for various uses of character/string arrays in netcdf file variables.

This covers both the loading and saving of variables which are the content of
data-variables, auxiliary coordinates, ancillary variables and -possibly?- cell measures.
"""

from pathlib import Path

import numpy as np
import pytest

import iris
from iris.fileformats.netcdf import _thread_safe_nc

N_XDIM = 3
N_CHARS_DIM = 64
COORD_ON_SEPARATE_DIM = True
PERSIST_TESTFILES = "~/chararray_testfiles"


NO_ENCODING_STR = "<noencoding>"
TEST_ENCODINGS = [
    NO_ENCODING_STR,
    "ascii",
    "utf-8",
    # "iso8859-1",  # a common one-byte-per-char "codepage" type
    # "utf-16",
    "utf-32",
]


#
# Routines to convert between byte and string arrays.
# Independently defined here, to avoid relying on any code we are testing.
#
def convert_strings_to_chararray(
    string_array_1d: np.ndarray, maxlen: int, encoding: str | None = None
):
    # Note: this is limited to 1-D arrays of strings.
    # Could generalise that if needed, but for now this makes it simpler.
    if encoding is None:
        encoding = "ascii"
    bbytes = [text.encode(encoding) for text in string_array_1d]
    pad = b"\0" * maxlen
    bbytes = [(x + pad)[:maxlen] for x in bbytes]
    chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes])
    return chararray


def convert_bytearray_to_strings(
    byte_array, encoding="utf-8", string_length: int | None = None
):
    """Convert bytes to strings.

    N.B. for now at least, we assume the string dim is **always the last one**.
    """
    bytes_shape = byte_array.shape
    var_shape = bytes_shape[:-1]
    if string_length is None:
        string_length = bytes_shape[-1]
    string_dtype = f"U{string_length}"
    result = np.empty(var_shape, dtype=string_dtype)
    for ndindex in np.ndindex(var_shape):
        element_bytes = byte_array[ndindex]
        bytes = b"".join([b if b else b"\0" for b in element_bytes])
        string = bytes.decode(encoding)
        result[ndindex] = string
    return result


def make_testfile(testfile_path: Path, encoding_str: str):
    """Create a test netcdf file.

    Also returns content strings (unicode or ascii versions).
    """
    if encoding_str == NO_ENCODING_STR:
        encoding = None
    else:
        encoding = encoding_str

    data_is_ascii = encoding in (None, "ascii")

    if data_is_ascii:
        coordvar_strings = ["mOnster", "London", "Amsterdam"]
        datavar_strings = ["bun", "Eclair", "sandwich"]
    else:
        coordvar_strings = ["Münster", "London", "Amsterdam"]
        datavar_strings = ["bun", "éclair", "sandwich"]

    coordvar_bytearray = convert_strings_to_chararray(
        string_array_1d=coordvar_strings, maxlen=N_CHARS_DIM, encoding=encoding
    )
    datavar_bytearray = convert_strings_to_chararray(
        string_array_1d=datavar_strings, maxlen=N_CHARS_DIM, encoding=encoding
    )

    ds = _thread_safe_nc.DatasetWrapper(testfile_path, "w")
    try:
        ds.createDimension("x", N_XDIM)
        ds.createDimension("nstr", N_CHARS_DIM)
        if COORD_ON_SEPARATE_DIM:
            ds.createDimension("nstr2", N_CHARS_DIM)
        v_xdim = ds.createVariable("x", int, dimensions=("x"))
        v_xdim[:] = np.arange(N_XDIM)

        v_co = ds.createVariable(
            "v_co",
            "S1",
            dimensions=(
                "x",
                "nstr2" if COORD_ON_SEPARATE_DIM else "nstr",
            ),
        )
        v_co[:] = coordvar_bytearray

        if encoding is not None:
            v_co._Encoding = encoding

        v_numeric = ds.createVariable(
            "v_numeric",
            float,
            dimensions=("x",),
        )
        v_numeric[:] = np.arange(N_XDIM)

        v_datavar = ds.createVariable(
            "v",
            "S1",
            dimensions=(
                "x",
                "nstr",
            ),
        )
        v_datavar[:] = datavar_bytearray

        if encoding is not None:
            v_datavar._Encoding = encoding

        v_datavar.coordinates = "v_co v_numeric"
    finally:
        ds.close()

    return testfile_path, coordvar_strings, datavar_strings


@pytest.fixture(params=TEST_ENCODINGS)
def encoding(request):
    return request.param


class TestReadEncodings:
    """Test loading of testfiles with encoded string data."""

    @pytest.fixture()
    def testdata(self, encoding, tmp_path):
        """Create a suitable valid testfile, and return expected string content."""
        if PERSIST_TESTFILES:
            tmp_path = Path(PERSIST_TESTFILES).expanduser()
        if encoding == "<noencoding>":
            filetag = "noencoding"
        else:
            filetag = encoding
        tempfile_path = tmp_path / f"sample_read_{filetag}.nc"
        testdata = make_testfile(testfile_path=tempfile_path, encoding_str=encoding)
        from iris.tests.integration.netcdf.test_chararrays import ncdump

        ncdump(tempfile_path)
        yield testdata

    def assert_no_load_problems(self):
        if len(iris.loading.LOAD_PROBLEMS.problems):
            probs = "\n".join(str(prob) for prob in iris.loading.LOAD_PROBLEMS.problems)
            assert probs == ""

    def test_valid_encodings(self, encoding, testdata):
        testfile_path, coordvar_strings, datavar_strings = testdata
        cube = iris.load_cube(testfile_path)
        self.assert_no_load_problems()
        assert cube.shape == (N_XDIM,)

        if encoding != "utf-32":
            expected_string_width = N_CHARS_DIM
        else:
            expected_string_width = (N_CHARS_DIM // 4) - 1
        assert cube.dtype == f"<U{expected_string_width}"
        assert np.all(cube.data == datavar_strings)
        coord_var = cube.coord("v_co")
        assert coord_var.dtype == f"<U{expected_string_width}"
        assert np.all(coord_var.points == coordvar_strings)
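The two conversion helpers at the top of the new module are deliberately independent of the code under test, so they can be sanity-checked on their own. A quick round-trip sketch (assuming convert_strings_to_chararray and convert_bytearray_to_strings are in scope; the new module's path is not shown in this diff, so no import line is given):

    import numpy as np

    strings = np.array(["Münster", "London", "Amsterdam"])
    chars = convert_strings_to_chararray(strings, maxlen=64, encoding="utf-8")
    print(chars.shape, chars.dtype)  # (3, 64) |S1 : one byte per cell, one row per string
    back = convert_bytearray_to_strings(chars, encoding="utf-8")
    # Trailing NUL padding is not significant when numpy compares fixed-width
    # strings, so the round-tripped values compare equal to the originals.
    assert np.all(back == strings)

Keeping these converters free of Iris and of netCDF4's own conversion machinery means the assertions in TestReadEncodings check the loader against independently constructed expectations.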
