From b28686ab67086af673059226d8c3643fe3182c63 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 14 Jan 2025 13:43:01 -0800 Subject: [PATCH 01/20] hypothesis draft --- apis/python/requirements_dev.txt | 1 + apis/python/tests/ht/__init__.py | 0 apis/python/tests/ht/_array_state_machine.py | 289 ++++++++ apis/python/tests/ht/_ht_test_config.py | 40 ++ apis/python/tests/ht/_ht_util.py | 631 ++++++++++++++++++ apis/python/tests/ht/_ledger.py | 221 ++++++ apis/python/tests/ht/conftest.py | 67 ++ apis/python/tests/ht/test_ht_dataframe.py | 517 ++++++++++++++ apis/python/tests/ht/test_ht_densendarray.py | 304 +++++++++ apis/python/tests/ht/test_ht_fastercsx.py | 454 +++++++++++++ apis/python/tests/ht/test_ht_indexer.py | 150 +++++ apis/python/tests/ht/test_ht_sparsendarray.py | 298 +++++++++ 12 files changed, 2972 insertions(+) create mode 100644 apis/python/tests/ht/__init__.py create mode 100644 apis/python/tests/ht/_array_state_machine.py create mode 100644 apis/python/tests/ht/_ht_test_config.py create mode 100644 apis/python/tests/ht/_ht_util.py create mode 100644 apis/python/tests/ht/_ledger.py create mode 100644 apis/python/tests/ht/conftest.py create mode 100644 apis/python/tests/ht/test_ht_dataframe.py create mode 100644 apis/python/tests/ht/test_ht_densendarray.py create mode 100644 apis/python/tests/ht/test_ht_fastercsx.py create mode 100644 apis/python/tests/ht/test_ht_indexer.py create mode 100644 apis/python/tests/ht/test_ht_sparsendarray.py diff --git a/apis/python/requirements_dev.txt b/apis/python/requirements_dev.txt index 8b3bfa2a4b..959beea239 100644 --- a/apis/python/requirements_dev.txt +++ b/apis/python/requirements_dev.txt @@ -5,3 +5,4 @@ ruff sparse typeguard==4.4.0 types-setuptools +hypothesis diff --git a/apis/python/tests/ht/__init__.py b/apis/python/tests/ht/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apis/python/tests/ht/_array_state_machine.py b/apis/python/tests/ht/_array_state_machine.py new file mode 100644 index 0000000000..e34ad982b1 --- /dev/null +++ b/apis/python/tests/ht/_array_state_machine.py @@ -0,0 +1,289 @@ +"""Hypothesis rule-based statemachine ABC for SOMAArray. + +Intended to be specialized for SparseNDArray, et al. 
+""" + +from __future__ import annotations + +import re +from abc import abstractmethod +from typing import Any, Literal, Protocol, TypeAlias + +import numpy as np +import pyarrow as pa +from hypothesis import strategies as st +from hypothesis.stateful import RuleBasedStateMachine, invariant, precondition, rule + +import tiledbsoma as soma + +from tests.ht._ht_test_config import HT_TEST_CONFIG + +SOMAArray: TypeAlias = soma.DataFrame | soma.SparseNDArray | soma.DenseNDArray + + +class SOMAArrayStateMachine(RuleBasedStateMachine): + """Abstract base class for a soma array Hypothesis state machine""" + + def __init__(self) -> None: + super().__init__() + self.context = soma.SOMATileDBContext() + self.closed: bool = True + self.mode: Literal["r", "w"] | None = None + self.A: SOMAArray | None = None + self.uri = self.TestCase.tmp_path_factory.mktemp( + f"{self.__class__.__name__}-" + ).as_posix() + self.metadata: dict[str, Any] = ( + {} + ) # XXX TODO: should be a ledger to allow for time travel + self.initial_metadata_keys: set[str] = set() + + def setup(self, A: SOMAArray) -> None: + assert isinstance(A, SOMAArray) + assert A.mode == "w" and not A.closed + self.A = A + self.create_timestamp_ms = self.A.tiledb_timestamp_ms + self.closed = self.A.closed + self.mode = self.A.mode + self.metadata = dict(self.A.metadata) + self.initial_metadata_keys = set(self.metadata) + + def teardown(self) -> None: + if self.A is not None: + if not self.closed: + self.A.close() + self.A = None + + super().teardown() + + @property + def is_initialized(self) -> bool: + return self.A is not None + + @abstractmethod + def _array_exists( + uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None + ) -> bool: + pass + + @abstractmethod + def _array_open(self, mode: str) -> None: + pass + + def _open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None: + assert self.A.closed + tiledb_timestamp = None # TODO/XXX: no time-travel for now. FIXME + self._array_open(mode=mode, tiledb_timestamp=tiledb_timestamp) + assert self.A is not None + self.closed = False + self.mode = mode + + def _close(self) -> None: + assert not self.A.closed + self.A.close() + self.closed = True + self.mode = None + + @abstractmethod + def _reopen(self, mode: str) -> None: + pass + + ## + ## ---- Open/close state + ## + + @precondition(lambda self: self.is_initialized) + def check_exists(self) -> None: + assert self._array_exists(self.uri, self.context, None) + + @precondition(lambda self: self.is_initialized) + @invariant() + def check_mode(self) -> None: + assert self.closed or self.mode == self.A.mode + + @precondition(lambda self: self.is_initialized) + @invariant() + def check_closed(self) -> None: + assert self.closed == self.A.closed + + @precondition(lambda self: not self.closed) + @rule() + def close(self) -> None: + self._close() + + @precondition(lambda self: self.closed) + @rule(mode=st.sampled_from(["r", "w"])) + def open(self, mode: str) -> None: + # TODO: time travel + self._open(mode=mode) + + @precondition( + lambda self: not HT_TEST_CONFIG["sc-61123_workaround"] + ) # TODO: this entire rule disabled until sc-61123 fixed. + @precondition(lambda self: not self.closed) + @precondition( + lambda self: not HT_TEST_CONFIG["sc-61118_workaround"] or self.mode != "w" + ) # TODO - fails due to loss of metadata on reopen from w->r. See sc-61118. Remove when fixed. 
+ @rule(mode=st.sampled_from(["r", "w"])) + def reopen(self, mode: str) -> None: + assert not self.A.closed + assert not self.closed + assert self.mode is not None + self.A = self.A.reopen( + mode, + tiledb_timestamp=None, # no time-travel for now + ) + self.mode = mode + assert self.A.mode == mode and not self.A.closed + + ## + ## --- metadata + ## + # TODO: sc-61092 causes SOMA to fail on writing a metadata value with a non-ASCII codepoint. + # TODO: due to sc-61093, zero length bytes and strings are mishandled (not written correctly). Remove the `min_size` when fixed. + # TODO: due to sc-61094, strings containing a zero code point also fail. + + METADATA_KEY_ALPHABET = ( + st.characters(codec="utf-8", exclude_characters=["\x00"]) + if HT_TEST_CONFIG["sc-61094_workaround"] + else st.characters(codec="utf-8") + ) + METADATA_KEYS = st.text(min_size=1, max_size=4096, alphabet=METADATA_KEY_ALPHABET) + + METADATA_VALUE_ALPHABET = ( + st.characters(codec="ascii", exclude_characters=["\x00"]) + if ( + HT_TEST_CONFIG["sc-61092_workaround"] + or HT_TEST_CONFIG["sc-61094_workaround"] + ) + else st.characters(codepoint="utf-8") + ) + METADATA_VALUES = st.one_of( + st.text( + alphabet=METADATA_VALUE_ALPHABET, + min_size=1 if HT_TEST_CONFIG["sc-61093_workaround"] else 0, + ) + | st.integers( + min_value=np.iinfo(np.int64).min, max_value=np.iinfo(np.int64).max + ) + | st.floats( + allow_nan=False + ) # FIXME: disabled NaNs make assertions easier (they are supported) + ) + + IGNORE_KEYS = re.compile(r"^soma_dim_[0-9]+_domain_(upper|lower)$") + + @precondition(lambda self: not self.closed) + @invariant() + def check_metadata(self) -> None: + # Prior to tiledbsoma 1.16, the "used domain" keys were still included. Ignore them. + # TODO: we could generalize this by removing _all_ keys that are reserved soma_* keys. + array_metadata = { + k: v for k, v in self.A.metadata.items() if not self.IGNORE_KEYS.match(k) + } + assert array_metadata == self.metadata + + @precondition( + lambda self: not self.closed and self.mode == "w" and len(self.metadata) < 100 + ) + @rule(k=METADATA_KEYS, v=METADATA_VALUES) + def set_metadata(self, k: str, v: str | int | float) -> None: + self.metadata[k] = v + self.A.metadata[k] = v + + @precondition( + lambda self: not self.closed + and self.mode == "w" + and len(self.metadata) > len(self.initial_metadata_keys) + ) + @precondition(lambda self: not self.closed) + @rule(data=st.data()) + def del_metadata(self, data: st.DataObject) -> None: + k = data.draw( + st.sampled_from( + [ + kn + for kn in self.metadata.keys() + if kn not in self.initial_metadata_keys + ] + ) + ) + del self.metadata[k] + del self.A.metadata[k] + + +class ShapesFactory(Protocol): + """Factory for a strategy returning ndarray shape.""" + + def __call__( + self, + *, + min_shape: tuple[int, ...] | None = None, + max_shape: tuple[int, ...] | None = None, + ) -> st.SearchStrategy[tuple[int | None, ...]]: ... 
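For reference, a ShapesFactory is just a callable returning a Hypothesis strategy for shape
tuples bounded by min_shape/max_shape; the concrete factories live in the test modules
(e.g. dense_array_shape later in this patch). A minimal conforming sketch, illustrative only
and not part of the patch (`toy_shapes_factory` is a hypothetical name):

    from __future__ import annotations

    import hypothesis.strategies as st

    @st.composite
    def toy_shapes_factory(
        draw: st.DrawFn,
        *,
        min_shape: tuple[int, ...] | None = None,
        max_shape: tuple[int, ...] | None = None,
    ) -> tuple[int, ...]:
        # Dimensionality comes from whichever bound was supplied; default to 1-D.
        ndim = len(min_shape or max_shape or (1,))
        lo = min_shape or (1,) * ndim
        hi = max_shape or (1024,) * ndim
        # Draw each dimension independently within [lo[i], hi[i]].
        return tuple(
            draw(st.integers(min_value=lo[i], max_value=hi[i])) for i in range(ndim)
        )

Injecting the factory lets each NDArray state machine supply its own shape strategy
(dense_array_shape for the dense case) while sharing the expand_shape rule defined in
SOMANDArrayStateMachine below.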
+ + +class SOMANDArrayStateMachine(SOMAArrayStateMachine): + """Abstract base class for NDArray Hypothesis state machine.""" + + def __init__(self, shapes_factory: ShapesFactory) -> None: + super().__init__() + self.shapes_factory = shapes_factory + + def setup(self, type, shape, array) -> None: + super().setup(array) + self.type = type + self.schema = pa.schema( + [ + pa.field(f"soma_dim_{n}", pa.int64(), nullable=False) + for n in range(len(shape)) + ] + + [pa.field("soma_data", self.type, nullable=False)] + ) + assert all((shape[i] or 1) == self.A.shape[i] for i in range(len(shape))) + assert self.schema == self.A.schema + self.shape = tuple( + (shape[i] or 1) for i in range(len(shape)) + ) # XXX TODO: shape should be a ledger + + ## + ## --- schema + ## + + @precondition(lambda self: not self.closed) + @invariant() + def check_schema(self) -> None: + schema = self.A.schema + assert len(schema.types) == len(self.shape) + 1 + assert schema.field("soma_data").type == self.type + for idx in range(len(self.shape)): + assert schema.names[idx] == f"soma_dim_{idx}" + assert schema.types[idx] == pa.int64() + assert schema.field(f"soma_dim_{idx}").type == pa.int64() + assert self.A.schema == self.schema + + ## + ## --- shape + ## + + @precondition(lambda self: not self.closed) + @invariant() + def check_shape(self) -> None: + assert hasattr(self.A, "shape") # sc-61123 + assert self.A.shape == tuple( + (s or 1) for s in self.shape + ), f"Unexpected shape in {self.A}: had {self.A.shape}, expected {self.shape}" + assert self.A.ndim == len(self.shape) + + @precondition(lambda self: self.closed or self.mode == "w") + @rule(data=st.data()) + def expand_shape(self, data: st.DataObject) -> None: + if self.closed: + self._open(mode="w") + assert self.mode == "w" + new_shape = data.draw( + self.shapes_factory(min_shape=self.shape, max_shape=self.A.maxshape) + ) + self.A.resize(new_shape) + self.shape = new_shape + self._close() # resize is committed upon close diff --git a/apis/python/tests/ht/_ht_test_config.py b/apis/python/tests/ht/_ht_test_config.py new file mode 100644 index 0000000000..3250820a65 --- /dev/null +++ b/apis/python/tests/ht/_ht_test_config.py @@ -0,0 +1,40 @@ +"""Config settings for all Hypothesis tests. Primarily used to toggle bug work-arounds, etc. +""" + +HT_TEST_CONFIG = { + # + # Defect work-arounds, while awaiting a fix + # + # data corruption due to incorrect Arrow array offset handling + "sc-61239_workaround": True, + # creating array with timestamp==0 fails in 1.15 (regression) + "sc-61054_workaround": True, + # Tables returned by SparseNDArray.read have incorrect nullability in schema fields + "sc-61222_workaround": True, + # SparseNDArray.read returns table with type==int64 when array schema has type==timestamp[us] + "sc-61227_workaround": True, + # reopen return mangled object + "sc-61123_workaround": True, + # reopen w->r loses all metadata modifications + "sc-61118_workaround": True, + # metadata VALUES with non-ASCII codepoints generate an error + "sc-61092_workaround": True, + # Zero-length strings as a metadata value are stored incorrectly + "sc-61093_workaround": True, + # metadata keys with a zero codepoint are saved as empty string + "sc-61094_workaround": True, + # dataframe column names of \x00 silently mutated to empty Python string + "sc-61291_workaround": True, + # DataFrame.write creates 1+ fragments (one per table chunk) + "sc-61462_workaround": True, + # Core does not correctly de-dup 0. and -0. 
on float dimensions + "sc-61506_workaround": True, + # DenseNDArray can't read timestamps + "sc-61743_workaround": True, + # Read of new array returns incorrect info + "sc-61676_workaround": True, + # + # Enable/disable partially implemented features + # + "allow_nullable": False, +} diff --git a/apis/python/tests/ht/_ht_util.py b/apis/python/tests/ht/_ht_util.py new file mode 100644 index 0000000000..07b22047e3 --- /dev/null +++ b/apis/python/tests/ht/_ht_util.py @@ -0,0 +1,631 @@ +"""Utilities for use with hypothesis -- mostly search strategies.""" + +from __future__ import annotations + +import datetime +from itertools import pairwise +from typing import Any, Mapping, Sequence + +import hypothesis.extra.numpy as ht_np +import numpy as np +import numpy.typing as npt +import pandas as pd +import pyarrow as pa +from hypothesis import strategies as st + +from tests.ht._ht_test_config import HT_TEST_CONFIG + +Shape = tuple[int, ...] +ArrowSlice = tuple[int, int] + + +def everything_except(excluded_types: type) -> st.SearchStrategy[type]: + """Create a strategy for all types exclusive of those specified. + + Example: + everything_except(int|float) + + """ + return ( + st.from_type(type) + .flatmap(st.from_type) + .filter(lambda x: not isinstance(x, excluded_types)) + ) + + +def resolve_dtype( + draw: st.DrawFn, + dtype: npt.DTypeLike | pa.DataType | st.SearchStrategy[npt.DTypeLike | pa.DataType], +) -> np.dtype: + """resolve the dtype argument to a numpy.dtype. Helper for search strategies.""" + if isinstance(dtype, st.SearchStrategy): + dtype = draw(dtype) + if isinstance(dtype, pa.DataType): + dtype = dtype.to_pandas_dtype() + dtype = np.dtype(dtype) + return dtype + + +def from_datatype( + datatype: pa.DataType, *args, **kwargs +) -> st.SearchStrategy[pa.Scalar]: + """Strategy to return an element of the given type.""" + if datatype in [pa.binary(), pa.large_binary()]: + return st.binary(*args, **kwargs).map(lambda v: pa.scalar(v, type=datatype)) + elif datatype in [pa.string(), pa.large_string()]: + return st.text(*args, **kwargs).map(lambda v: pa.scalar(v, type=datatype)) + elif datatype == pa.null(): + return st.none() + elif pa.types.is_timestamp(datatype): + allow_nan = kwargs.get("allow_nan", False) + + # NEP-7 defines the NaT value as integer -2**63 + min_value = pa.scalar(kwargs.get("min_value", -(2**63) + 1), type=datatype) + max_value = pa.scalar(kwargs.get("max_value", 2**63 - 1), type=datatype) + + elems = ( + st.integers(min_value.value, max_value.value) | st.none() + if allow_nan + else st.integers(min_value.value, max_value.value) + ) + return elems.map(lambda v: pa.scalar(v, type=datatype)) + else: + return ht_np.from_dtype( + np.dtype(datatype.to_pandas_dtype()), *args, **kwargs + ).map(lambda v: pa.scalar(v, type=datatype)) + + +def tiledb_timestamps(from_future: bool = False): + """Strategy which generates POSIX / TileDB timestamps, aka ints from 0 to now. 
+ + NB: bug sc-61054 is triggered with timestamp==0, so generate only timestamps 0 pa.DataType: + return draw(st.sampled_from((pa.int8(), pa.int16(), pa.int32(), pa.int64()))) + + +@st.composite +def arrow_unsigned_integer_datatypes(draw: st.DrawFn) -> pa.DataType: + return draw(st.sampled_from((pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()))) + + +@st.composite +def arrow_floating_datatypes(draw: st.DrawFn) -> pa.DataType: + return draw(st.sampled_from((pa.float16(), pa.float32(), pa.float64()))) + + +@st.composite +def arrow_timestamp_datatypes(draw: st.DrawFn) -> pa.DataType: + return pa.timestamp( + unit=draw(st.sampled_from(("s", "ms", "us", "ns"))), + tz=draw(st.sampled_from((None, "UTC", "Europe/London"))), + ) + + +@st.composite +def arrow_datetime_datatypes(draw: st.DrawFn) -> pa.DataType: + return draw( + st.sampled_from( + ( + pa.time32(unit=draw(st.sampled_from(("s", "ms")))), + pa.time64(unit=draw(st.sampled_from(("us", "ns")))), + pa.date32(), + pa.date64(), + ) + ) + ) + + +@st.composite +def arrow_decimal_datatypes(draw: st.DrawFn) -> pa.DataType: + return draw( + st.sampled_from( + ( + pa.decimal128( + precision=draw(st.integers(min_value=1, max_value=38)), + scale=draw(st.integers(min_value=-(2**31), max_value=2**31 - 1)), + ), + pa.decimal256( + precision=draw(st.integers(min_value=1, max_value=76)), + scale=draw(st.integers(min_value=-(2**31), max_value=2**31 - 1)), + ), + ) + ) + ) + + +@st.composite +def arrow_datatypes(draw: st.DrawFn) -> pa.DataType: + return draw( + st.one_of( + arrow_integer_datatypes(), + arrow_unsigned_integer_datatypes(), + arrow_floating_datatypes(), + st.sampled_from( + ( + pa.null(), + pa.bool_(), + pa.binary(length=draw(st.integers(min_value=-1, max_value=1024))), + pa.string(), + pa.large_binary(), + pa.large_string(), + ) + ), + arrow_timestamp_datatypes(), + arrow_datetime_datatypes(), + arrow_decimal_datatypes(), + ) + ) + + +def ndarray_datatype() -> st.SearchStrategy: + return st.from_type(pa.DataType).filter( + lambda t: ( + pa.types.is_primitive(t) + and not (pa.types.is_timestamp(t) and t.tz is not None) + and not pa.types.is_time(t) + and not pa.types.is_date(t) + and t + not in [ + pa.float16(), + ] + ) + ) + + +@st.composite +def arrow_schema_field_name(draw: st.DrawFn) -> str: + # TileDB attribute names may not start with '__' + elements = st.text(min_size=1).filter(lambda n: not n.startswith("__")) + if HT_TEST_CONFIG["sc-61291_workaround"]: + elements = elements.filter(lambda n: "\x00" not in n) + return draw(elements) + + +@st.composite +def arrow_schema( + draw: st.DrawFn, + max_fields: int | None = None, + unique_field_names: bool = False, + required_fields: Sequence[pa.Field] = (), + elements: st.SearchStrategy[pa.DataType] | None = None, +) -> pa.Schema: + # A schema must have at least one index column and one attribute column + max_fields = 100 if max_fields is None else max_fields + assert max_fields > 1 + + fields = {f.name: f for f in required_fields} + if "soma_joinid" not in fields and draw(st.booleans()): + fields["soma_joinid"] = pa.field("soma_joinid", nullable=False, type=pa.int64()) + + elements = arrow_datatypes() if elements is None else elements + + # NB: no metadata in Arrow schema + n_fields = draw(st.integers(min_value=2, max_value=max_fields)) + for n in range(n_fields - len(fields)): + while True: + field_name = draw(arrow_schema_field_name()) + if not unique_field_names or field_name not in fields: + break + + field_type = draw(elements) + field_nullable = True if field_type == pa.null() else 
draw(st.booleans()) + fields[field_name] = pa.field( + field_name, nullable=field_nullable, type=field_type + ) + + return pa.schema(list(fields.values())) + + +@st.composite +def arrow_shape( + draw: st.DrawFn, + shape: ( + int | st.SearchStrategy[int] | Shape | st.SearchStrategy[Shape] | None + ) = None, +) -> Shape: + if isinstance(shape, st.SearchStrategy): + shape = draw(shape) + if shape is None: + shape = draw(st.integers(max_value=1024)) + if isinstance(shape, np.generic): + shape = shape.item() + if isinstance(shape, np.ndarray): + shape = tuple(shape.tolist()) + if isinstance(shape, int): + shape = (shape,) + if isinstance(shape, tuple) and len(shape) == 1 and shape[0] >= 0: + return shape + raise ValueError("Invalid shape argument - specify 1D shape.") + + +@st.composite +def arrow_slice(draw: st.DrawFn, size: int) -> ArrowSlice: + """Return (offset, length) suitable for Array.slice or ChunkedArray.slice.""" + if size <= 0: + return (0, 0) + offset = draw(st.integers(min_value=0, max_value=size - 1)) + length = draw(st.integers(min_value=0, max_value=size - offset - 1)) + return (offset, length) + + +def pad_array(arr: npt.NDArray[Any], draw: st.DrawFn) -> pa.Array: + """Strategy helper: add padding to one or both ends of the array. This tests for Arrow array "offset" handling.""" + + if HT_TEST_CONFIG.get("sc-61239_workaround", False): + return pa.array(arr) + + head = draw(st.integers(min_value=0, max_value=16)) + tail = draw(st.integers(min_value=0, max_value=16)) + if not bool(head or tail): + return pa.array(arr) + + padding = draw(st.from_type(arr.dtype.type)) + + shape = (arr.shape[0] + head + tail, *arr.shape[1:]) + padded_arr = np.empty_like(arr, shape=shape) + padded_arr[0:head] = padding + padded_arr[head : head + len(arr)] = arr + padded_arr[head + len(arr) :] = padding + return pa.array(padded_arr)[head : head + len(arr)] + + +@st.composite +def arrow_array( + draw: st.DrawFn, + dtype: npt.DTypeLike | pa.DataType | st.SearchStrategy[npt.DTypeLike | pa.DataType], + shape: int | st.SearchStrategy[int] | Shape | st.SearchStrategy[Shape], + *, + elements: st.SearchStrategy[Any] | Mapping[str, Any] | None = None, + fill: st.SearchStrategy[Any] | None = None, + unique: bool = False, + padding: bool = True, +) -> pa.Array: + """Wrapper around hypothesis.extra.numpy.arrays, which returns value as a PyArrow Array. + + NB: this is quite slow for large arrays. See arrow_array_fast for a limited, but faster variant. + This variant retained for flexibility (vs speed). + """ + dtype = resolve_dtype(draw, dtype) + shape = draw(arrow_shape(shape)) + + if ( + not HT_TEST_CONFIG["allow_nullable"] + and dtype.kind in ["m", "M"] + and elements is None + ): + # NaT gets turned into a nulled position by pyarrow.array + elements = {"allow_nan": False} + + nparr = draw( + ht_np.arrays( + dtype=dtype, shape=shape, unique=unique, elements=elements, fill=fill + ) + ) + arr = pad_array(nparr, draw) if padding else pa.array(nparr) + + # sanity check + assert HT_TEST_CONFIG["allow_nullable"] or not pa.compute.any(arr.is_null()).as_py() + + return arr + + +@st.composite +def arrow_array_fast( + draw: st.DrawFn, + dtype: npt.DTypeLike | pa.DataType | st.SearchStrategy[npt.DTypeLike | pa.DataType], + shape: int | st.SearchStrategy[int] | Shape | st.SearchStrategy[Shape], + *, + unique: bool = False, + padding: bool = True, + min_value: Any = None, + max_value: Any = None, +) -> pa.Array: + """Faster, but limited version of arrow_array search strategy. 
+ + Only supports a subset of types, all positions randomly generated (no fill), + and no control over element values (e.g., no ``elements`` argument). Importantly + this means no NaN for floats, etc. + """ + + def gen_unique_floats( + rng: np.random.Generator, lo: float, hi: float, n: int + ) -> npt.NDArray[np.float64]: + out = np.empty(n) + needed = n + while needed != 0: + arr = rng.uniform(lo, hi, needed) + uniqs = np.setdiff1d(np.unique(arr), out[: n - needed]) + out[n - needed : n - needed + uniqs.size] = uniqs + needed -= uniqs.size + rng.shuffle(out) + return out + + dtype = resolve_dtype(draw, dtype) + + shape = draw(arrow_shape(shape)) + length = shape[0] + + rng = np.random.default_rng(seed=draw(st.integers(min_value=0))) + match dtype.kind: + case "f": + low = min_value if min_value is not None else -np.finfo(dtype).max / 2 + high = max_value if max_value is not None else np.finfo(dtype).max / 2 + if unique: + nparr = gen_unique_floats(rng, low, high, length).astype(dtype) + else: + nparr = rng.uniform(low, high=high, size=length).astype(dtype) + + case "i" | "u": + # RNG draws max of int64 + low = int(min_value) if min_value is not None else -np.iinfo(dtype).max + high = int(max_value) if max_value is not None else np.iinfo(dtype).max + if (high - low) < np.iinfo(np.int64).max: + if high > low: + nparr = rng.choice(high - low, size=length, replace=(not unique)) + else: + nparr = np.full(shape=shape, fill_value=low, dtype=dtype) + nparr += low + else: + nparr = rng.choice( + np.iinfo(np.int64).max, size=length, replace=(not unique) + ) + if min_value is not None: + nparr += low + else: + nparr -= np.iinfo(dtype).max // 2 + + nparr = nparr.astype(dtype) + + case "M": + # TODO: implement min_value/max_value + assert min_value is None and max_value is None + nparr = rng.choice( + np.iinfo(np.int64).max, size=length, replace=(not unique) + ) + nparr = nparr.astype(dtype) + + case "b": + # TODO: implement min_value/max_value + assert min_value is None and max_value is None + nparr = rng.choice([True, False], size=length, replace=(not unique)) + + case _: + raise TypeError(f"Unsupported dtype: {dtype}") + + return pad_array(nparr, draw) if padding else pa.array(nparr) + + +@st.composite +def arrow_chunked_array_fast( + draw: st.DrawFn, + dtype: npt.DTypeLike | pa.DataType | st.SearchStrategy[npt.DTypeLike | pa.DataType], + shape: int | st.SearchStrategy[int] | Shape | st.SearchStrategy[Shape], + *, + unique: bool = False, + padding: bool = True, + min_value: Any = None, + max_value: Any = None, + splits: int | Sequence[int] | st.SearchStrategy[Sequence[int]] | None = None, +) -> pa.ChunkedArray: + + shape = draw(arrow_shape(shape)) + length = shape[0] + + arr = draw( + arrow_array_fast( + dtype=dtype, + shape=length, + unique=unique, + padding=padding, + min_value=min_value, + max_value=max_value, + ) + ) + + # sometimes, we want multiple (separate) underlying arrays, just to mix things up and + # ensure we are not generating only contiguous buffers. + if draw(st.booleans()): + # split in half, and copy values of second half to entirely different memory location. + # Must round-trip through NumPy as PyArrow doesn't appear to have a "copy" operator + # available (except in very recent versions of the package). 
+ # + first_half = arr[0 : len(arr) // 2] + second_half = arr[len(arr) // 2 :].to_numpy().copy() + if padding: + second_half = pad_array(second_half, draw) + arr = pa.chunked_array([first_half, second_half]) + + if splits is None: + splits = 0 + elif isinstance(splits, st.SearchStrategy): + splits = draw(splits) + if isinstance(splits, int): + splits = draw(splitss(n_splits=min(splits, len(arr)), max_value=len(arr))) + + return split_arrow_array(arr, splits) + + +@st.composite +def splitss( + draw: st.DrawFn, n_splits: int | st.SearchStrategy[int], max_value: int +) -> list[int]: + if n_splits == 0: + return [] + rng = np.random.default_rng(seed=draw(st.integers(min_value=0))) + splits = rng.choice(max_value, size=n_splits, replace=False) + splits.sort() + return splits.tolist() + + +def split_arrow_array( + arr: pa.Array | pa.ChunkedArray, splits: list[int] +) -> pa.ChunkedArray: + assert np.array_equal(np.unique(splits), splits), "splits not unique and sorted" + assert len(splits) == 0 or ( + splits[0] >= 0 and splits[-1] < len(arr) + ), "splits out of range" + + split_points = [0] + splits + [len(arr)] + arr_splits = [arr[st:sp] for st, sp in pairwise(split_points)] + return pa.chunked_array(arr_splits, type=arr.type) + + +@st.composite +def random_length_tuple( + draw, elements=st.integers(), min_length: int = 0, max_length: int = 10 +): + """Generates a tuple of random length with elements drawn from the provided strategy.""" + length = draw(st.integers(min_value=min_length, max_value=max_length)) + return tuple(draw(st.lists(elements, min_size=length, max_size=length))) + + +@st.composite +def contiguous_slices(draw: Any, size: int) -> slice: + """Generates slices that will select indices up to the supplied size, always with + stride of 1. + + Based on hypothesis.strategies.slices() + """ + if size == 0: + step = draw(st.sampled_from([None, 1])) + return slice(None, None, step) + + # For slices start is inclusive and stop is exclusive + start = draw(st.integers(0, size) | st.none()) + stop = draw(st.integers(0, size) | st.none()) + start, stop = (stop, start) if (start or 0) > (stop or 0) else (start, stop) + step = 1 + + if draw(st.booleans()) and start is not None: + start -= size + if draw(st.booleans()) and stop is not None: + stop -= size + if (not draw(st.booleans())) and step == 1: + step = None + + return slice(start, stop + 1 if stop is not None else stop, step) + + +def schemas_equal(s1: pa.Schema, s2: pa.Schema, ignore_field_order=False) -> bool: + """NB: assumes all field names are unique! Raises if not.""" + if not ignore_field_order: + return s1 == s2 + else: + if len(s2) != len(s1): + return False + return all( + s1.field(field_name) == s2.field(field_name) for field_name in s1.names + ) + + +def arrays_equal( + read: pa.ChunkedArray, expected: pa.ChunkedArray, *, equal_nan: bool = False +) -> bool: + """Zero copy test for array equality, optionally allowing NaN==NaN.""" + + # TODO: handle nullable arrays + + if (read.type != expected.type) or (len(read) != len(expected)): + return False + + if not pa.types.is_floating(expected.type): + return expected.equals(read) + + # Floating point path, to allow for NaN. 
Implemented with NumPy for convenience only + return all( + np.array_equal(r.to_numpy(), e.to_numpy(), equal_nan=equal_nan) + for r, e in zip(read.chunks, expected.chunks) + ) + + +def tables_equal( + read: pa.Table, expected: pa.Table, *, equal_nan: bool = False +) -> bool: + """Test for table equality, optionally allowing NaN==NaN.""" + + # TODO: handle nullable arrays + + read_schema = read.schema + expected_schema = expected.schema + + # checking field order and length up front simplifies code below + if [f.name for f in read_schema] != [f.name for f in expected_schema]: + return False + + if HT_TEST_CONFIG["sc-61222_workaround"]: + # because sc-61222, where read returns tables with nullable missing from + # the schema, we need to cast the table before comparing. + read_schema = pa.schema([f.with_nullable(False) for f in read_schema]) + expected_schema = pa.schema([f.with_nullable(False) for f in expected_schema]) + + if HT_TEST_CONFIG["sc-61227_workaround"]: + # because sc-61227, read returns `int64` when we expect `timestamp[us]` + for fidx, field in enumerate(expected.schema): + if ( + field.type == pa.timestamp("us") + and read_schema.field(fidx).type == pa.int64() + ): + read_schema = read_schema.set( + fidx, read_schema.field(fidx).with_type(field.type) + ) + + if (read_schema != expected_schema) or len(read) != len(expected): + return False + + expected = expected.cast(expected_schema) + read = read.cast(read_schema) + is_eq = all(arrays_equal(r, e, equal_nan=equal_nan) for r, e in zip(read, expected)) + return is_eq + + +def df_to_table(df: pd.DataFrame, *, schema: pa.Schema | None = None) -> pa.Table: + + # Table.from_pandas attempts to infer nulled values (e.g., NaN->null, NaT->null). + # We often do not want this behavior, so explicitly override it with `from_pandas=False` + # paramter of pa.array(). + + # NB: this doesn't work with NaT/timestamp64. We could pass a `mask` param to `to_numpy`, + # but NaT is such a strange beast, leaving as is for now. + + schema = pa.Schema.from_pandas(df).remove_metadata() if schema is None else schema + tbl = pa.Table.from_pydict( + {k: pa.array(v, from_pandas=False) for k, v in df.items()}, schema=schema + ) + return tbl + + +def posix_filename() -> st.SearchStrategy: + return st.text( + alphabet=st.characters( + codec="ascii", + categories=["Lu", "Ll", "Nd"], + include_characters=["_", "-", "."], + ), + min_size=1, + ).filter(lambda fn: fn not in [".", ".."]) diff --git a/apis/python/tests/ht/_ledger.py b/apis/python/tests/ht/_ledger.py new file mode 100644 index 0000000000..921ff6aa61 --- /dev/null +++ b/apis/python/tests/ht/_ledger.py @@ -0,0 +1,221 @@ +"""Ledger/log to model fragment/schema/metadata log entries.""" + +from __future__ import annotations + +import pathlib +import re +from abc import ABCMeta, abstractmethod +from typing import Generic, Sequence, TypeVar + +import numpy as np +import pandas as pd +import pyarrow as pa + +from tests.ht._ht_util import schemas_equal + + +def get_entries(path: str | pathlib.Path) -> set[str]: + """Get log entry names from directory, and return in canonical order. + + This is used to determine, by inspection, the names that TileDB Core + assigns to log entries, such as write fragments, metadata tiles, etc. 
+ """ + dir = pathlib.Path(path) + children = [p.relative_to(dir).as_posix() for p in dir.iterdir()] + entries = [c for c in children if re.match(r"__[0-9]+_[0-9]+_[0-9a-fA-F]+", c)] + entries.sort() + return entries + + +LedgerEntryDataType = TypeVar("LedgerEntryDataType") + + +class LedgerEntry(Generic[LedgerEntryDataType], metaclass=ABCMeta): + """An abstract consistent unit of written data, such as a fragment.""" + + def __init__(self, timestamp_ms: int, name: str, data: LedgerEntryDataType) -> None: + self.timestamp_ms: int = timestamp_ms + self.name = name + self.data = data + + @abstractmethod + def consolidate_with( + self, other: LedgerEntry[LedgerEntryDataType], allow_duplicates: bool + ) -> LedgerEntry[LedgerEntryDataType]: + pass + + +class Ledger(Generic[LedgerEntryDataType]): + def __init__( + self, + initial_entry: LedgerEntry[LedgerEntryDataType], + *, + allows_duplicates: bool = False, + ) -> None: + self.entries: list[LedgerEntry[LedgerEntryDataType]] = [initial_entry] + self.initial_entry = ( + initial_entry # XXX: do we need this or can we use entries[0]? + ) + self.allows_duplicates = allows_duplicates + + # multiple fragments with same timestamp are trouble. Just disallow for now + # or we will have unstable tests. See sc-61223. When this is fixed, and there + # is a stable read order, we could in principle reproduce that order (assuming + # we know the UUID of each fragment, which requires sc-61226) + self.timestamps = set() + + def __repr__(self) -> str: + return ( + f"Ledger(n_entries={len(self.entries)}, " + + f"allows_duplicates={self.allows_duplicates}):\n" + + "\n".join(repr(f) for f in self.entries) + + "\n" + ) + + def read(self, timestamp_ms: int) -> LedgerEntry[LedgerEntryDataType]: + """Return a single ledger entry representing all writes <= timestamp""" + assert len(self.entries) > 0 + entries_to_consolidate = sorted( + filter(lambda f: f.timestamp_ms <= timestamp_ms, self.entries), + key=lambda f: (f.timestamp_ms, f.name), + ) + consolidated_result = entries_to_consolidate[0] + for entry in entries_to_consolidate[1:]: + consolidated_result = consolidated_result.consolidate_with( + entry, self.allows_duplicates + ) + return consolidated_result + + def write(self, entry: LedgerEntry[LedgerEntryDataType]) -> None: + """Write new entry to the ledger.""" + assert entry.timestamp_ms >= 0 + assert type(entry) is type(self.initial_entry) + + if entry.timestamp_ms in self.timestamps: + raise ValueError("Timestamp already written - may lead to unstable test.") + + self.entries.append(entry) + self.timestamps.add(entry.timestamp_ms) + + +class ArrowTableLedgerEntry(LedgerEntry[pa.Table]): + """Ledger entry based upon an Arrow Table.""" + + def __init__( + self, timestamp_ms: int, name: str, data: pa.Table, index_columns: Sequence[str] + ) -> None: + super().__init__(timestamp_ms, name, data) + self.index_columns: list[str] = list(index_columns) + + def __repr__(self) -> str: + return f"ArrowTableLedgerEntry(timestamp_ms={self.timestamp_ms}, index_columns={self.index_columns}):\n{self.data}" + + def consolidate_with( + self, other: ArrowTableLedgerEntry, allow_duplicates: bool + ) -> ArrowTableLedgerEntry: + + assert (self.timestamp_ms, self.name) < (other.timestamp_ms, other.name) + assert schemas_equal(self.data.schema, other.data.schema) + assert self.index_columns == other.index_columns + + earliest, latest = self, other + + if allow_duplicates: + combined_table = pa.concat_tables((earliest, latest)) + else: + if len(earliest.data) == 0: + combined_table = 
latest.data + elif len(latest.data) == 0: + combined_table = earliest.data + else: + schema = self.data.schema + latest_indexed = latest.to_pandas().set_index(self.index_columns) + earliest_indexed = earliest.to_pandas().set_index(self.index_columns) + + # Table.from_pandas attempts to infer nulled values (e.g., NaN->null, NaT->null). + # We do not want this behavior, so explicitly override it with `from_pandas=False` + combined_table = pa.Table.from_pydict( + { + k: pa.array(v, from_pandas=False) + for k, v in combine_first(latest_indexed, earliest_indexed) + .reset_index() + .items() + }, + schema=schema, + ) + + return ArrowTableLedgerEntry( + timestamp_ms=latest.timestamp_ms, + data=combined_table, + name="consolidated", + index_columns=self.index_columns, + ) + + def to_pandas(self) -> pd.DataFrame: + return self.data.to_pandas(ignore_metadata=True) + + def to_table(self) -> pa.Table: + return self.data + + +class ArrowTensorLedgerEntry(LedgerEntry[pa.Tensor]): + """Ledger entry based upon an Arrow Tensor.""" + + def __init__( + self, + timestamp_ms: int, + name: str, + data: pa.Tensor, + ) -> None: + super().__init__(timestamp_ms, name, data) + + def __repr__(self) -> str: + return f"ArrowTensorLedgerEntry(timestamp_ms={self.timestamp_ms}):\n{self.data}" + + def consolidate_with( + self, other: ArrowTensorLedgerEntry, allow_duplicates: bool + ) -> ArrowTensorLedgerEntry: + assert not allow_duplicates, "Unsupported" + assert (self.timestamp_ms, self.name) < (other.timestamp_ms, other.name) + return other + + def to_tensor(self) -> pa.Tensor: + return self.data + + def to_numpy(self) -> np.ndarray: + return self.data.to_numpy() + + +def combine_first(first: pd.DataFrame, second: pd.DataFrame) -> pd.DataFrame: + """Combine dataframes - similar to pandas.DataFrame.combine_first, + except fixes pandas#60128 and ignores NA values (they are copied as is). + + NB: the two dataframes MUST have the same structure, and we aren't + too careful about checking for that. 
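+    Values from `first` win for index labels present in both frames; rows whose
+    index appears only in `second` are appended.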
+ """ + + assert first.columns.equals(second.columns) + assert first.dtypes.equals(second.dtypes) + assert first.index.nlevels == second.index.nlevels + + new_index = first.index.union(second.index) + new_data = {} + for col in first.columns: + first_series = first[col] + second_series = second[col] + + keep_second_index = second.index.difference(first.index) + keep_first_index = first.index + + first_series = first_series.reindex(keep_first_index, copy=False) + second_series = second_series.reindex(keep_second_index, copy=False) + + if first_series.dtype.kind == "M" and second_series.dtype.kind == "M": + second_series = pd.to_datetime(second_series) + + combined_series = pd.concat([first_series, second_series]) + combined_series = combined_series.reindex(new_index, copy=False) + + new_data[col] = combined_series + + return pd.DataFrame(new_data, index=new_index) diff --git a/apis/python/tests/ht/conftest.py b/apis/python/tests/ht/conftest.py new file mode 100644 index 0000000000..6e26a7808e --- /dev/null +++ b/apis/python/tests/ht/conftest.py @@ -0,0 +1,67 @@ +from typing import Any + +import hypothesis as ht +import hypothesis.extra.numpy as ht_np +import hypothesis.strategies as st +import pyarrow as pa +import pytest + +import tiledbsoma as soma + +from tests.ht._ht_test_config import HT_TEST_CONFIG +from tests.ht._ht_util import ( + arrow_array_fast, + arrow_chunked_array_fast, + arrow_datatypes, + arrow_shape, +) + + +@pytest.fixture(scope="class") +def make_tmp_dir(request, tmp_path_factory) -> None: + """Set a class variable - useful for Hypothesis RuleBasedStateMachine test objects.""" + request.cls.tmp_path_factory = tmp_path_factory + + +@pytest.fixture +def ht_test_config() -> dict[str, Any]: + return HT_TEST_CONFIG + + +@pytest.fixture +def concurrency() -> int | None: + return None + + +@pytest.fixture +def context(concurrency: int | None) -> soma.SOMATileDBContext: + if concurrency is None: + return soma.SOMATileDBContext() + else: + return soma.SOMATileDBContext( + tiledb_config={"soma.compute_concurrency_level": f"{concurrency}"} + ) + + +# Register Hypothesis strategies for use with `strategies.from_type()` +st.register_type_strategy(pa.DataType, arrow_datatypes()) +st.register_type_strategy( + pa.Array, + arrow_array_fast( + dtype=ht_np.array_dtypes(), + shape=arrow_shape(shape=st.integers(min_value=0, max_value=2047)), + ), +) +st.register_type_strategy( + pa.ChunkedArray, + arrow_chunked_array_fast( + dtype=ht_np.array_dtypes(), + shape=arrow_shape(shape=st.integers(min_value=0, max_value=4095)), + ), +) +# TODO: vary context configuration? 
+st.register_type_strategy(soma.SOMATileDBContext, st.just(soma.SOMATileDBContext())) + + +# Register hypothesis profile for extensive/expensive test runs +ht.settings.register_profile("expensive", max_examples=10000) diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py new file mode 100644 index 0000000000..431e104882 --- /dev/null +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -0,0 +1,517 @@ +"""Hypothesis tests for SOMADataFrame.""" + +from __future__ import annotations + +from itertools import pairwise +from typing import Any, Sequence + +import numpy as np +import pyarrow as pa +import pytest +from hypothesis import strategies as st +from hypothesis.extra import numpy as ht_np +from hypothesis.extra import pandas as ht_pd +from hypothesis.stateful import initialize, invariant, precondition, rule + +import tiledbsoma as soma + +from tests.ht._array_state_machine import SOMAArrayStateMachine +from tests.ht._ht_test_config import HT_TEST_CONFIG +from tests.ht._ht_util import ( + arrow_schema, + df_to_table, + from_datatype, + pad_array, + schemas_equal, + splitss, + tables_equal, +) +from tests.ht._ledger import ArrowTableLedgerEntry, Ledger, get_entries + +# Only a subset of Arrow types are allowed as an indexed column (TileDB dimension) +DataFrameIndexTypes = [ + pa.int8(), + pa.uint8(), + pa.int16(), + pa.uint16(), + pa.int32(), + pa.uint32(), + pa.int64(), + pa.uint64(), + pa.float32(), + pa.float64(), + pa.binary(), + pa.large_binary(), + pa.string(), + pa.large_string(), + pa.timestamp("s"), + pa.timestamp("ms"), + pa.timestamp("us"), + pa.timestamp("ns"), +] + +AxisDomain = None | tuple[Any, Any] | list[Any] +Domain = Sequence[AxisDomain] + + +@st.composite +def dataframe_schema(draw: st.DrawFn) -> tuple[Sequence[str], pa.Schema]: + """Strategy will generate a legal DataFrame schema and accompanying index names. + + Will comply with SOMA/TileDB conventions: + * index columns must not be nullable + * schema field order must start with index colum names and be in same order + * must contain a `soma_joinid` column + * must have at least two columns, one indexed, one not indexed + """ + + # initial schema draw + schema = draw( + arrow_schema( + required_fields=(pa.field("soma_joinid", pa.int64(), nullable=False),), + unique_field_names=True, + elements=st.from_type(pa.DataType).filter( + lambda t: ( + pa.types.is_primitive(t) + and not (pa.types.is_timestamp(t) and t.tz is not None) + and not pa.types.is_time(t) + and not pa.types.is_date(t) + and t + not in [ + pa.float16(), + ] + ) + ), + ) + ) + assert len(schema) > 1 + + # randomly choose index columns + if draw(st.booleans()): + # common choice; treat as such + index_column_names = ("soma_joinid",) + else: + # find candidate fields to be indexed, and select random subset + candidate_index_fields = [ + f.name + for f in schema + if f.type in DataFrameIndexTypes + and not f.name.startswith( + "." + ) # Arrow compute functions choke on table columns beginning with '.' 
+ ] + assert len(candidate_index_fields) > 0 # at least one index must exist + n_indices = draw( + st.integers( + min_value=1, max_value=min(len(candidate_index_fields), len(schema) - 1) + ) + ) + rng = np.random.default_rng(seed=draw(st.integers(min_value=0))) + index_column_names = tuple( + candidate_index_fields[i] + for i in rng.choice( + len(candidate_index_fields), size=n_indices, replace=False + ) + ) + # TileDB dimensions may not be nullable, so just rewrite those we have selected + for name in index_column_names: + idx = schema.get_field_index(name) + schema = schema.set(idx, schema.field(idx).with_nullable(False)) + + # reorder schema to match index_column_names for ease of read eq tests + reordered_fields = [schema.field(name) for name in index_column_names] + [ + f for f in schema if f.name not in index_column_names + ] + schema = pa.schema(reordered_fields) + + assert len(schema) > 1 + assert len(index_column_names) > 0 + assert len(index_column_names) < len(schema) + + return index_column_names, schema + + +def default_max_domain(datatype: pa.DataType) -> AxisDomain: + """Return the accepted default for the domain of a given Arrow DataType. + + NB: + * there are bugs that prescribe some values (noted inline), e.g. sc-61331 + """ + if datatype in [pa.string(), pa.large_string(), pa.binary(), pa.large_binary()]: + return ("", "") + if pa.types.is_floating(datatype): + dtype = datatype.to_pandas_dtype() + return (np.finfo(dtype).min, np.finfo(dtype).max) + if pa.types.is_integer(datatype): + dtype = datatype.to_pandas_dtype() + md = ( + np.iinfo(dtype).min, + np.iinfo(dtype).max - 2, # sc-61331 - can't use entire range(!). + ) + # Also, sc-61334, which has different limit for create() than change_domain(). + # Seemingly only affects int64. + if dtype in [np.int16, np.int32, np.int64, np.uint16, np.uint32, np.uint64]: + md = (md[0], md[1] - 2048) + + return md + if pa.types.is_timestamp(datatype): + # return Numpy! See sc-61328 and sc-61329 + return ( + np.datetime64( + -(2**63) + 1, datatype.unit + ), # NB: -2**63 is NaT, per NEP-7, and indices can't be nullable + np.datetime64( + 2**63 - 1_000_001, datatype.unit + ), # sc-61331: 1_000_001 appears to be a weird buggy magic number? + ) + + raise ValueError("Unsupported type.") + + +@st.composite +def dataframe_domain( + draw: st.DrawFn, + *, + schema: pa.Schema, + index_column_names: Sequence[str], + max_domain: Domain | None = None, + current_domain: Domain | None = None, +) -> Domain: + """Strategy to generate DataFrame domains. + + If current_domain specified, will never shrink. Will not exceed max_domain. + + NB: + * domain can't be set for string or binary index columns - use None or ('',''). + * domain can only expand. 
+ * timestamp64 domain must be specified as a numpy.datetime64 (see sc-61328 and sc-61329) + * all other domain values must be native python types, not pyarrow.Scalar + """ + if max_domain is None: + max_domain = tuple( + default_max_domain(schema.field(n).type) for n in index_column_names + ) + assert len(index_column_names) == len(max_domain) + new_domain = [] + for field_index, field_name in enumerate(index_column_names): + field = schema.field(field_name) + if not pa.types.is_primitive(field.type): + new_domain.append(None) # i.e., noop, use default + else: + zero = ( + np.datetime64(0, field.type.unit) + if pa.types.is_timestamp(field.type) + else pa.scalar(0, type=field.type).as_py() + ) + max_lower, max_upper = max_domain[field_index] + if field_name == "soma_joinid": + max_lower = max(0, max_lower) # per SOMA spec + current_lower, current_upper = ( + current_domain[field_index] + if current_domain is not None + else (zero, zero) + ) + lower = ( + draw( + from_datatype( + field.type, + min_value=max_lower, + max_value=current_lower, + allow_nan=False, + ) + ) + if current_lower is None or draw(st.booleans()) + else current_lower + ) + upper = ( + draw( + from_datatype( + field.type, + min_value=current_upper, + max_value=max_upper, + allow_nan=False, + ) + ) + if current_upper is None or draw(st.booleans()) + else current_upper + ) + + # timestamp64 columns only accept np.datetime64 for domain (see sc-61328 and sc-61329) + # In addition, pa.TimestampScalar overflows in a variety of situations, so don't use it + # (e.g., `pa.scalar(-161650356352888167,type=pa.timestamp('s')).as_py()` ) + if pa.types.is_timestamp(field.type): + lower = ( + np.datetime64(lower.value, field.type.unit) + if isinstance(lower, pa.TimestampScalar) + else lower + ) + upper = ( + np.datetime64(upper.value, field.type.unit) + if isinstance(upper, pa.TimestampScalar) + else upper + ) + else: + lower = lower.as_py() if isinstance(lower, pa.Scalar) else lower + upper = upper.as_py() if isinstance(upper, pa.Scalar) else upper + + assert lower <= upper + assert max_lower <= lower <= current_lower + assert max_upper >= upper >= current_upper + new_domain.append((lower, upper)) + + assert len(new_domain) == len(index_column_names) + return tuple(new_domain) + + +@st.composite +def arrow_table( + draw: st.DrawFn, + schema: pa.Schema, + index_column_names: Sequence[str], + domain: Domain, + *, + min_size: int | None = None, +) -> pa.Table: + """Strategy to generate Arrow Tables which: + * match the schema + * have unique values in the index columns + * have values within the domain for the index columns + """ + index_domains = {k: v for k, v in zip(index_column_names, domain)} + columns = [] + for field in schema: + name = field.name + dtype = np.dtype(field.type.to_pandas_dtype()) + unique = name in index_column_names or name == "soma_joinid" + elements = None + + min_value, max_value = index_domains.get(name, (None, None)) + assert name in index_domains or (min_value is None and max_value is None) + + if pa.types.is_timestamp(field.type): + # don't generate NaT. ht_np.from_dtype doesn't obey min/max value + # params, so draw ints, and then convert. NEB-7 says NaT is -2**63. 
+ min_value = ( + -(2**63) + 1 + if min_value is None + else max(-(2**63) + 1, int(min_value.astype(np.int64))) + ) + max_value = ( + 2**63 - 1 + if max_value is None + else min(2**63 - 1, int(max_value.astype(np.int64))) + ) + elements = st.builds( + dtype.type, + st.integers(min_value=min_value, max_value=max_value), + st.just(field.type.unit), + ) + + elif pa.types.is_primitive(field.type): + elements = ht_np.from_dtype(dtype, min_value=min_value, max_value=max_value) + # Array dimensions do not de-dup -0. and 0. as the same. Disable any generation + # of negative zero until this is resolved. NB: ledger de-dup treats them a equivalent + # per IEEE 754 semantics. + if HT_TEST_CONFIG["sc-61506_workaround"] and pa.types.is_floating( + field.type + ): + elements = elements.filter(lambda x: not (x == 0 and np.signbit(x))) + + # else, use default + + columns.append( + ht_pd.column(name=name, dtype=dtype, unique=unique, elements=elements) + ) + + df = draw( + ht_pd.data_frames(columns=columns, index=ht_pd.range_indexes(min_size=min_size)) + ) + assert min_size is None or len(df) >= min_size + tbl = df_to_table(df, schema=schema) + assert schemas_equal(schema, tbl.schema) + if len(tbl) == 0: + return tbl + + # split, sometimes + if ( + len(tbl) > 3 + and draw(st.booleans()) + and not HT_TEST_CONFIG["sc-61239_workaround"] + ): + n_splits = draw(st.integers(min_value=0, max_value=max(0, len(tbl) // 10))) + if n_splits > 0: + split_points = draw(splitss(n_splits=n_splits, max_value=len(tbl))) + split_points = [0] + split_points + [len(tbl)] + tbl = pa.concat_tables([tbl[st:sp] for st, sp in pairwise(split_points)]) + + # pad, sometimes + if draw(st.booleans()) and not HT_TEST_CONFIG["sc-61239_workaround"]: + batches = tbl.to_batches() + batch_to_pad = draw(st.integers(min_value=0, max_value=len(batches) - 1)) + batch_arrays = [ + pad_array(arr.to_numpy(zero_copy_only=(arr.type != pa.bool_())), draw) + for arr in batches[batch_to_pad].columns + ] + batches[batch_to_pad] = pa.RecordBatch.from_arrays( + batch_arrays, schema=tbl.schema + ) + tbl = pa.Table.from_batches(batches) + + return tbl + + +class SOMADataFrameStateMachine(SOMAArrayStateMachine): + + def __init__(self) -> None: + super().__init__() + + @initialize(data=st.data(), index_cols_and_schema=dataframe_schema()) + def setup( + self, + data: st.DataObject, + index_cols_and_schema: tuple[Sequence[str], pa.Schema], + ) -> None: + self.index_column_names, self.schema = index_cols_and_schema + self.domain = data.draw( # TODO XXX: should be a ledger + dataframe_domain( + schema=self.schema, index_column_names=self.index_column_names + ) + ) + super().setup( + soma.DataFrame.create( + self.uri, + schema=self.schema, + domain=self.domain, + index_column_names=self.index_column_names, + context=self.context, + tiledb_timestamp=None, # TODO: no time-travel for now + ) + ) + self.domain = self.A.domain + assert not self.A.closed + assert self.A.mode == "w" + assert schemas_equal(self.schema, self.A.schema, ignore_field_order=True) + + self.data_ledger = Ledger[ArrowTableLedgerEntry]( + initial_entry=ArrowTableLedgerEntry( + data=self.schema.empty_table(), + timestamp_ms=self.A.tiledb_timestamp_ms, + name="initial entry", + index_columns=self.index_column_names, + ), + allows_duplicates=False, + ) + + def _array_exists( + uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None + ) -> bool: + return soma.DataFrame.exists( + uri, context=context, tiledb_timestamp=tiledb_timestamp + ) + + def _array_open(self, *, mode: str, 
tiledb_timestamp: int | None = None) -> None: + self.A = soma.DataFrame.open( + self.uri, mode=mode, context=self.context, tiledb_timestamp=tiledb_timestamp + ) + + ## + ## --- schema + ## + + @precondition(lambda self: not self.closed) + @invariant() + def check_schema(self) -> None: + assert isinstance(self.A, soma.DataFrame) + assert self.A.soma_type == "SOMADataFrame" + assert schemas_equal(self.schema, self.A.schema, ignore_field_order=True) + assert sorted(self.schema.names) == sorted(self.A.keys()) + assert self.index_column_names == self.A.index_column_names + + ## + ## --- domain + ## + + @precondition(lambda self: not self.closed) + @invariant() + def check_domain(self) -> None: + assert ( + self.A.domain == self.domain + ), f"Unexpected domain in {self.A}: had {self.A.domain}, expected {self.domain}" + + @precondition(lambda self: self.closed or self.mode == "w") + @rule(data=st.data()) + def expand_domain(self, data: st.DataObject) -> None: + assert self.index_column_names == self.A.index_column_names + new_domain = data.draw( + dataframe_domain( + schema=self.schema, + index_column_names=self.index_column_names, + current_domain=self.domain, + max_domain=self.A.maxdomain, + ) + ) + if self.closed: + self._open(mode="w") + assert self.mode == "w" + + self.A.change_domain(new_domain) + self.domain = new_domain # TODO XXX should be a ledger + self._close() # domain is committed upon close + + ## + ## --- data + ## + + @precondition(lambda self: not self.closed and self.mode == "r") + @invariant() + def check_read_all(self) -> None: + timestamp_ms = self.A.tiledb_timestamp_ms + sort_order = [(name, "ascending") for name in self.index_column_names] + expected = ( + self.data_ledger.read(timestamp_ms=timestamp_ms) + .to_table() + .sort_by(sort_order) + ) + found = self.A.read().concat().sort_by(sort_order) + assert tables_equal( + found, expected, equal_nan=True + ), f"{found}\n is not equal to {expected}" + + @precondition(lambda self: not self.closed and self.mode == "r") + @invariant() + def check_count(self) -> None: + expected = len( + self.data_ledger.read(timestamp_ms=self.A.tiledb_timestamp_ms).to_table() + ) + assert expected == self.A.count, "count mismatch" + + @precondition(lambda self: not self.closed and self.mode == "w") + @precondition( + lambda self: self.A.tiledb_timestamp_ms not in self.data_ledger.timestamps + ) # only one write per timestamp until sc-61223 and sc-61226 are fixed + @rule(data=st.data()) + def write(self, data: st.DataObject) -> None: + df_tbl = data.draw( + arrow_table(self.schema, self.index_column_names, self.domain, min_size=1) + ) + fragments_before_write = get_entries(f"{self.uri}/__fragments") + self.A.write(df_tbl) + new_fragments = set(get_entries(f"{self.uri}/__fragments")) - set( + fragments_before_write + ) + assert len(new_fragments) == ( + 1 if not HT_TEST_CONFIG["sc-61462_workaround"] else df_tbl[0].num_chunks + ) + self.data_ledger.write( + ArrowTableLedgerEntry( + timestamp_ms=self.A.tiledb_timestamp_ms, + name=new_fragments.pop(), + data=df_tbl, + index_columns=self.index_column_names, + ) + ) + + +TestSOMADataFrame = pytest.mark.usefixtures("make_tmp_dir")( + SOMADataFrameStateMachine.TestCase +) diff --git a/apis/python/tests/ht/test_ht_densendarray.py b/apis/python/tests/ht/test_ht_densendarray.py new file mode 100644 index 0000000000..b4e7c52416 --- /dev/null +++ b/apis/python/tests/ht/test_ht_densendarray.py @@ -0,0 +1,304 @@ +"""Hypothesis tests for DenseNDArray.""" + +from __future__ import annotations + +from typing 
import Any + +import hypothesis as ht +import hypothesis.extra.numpy as ht_np +import numpy as np +import pyarrow as pa +import pytest +from hypothesis import strategies as st +from hypothesis.stateful import ( + initialize, + invariant, + precondition, + rule, +) + +import tiledbsoma +import tiledbsoma as soma +import tiledbsoma._sparse_nd_array + +from tests.ht._array_state_machine import SOMANDArrayStateMachine +from tests.ht._ht_test_config import HT_TEST_CONFIG +from tests.ht._ht_util import ndarray_datatype +from tests.ht._ledger import ArrowTensorLedgerEntry, Ledger, get_entries + + +@st.composite +def dense_array_shape( + draw: st.DrawFn, + *, + min_shape: tuple[int, ...] | None = None, + max_shape: tuple[int, ...] | None = None, +) -> tuple[int | None, ...]: + """Strategy to generate nd array shapes.""" + + MAX_DIMS = 3 + if min_shape is not None: + ndim = len(min_shape) + elif max_shape is not None: + ndim = len(max_shape) + else: + ndim = draw(st.integers(min_value=1, max_value=MAX_DIMS)) + + min_values = [1] * ndim if min_shape is None else min_shape + # to keep the array under some number of elements max, use the nth root as max per-dim size + MAX_ELEM = 2**20 # 1M + shape_limit = int(MAX_ELEM ** (1 / ndim)) + if max_shape is None: + max_values = [shape_limit] * ndim + else: + max_values = [min(shape_limit, s) for s in max_shape] + + elements = [ + draw( + st.integers(min_value=min_values[i], max_value=max_values[i]) + if min_values[i] < max_values[i] + else st.just(min_values[i]) + ) + for i in range(ndim) + ] + new_shape = tuple(elements) + + if min_shape is not None: + assert len(new_shape) == len(min_shape) + assert new_shape >= tuple((s or 1) for s in min_shape) + + return new_shape + + +@st.composite +def dense_indices(draw: st.DrawFn, shape: tuple[int, ...]) -> tuple[int | slice]: + """Strategy to return DenseNDArray slicing, which currently allows: + * None - synonym for slice(None) + * slice - with step == 1 ONLY + * int - a single integer coord + """ + + def one_dim(s: int) -> int | slice: + if draw(st.booleans()): + return draw(st.integers(min_value=0, max_value=s)) + else: + element = st.integers(min_value=0, max_value=s) + a, b = draw(element), draw(element) + a, b = (a, b) if a <= b else (b, a) + if a == 0 and b == s and draw(st.booleans()): + return slice(None) + return slice(a, b, None) + + return tuple(one_dim(s) for s in shape) + + +DEFAULT_FILL_VALUE = { + pa.int8(): -127, + pa.int16(): -32767, + pa.int32(): -2147483647, + pa.int64(): -9223372036854775807, + pa.uint8(): 2**8 - 1, + pa.uint16(): 2**16 - 1, + pa.uint32(): 2**32 - 1, + pa.uint64(): 2**64 - 1, + pa.float32(): np.nan, + pa.float64(): np.nan, + pa.bool_(): False, + pa.timestamp("s"): "NaT", + pa.timestamp("ms"): "NaT", + pa.timestamp("us"): "NaT", + pa.timestamp("ns"): "NaT", +} + + +def fill_value_for_type(type: pa.DataType) -> Any: + if type not in DEFAULT_FILL_VALUE: + raise ValueError("Unsupported type (do not know default fill)") + + return DEFAULT_FILL_VALUE[type] + + +def densendarray_datatype() -> ht.SearchStrategy[pa.DataType]: + # Arrow Tensor doesn't support bool_ or timestamp, and that is the only + # read accessor we have. So for now, don't test those types. 
+ if HT_TEST_CONFIG["sc-61743_workaround"]: + return ndarray_datatype().filter( + lambda t: t != pa.bool_() and not pa.types.is_timestamp(t) + ) + + return ndarray_datatype() + + +class SOMADenseNDArrayStateMachine(SOMANDArrayStateMachine): + + def __init__(self) -> None: + super().__init__(shapes_factory=dense_array_shape) + + @initialize(type=densendarray_datatype(), shape=dense_array_shape()) + def setup(self, type, shape) -> None: + super().setup( + type, + shape, + soma.DenseNDArray.create( + self.uri, + type=type, + shape=shape, + context=self.context, + tiledb_timestamp=None, # TODO: no time-travel for now + ), + ) + + # Initial state of dense ndarray should be completely filled with the + # default TilDB fill value + initial_array_state = pa.Tensor.from_numpy( + np.full( + self.shape, + fill_value_for_type(self.type), + dtype=self.type.to_pandas_dtype(), + ) + ) + assert initial_array_state.shape == self.shape + assert initial_array_state.type == self.type + + # TODO: due to sc-61676, reads return incorrect results for any portion + # of the array that has not be explicitly written. Hack around by explicitly + # writing fill values, AND disabling any resize operations. + if HT_TEST_CONFIG["sc-61676_workaround"]: + self.A.write(tuple(slice(0, n) for n in self.shape), initial_array_state) + self._close() + + self.data_ledger = Ledger[ArrowTensorLedgerEntry]( + initial_entry=ArrowTensorLedgerEntry( + data=initial_array_state, + timestamp_ms=self.A.tiledb_timestamp_ms, + name="initial entry", + ), + allows_duplicates=False, + ) + + def _array_exists( + uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None + ) -> bool: + return soma.DenseNDArray.exists( + uri, context=context, tiledb_timestamp=tiledb_timestamp + ) + + def _array_open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None: + self.A = soma.DenseNDArray.open( + self.uri, mode=mode, context=self.context, tiledb_timestamp=tiledb_timestamp + ) + + ## + ## --- schema + ## + @precondition(lambda self: not self.closed) + @invariant() + def check_pytypes(self) -> None: + assert isinstance(self.A, soma.DenseNDArray) + assert self.A.soma_type == "SOMADenseNDArray" + assert not self.A.is_sparse + + # XXX temporarily override this so we can disable any reshapes (sc-61676). + # If we allow reshapes, read/write tests fail due to the bug. + # TODO: remove this code (let base class do its thing) when this bug is fixed. + @precondition(HT_TEST_CONFIG["sc-61676_workaround"]) + def expand_shape(self, data: st.DataObject) -> None: + return + + ## + ## --- data + ## + @precondition(lambda self: not self.closed and self.mode == "r") + # sc-61920 -- while the API accepts `auto` as a result_order, the read result + # is then nondeterministic. 
For now, don't do `auto` + @rule(result_order=st.sampled_from(["row-major", "column-major"])) + def check_read_all(self, result_order: str) -> None: + tensor = self.A.read(result_order=result_order) + expected = self.data_ledger.read( + timestamp_ms=self.A.tiledb_timestamp_ms + ).to_tensor() + if result_order != "row-major": + expected = pa.Tensor.from_numpy(expected.to_numpy().T) + + assert self.type == tensor.type == expected.type + assert tensor.shape == expected.shape + if result_order != "row-major": + assert tuple(reversed(tensor.shape)) == self.shape + else: + assert tensor.shape == self.shape + assert np.array_equal(tensor.to_numpy(), expected.to_numpy(), equal_nan=True) + + @precondition(lambda self: not self.closed and self.mode == "r") + @rule(data=st.data()) + def check_read_indexed(self, data: st.DataObject) -> None: + inclusive_shape = tuple(s - 1 for s in self.shape) + coords = data.draw(dense_indices(inclusive_shape)) + tensor = self.A.read(coords=coords).to_numpy() + assert self.type.to_pandas_dtype() == tensor.dtype + + subslc = tuple( + ( + slice(c.start, c.stop + 1 if c.stop is not None else None) + if isinstance(c, slice) + else slice(c, c + 1) + ) + for c in coords + ) + expected = self.data_ledger.read( + timestamp_ms=self.A.tiledb_timestamp_ms + ).to_numpy()[subslc] + assert tensor.shape == expected.shape + assert tensor.dtype == expected.dtype + assert np.array_equal(tensor, expected, equal_nan=True) + + @precondition(lambda self: not self.closed and self.mode == "w") + @precondition( + lambda self: self.A.tiledb_timestamp_ms not in self.data_ledger.timestamps + ) # only one write per timestamp until sc-61223 and sc-61226 are fixed + @rule(data=st.data()) + def write(self, data: st.DataObject) -> None: + + # draw sub-array + ndim = len(self.shape) + first = tuple( + data.draw(st.integers(min_value=0, max_value=s - 1)) for s in self.shape + ) + second = tuple( + data.draw(st.integers(min_value=0, max_value=s - 1)) for s in self.shape + ) + top_left = tuple(min(first[i], second[i]) for i in range(ndim)) + bot_right = tuple(max(first[i], second[i]) + 1 for i in range(ndim)) + coords = tuple(slice(top_left[i], bot_right[i]) for i in range(ndim)) + subarray = data.draw( + ht_np.arrays( + self.type.to_pandas_dtype(), + shape=tuple(bot_right[i] - top_left[i] for i in range(ndim)), + ) + ) + + # Write sub-array to the SOMA array + fragments_before_write = get_entries(f"{self.uri}/__fragments") + self.A.write(coords, pa.Tensor.from_numpy(subarray)) + new_fragments = set(get_entries(f"{self.uri}/__fragments")) - set( + fragments_before_write + ) + assert len(new_fragments) == 1 + + # Save write in the ledger. The tensor ledger expects the "entire" array value, + # not a differential value (i.e., it currently does not consolidate). 
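+        # To construct that full value: read the latest ledger state for this
+        # timestamp, overlay the just-written sub-array at `coords`, and record the
+        # merged result as the expected array contents for later reads.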
+ merged_array = self.data_ledger.read( + timestamp_ms=self.A.tiledb_timestamp_ms + ).to_numpy() + merged_array[coords] = subarray + self.data_ledger.write( + ArrowTensorLedgerEntry( + timestamp_ms=self.A.tiledb_timestamp_ms, + name=new_fragments.pop(), + data=pa.Tensor.from_numpy(merged_array), + ) + ) + + +TestSOMADenseNDArray = pytest.mark.usefixtures("make_tmp_dir")( + SOMADenseNDArrayStateMachine.TestCase +) diff --git a/apis/python/tests/ht/test_ht_fastercsx.py b/apis/python/tests/ht/test_ht_fastercsx.py new file mode 100644 index 0000000000..03d80439c2 --- /dev/null +++ b/apis/python/tests/ht/test_ht_fastercsx.py @@ -0,0 +1,454 @@ +"""Hypothesis tests for fastercsx module.""" + +from typing import Any, Literal, TypeAlias + +import numpy as np +import numpy.typing as npt +import pyarrow as pa +import pytest +import scipy.sparse as sparse +from hypothesis import given, settings +from hypothesis import strategies as st + +import tiledbsoma as soma +import tiledbsoma._fastercsx as fastercsx +import tiledbsoma.pytiledbsoma.fastercsx as clib_fastercsx + +from tests.ht._ht_util import ( + arrow_array, + arrow_array_fast, + contiguous_slices, + random_length_tuple, + resolve_dtype, + split_arrow_array, + splitss, +) + +# Supported types, i.e., these should work fine +CooIndexTypes = sorted(np.dtype(t) for t in (np.int32, np.int64)) +CsxIndexTypes = sorted(np.dtype(t) for t in (np.int32, np.int64, np.uint16, np.uint32)) +ValueTypes = sorted( + np.dtype(t) + for t in ( + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float32, + np.float64, + ) +) + +NDArrayIndex: TypeAlias = npt.NDArray[np.integer[Any]] +NDArrayNumber: TypeAlias = npt.NDArray[np.integer[Any] | np.floating[Any]] + + +def limit_value_range_element_strategy( + dtype: np.dtype, divisor: float +) -> dict[str, Any] | None: + if dtype.kind == "f": + info = np.finfo(dtype) + return {"min_value": -info.max / divisor, "max_value": info.max / divisor} + if dtype.kind in ["i", "u"]: + info = np.iinfo(dtype) + return {"min_value": info.min // divisor, "max_value": info.max // divisor} + return None + + +@st.composite +def coo_ijd( + draw: st.DrawFn, + dtype: npt.DTypeLike | pa.DataType | st.SearchStrategy[npt.DTypeLike | pa.DataType], + shape: tuple[int, int] | st.SearchStrategy[tuple[int, int]], + *, + density: float | st.SearchStrategy[float] = 0.01, + unique: bool = False, +) -> tuple[ + tuple[npt.NDArray[Any], ...], + tuple[npt.NDArray[Any], ...], + tuple[npt.NDArray[Any], ...], +]: + + dtype = resolve_dtype(draw, dtype) + shape = draw(shape) if isinstance(shape, st.SearchStrategy) else shape + assert isinstance(shape, tuple) and len(shape) == 2 + + density = draw(density) if isinstance(density, st.SearchStrategy) else density + assert isinstance(density, float) and 0 < density <= 1 + + nnz = int(shape[0] * shape[1] * density) + coord_dtype = draw(st.sampled_from(CooIndexTypes)) + + """ + if not unique, we need to be cognizant of the potential to overflow + when duplicates are summed (the default behavior for `to_scipy`). This + can easily cause some types to overflow, and others to lose precision, + which makes equality comparisons tricky. + + To avoid this, ONLY when `not unique`, constrain the range of generated + values to a very limited range (currently 1/128th of the full range). + This is extremely unlikely to overflow as it would require 128 identical + coordinates to be drawn. + + In the case of `unique`, draw from the full range for the type. 
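+
+    (As a concrete sense of the overflow risk above: summing just two int8
+    duplicates of value 100 already wraps around to -56.)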
+ + Currently, the only edge case that fails to do the right thing is timestamp + generation (datetime64), as the underlying search strategy used does not + obey min_value/max_value for that type. TODO - FIXME. + """ + if not unique: + i = draw( + arrow_array_fast( + dtype=coord_dtype, + shape=nnz, + min_value=0, + max_value=shape[0] - 1, + ) + ) + j = draw( + arrow_array_fast( + dtype=coord_dtype, + shape=nnz, + min_value=0, + max_value=shape[1] - 1, + ) + ) + d = draw( + arrow_array( + dtype=dtype, + shape=nnz, + elements=limit_value_range_element_strategy(dtype, 128), + ) + ) + + else: + # draw unique points, then split into I/J + rng = np.random.default_rng(seed=draw(st.integers(min_value=0))) + points = rng.choice(shape[0] * shape[1], size=nnz, replace=False) + i, j = np.divmod(points, shape[1]) + i = pa.array(i, type=pa.from_numpy_dtype(coord_dtype)) + j = pa.array(j, type=pa.from_numpy_dtype(coord_dtype)) + d = draw(arrow_array(dtype=dtype, shape=nnz)) + + if draw(st.booleans()): + return (i.to_numpy(),), (j.to_numpy(),), (d.to_numpy(),) + + # else split into a chunked array + n_splits = draw(st.integers(min_value=0, max_value=max(0, len(d) // 10))) + split_points = draw(splitss(n_splits=n_splits, max_value=len(d))) + return ( + tuple(c.to_numpy() for c in split_arrow_array(i, split_points).chunks), + tuple(c.to_numpy() for c in split_arrow_array(j, split_points).chunks), + tuple(c.to_numpy() for c in split_arrow_array(d, split_points).chunks), + ) + + +@given( + do=st.data(), + value_dtype=st.sampled_from(ValueTypes), + unique=st.booleans(), + shape=st.tuples( + st.integers(min_value=0, max_value=1024), + st.integers(min_value=0, max_value=1024), + ), + context=st.from_type(soma.SOMATileDBContext), +) +@settings(max_examples=500) +def test_fastercsx_clib_compress_coo( + do: st.DataObject, + value_dtype: np.dtype, + unique: bool, + shape: tuple[int, int], + context: soma.SOMATileDBContext, +) -> None: + i, j, d = do.draw(coo_ijd(dtype=value_dtype, shape=shape, unique=unique)) + nnz = sum(len(c) for c in i) + assert nnz <= np.prod(shape) + index_dtype = do.draw( + st.sampled_from([t for t in CsxIndexTypes if np.iinfo(t).max >= nnz]) + ) + + indptr = np.empty(shape[0] + 1, dtype=index_dtype) + indices = np.empty(nnz, dtype=index_dtype) + data = np.empty(nnz, dtype=value_dtype) + clib_fastercsx.compress_coo( + context.native_context, shape, i, j, d, indptr, indices, data + ) + + # check with the oracle. 
Be careful if dups allowed, as summing dups + # in floats will be _approximately_ equal, not exactly equal + csr = sparse.csr_matrix( + (data, indices, indptr), shape=shape, dtype=value_dtype, copy=False + ) + if not unique: + csr.sum_duplicates() + csr.sort_indices() + + scipy_csr = sparse.csr_matrix( + (np.concatenate(d), (np.concatenate(i), np.concatenate(j))), + shape=shape, + dtype=value_dtype, + ) + + assert np.array_equal(csr.indptr, scipy_csr.indptr) + assert np.array_equal(csr.indices, scipy_csr.indices) + + # XXX the non-unique case has several issues: + # - dups are added, which can overflow + # - dups are added, which for floats may not be exactly eq in some situations + # XXX cleanup debug code + if not ( + np.allclose( + csr.data, + scipy_csr.data, + equal_nan=True if value_dtype.kind == "f" else False, + ) + if not unique + else np.array_equal( + csr.data, + scipy_csr.data, + equal_nan=True if value_dtype.kind == "f" else False, + ) + ): + print(csr.data.dtype, scipy_csr.data.dtype) + print(csr.data) + print(scipy_csr.data) + print(csr.data - scipy_csr.data) + assert ( + np.allclose( + csr.data, + scipy_csr.data, + equal_nan=True if value_dtype.kind == "f" else False, + ) + if not unique + else np.array_equal( + csr.data, + scipy_csr.data, + equal_nan=True if value_dtype.kind == "f" else False, + ) + ) + + +@given( + shape=random_length_tuple(elements=st.integers(), max_length=3), + i=random_length_tuple(st.from_type(npt.NDArray[Any]), max_length=4), + j=random_length_tuple(elements=st.from_type(npt.NDArray[Any]), max_length=4), + d=random_length_tuple(elements=st.from_type(npt.NDArray[Any]), max_length=4), + indptr=st.from_type(npt.NDArray[Any]), + indices=st.from_type(npt.NDArray[Any]), + data=st.from_type(npt.NDArray[Any]), + context=st.from_type(soma.SOMATileDBContext), +) +def test_fuzz_fastercsx_clib_compress_coo( + shape, + i: tuple[npt.NDArray[Any], ...], + j: tuple[npt.NDArray[Any], ...], + d: tuple[npt.NDArray[Any], ...], + indptr: npt.NDArray[Any], + indices: npt.NDArray[Any], + data: npt.NDArray[Any], + context: soma.SOMATileDBContext, +) -> None: + # TODO: exclude the rare case that would pass + with pytest.raises(Exception): + clib_fastercsx.compress_coo( + context.native_context, shape, i, j, d, indptr, indices, data + ) + + +@given( + indptr=st.from_type(npt.NDArray[Any]).filter( + lambda a: a.dtype not in CsxIndexTypes + ), + indices=st.from_type(npt.NDArray[Any]).filter( + lambda a: a.dtype not in CsxIndexTypes + ), + data=st.from_type(npt.NDArray[Any]).filter(lambda a: a.dtype not in ValueTypes), + context=st.from_type(soma.SOMATileDBContext), +) +@settings(max_examples=250) +def test_fuzz_fastercsx_clib_sort_csx_indices( + indptr: npt.NDArray[Any], + indices: npt.NDArray[Any], + data: npt.NDArray[Any], + context: soma.SOMATileDBContext, +) -> None: + # TODO: exclude the rare case that would pass + with pytest.raises(Exception): + clib_fastercsx.sort_csx_indices(context.native_context, indptr, indices, data) + + +@given( + major_idx_start=st.integers(), + major_idx_end=st.integers(), + shape=random_length_tuple(elements=st.integers(), max_length=3), + format=st.text(), + indptr=st.from_type(npt.NDArray[Any]), + indices=st.from_type(npt.NDArray[Any]), + data=st.from_type(npt.NDArray[Any]), + out=st.from_type(npt.NDArray[Any]), + context=st.from_type(soma.SOMATileDBContext), +) +@settings(max_examples=250) +def test_fuzz_fastercsx_clib_copy_csx_to_dense( + major_idx_start: int, + major_idx_end: int, + shape: tuple[int, int], + format: str, + indptr: 
npt.NDArray[Any], + indices: npt.NDArray[Any], + data: npt.NDArray[Any], + out: npt.NDArray[Any], + context: soma.SOMATileDBContext, +) -> None: + # TODO: exclude the rare case that would pass + with pytest.raises(Exception): + clib_fastercsx.copy_csx_to_dense( + context.native_context, + major_idx_start, + major_idx_end, + shape, + format, + indptr, + indices, + data, + out, + ) + + +@given( + do=st.data(), + value_dtype=st.sampled_from(ValueTypes), + unique=st.booleans(), + shape=st.tuples( + st.integers(min_value=0, max_value=1024), + st.integers(min_value=0, max_value=1024), + ), + make_sorted=st.booleans(), + format=st.sampled_from(["csc", "csr"]), + context=st.from_type(soma.SOMATileDBContext), +) +@settings(max_examples=500) +def test_fastercsx_from_ijd( + do: st.DataObject, + value_dtype: np.dtype, + unique: bool, + shape: tuple[int, int], + format: Literal["csc", "csr"], + make_sorted: bool, + context: soma.SOMATileDBContext, +) -> None: + i, j, d = do.draw(coo_ijd(dtype=value_dtype, shape=shape, unique=unique)) + assert all(a.dtype == value_dtype for a in d) + + cm = fastercsx.CompressedMatrix.from_ijd( + i, j, d, shape, format, make_sorted, context + ).to_scipy() + assert cm.dtype == value_dtype + + # compare to oracle + scipy_cm = sparse.coo_matrix( + (np.concatenate(d), (np.concatenate(i), np.concatenate(j))), + shape=shape, + dtype=value_dtype, + ).asformat(format) + assert scipy_cm.has_canonical_format + + if not make_sorted or not unique: + cm.sum_duplicates() + + assert np.array_equal(cm.indptr, scipy_cm.indptr) + assert np.array_equal(cm.indices, scipy_cm.indices) + + # XXX cleanup debug code + assert cm.data.dtype == scipy_cm.data.dtype + if not ( + np.allclose( + cm.data, scipy_cm.data, equal_nan=True if value_dtype.kind == "f" else False + ) + if not unique + else np.array_equal( + cm.data, scipy_cm.data, equal_nan=True if value_dtype.kind == "f" else False + ) + ): + print(cm.data.dtype, scipy_cm.data.dtype) + print(cm.data) + print(scipy_cm.data) + assert ( + np.allclose( + cm.data, scipy_cm.data, equal_nan=True if value_dtype.kind == "f" else False + ) + if not unique + else np.array_equal( + cm.data, scipy_cm.data, equal_nan=True if value_dtype.kind == "f" else False + ) + ) + + +@given( + do=st.data(), + value_dtype=st.sampled_from(ValueTypes), + unique=st.booleans(), + shape=st.tuples( + st.integers(min_value=0, max_value=1024), + st.integers(min_value=0, max_value=1024), + ), + make_sorted=st.booleans(), + format=st.sampled_from(["csc", "csr"]), + context=st.from_type(soma.SOMATileDBContext), +) +@settings(max_examples=500) +def test_fastercsx_to_scipy( + do: st.DataObject, + value_dtype: np.dtype, + unique: bool, + shape: tuple[int, int], + format: Literal["csc", "csr"], + make_sorted: bool, + context: soma.SOMATileDBContext, +) -> None: + i, j, d = do.draw(coo_ijd(dtype=value_dtype, shape=shape, unique=unique)) + cm = fastercsx.CompressedMatrix.from_ijd( + i, j, d, shape, format, make_sorted, context + ) + + # compare to oracle + scipy_cm = sparse.coo_matrix( + (np.concatenate(d), (np.concatenate(i), np.concatenate(j))), + shape=shape, + dtype=value_dtype, + ).asformat(format) + assert scipy_cm.has_canonical_format + + major_index_slice = do.draw(contiguous_slices(shape[0])) + + cm_slc = cm.to_scipy(major_index_slice) + if not make_sorted or not unique: + cm_slc.sum_duplicates() + assert cm_slc.has_canonical_format + + scipy_slc = ( + scipy_cm[major_index_slice] + if format == "csr" + else scipy_cm[:, major_index_slice] + ) + + assert 
np.array_equal(cm_slc.indptr, scipy_slc.indptr) + assert np.array_equal(cm_slc.indices, scipy_slc.indices) + assert ( + np.allclose( + cm_slc.data, + scipy_slc.data, + equal_nan=True if value_dtype.kind == "f" else False, + ) + if not unique + else np.array_equal( + cm_slc.data, + scipy_slc.data, + equal_nan=True if value_dtype.kind == "f" else False, + ) + ) diff --git a/apis/python/tests/ht/test_ht_indexer.py b/apis/python/tests/ht/test_ht_indexer.py new file mode 100644 index 0000000000..f37ec364ea --- /dev/null +++ b/apis/python/tests/ht/test_ht_indexer.py @@ -0,0 +1,150 @@ +"""Hypothesis tests for IntIndexer module.""" + +from typing import Any, List, Union + +import hypothesis as ht +import hypothesis.extra.numpy as ht_np +import numpy as np +import numpy.typing as npt +import pyarrow as pa +import pytest +from hypothesis import given, settings +from hypothesis import strategies as st + +import tiledbsoma as soma +from tiledbsoma import pytiledbsoma as clib + +from tests.ht._ht_util import ( + arrow_array_fast, + arrow_chunked_array_fast, + everything_except, +) + + +@given( + data=ht_np.arrays( + dtype=np.int64, + shape=ht_np.array_shapes(max_dims=1, max_side=127), + unique=True, + ), + context=st.one_of(st.from_type(soma.SOMATileDBContext), st.none()), +) +def test_IntIndexer_ndarray_lookup( + data: npt.NDArray[Any], context: soma.SOMATileDBContext +) -> None: + assert np.array_equal( + soma.IntIndexer(data=data, context=context).get_indexer(data), + np.arange(0, len(data), dtype=np.int64), + ) + + +@given( + data=st.one_of( + ( + arrow_array_fast( + np.int64, shape=st.integers(min_value=0, max_value=2047), unique=True + ), + arrow_chunked_array_fast( + dtype=np.int64, + shape=st.integers(min_value=0, max_value=1023), + splits=3, + unique=True, + ), + ) + ) +) +@settings( + max_examples=500, suppress_health_check=(ht.HealthCheck.function_scoped_fixture,) +) +def test_IntIndexer_arrow_lookup( + data: pa.ChunkedArray, context: soma.SOMATileDBContext +) -> None: + assert np.array_equal( + soma.IntIndexer(data=data, context=context).get_indexer(data), + np.arange(0, len(data), dtype=np.int64), + ) + + +@given(data=st.from_type(Union[np.ndarray[Any, Any], List[int]])) +@settings( + max_examples=500, + suppress_health_check=(ht.HealthCheck.function_scoped_fixture,), +) +def test_fuzz_IntIndexer( + data: npt.NDArray[Any], context: soma.SOMATileDBContext +) -> None: + if isinstance(data, list): + ht.assume(len(data) > 0 and any(not isinstance(x, int) for x in data)) + elif isinstance(data, np.ndarray): + ht.assume(not (data.ndim == 1 and data.dtype == np.int64)) + with pytest.raises(Exception): + soma.IntIndexer(data=data, context=context) + + +@given( + data=ht_np.arrays( + dtype=np.int64, + shape=ht_np.array_shapes(max_dims=1, max_side=127), + unique=True, + ), +) +@settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) +def test_pytiledbsoma_IntIndexer_map_locations( + data: npt.NDArray[np.int64], context: soma.SOMATileDBContext +) -> None: + indexer = clib.IntIndexer(context.native_context) + indexer.map_locations(data) + + +@given( + data=st.one_of( + ( + ht_np.arrays( + dtype=ht_np.array_dtypes(), shape=ht_np.array_shapes(), unique=True + ), + ht_np.arrays( + dtype=ht_np.array_dtypes(), shape=ht_np.array_shapes(), unique=False + ), + st.from_type(float | list | dict | str | bytearray), + ) + ) +) +@settings( + max_examples=250, suppress_health_check=(ht.HealthCheck.function_scoped_fixture,) +) +def test_fuzz_pytiledbsoma_IntIndexer_map_locations( + data: 
npt.NDArray[Any], context: soma.SOMATileDBContext +) -> None: + ht.assume( + (not isinstance(data, np.ndarray)) or data.dtype != np.int64 or data.ndim != 1 + ) + + indexer = clib.IntIndexer(context.native_context) + with pytest.raises(Exception): + indexer.map_locations(data) + + +@given( + data=st.one_of( + ( + ht_np.arrays( + dtype=ht_np.array_dtypes(), shape=ht_np.array_shapes(), unique=False + ), + everything_except(np.ndarray), + ) + ) +) +@settings( + max_examples=250, suppress_health_check=(ht.HealthCheck.function_scoped_fixture,) +) +def test_fuzz_pytiledbsoma_Indexer_get_indexer_general( + data: Any, context: soma.SOMATileDBContext +) -> None: + ht.assume( + (not isinstance(data, np.ndarray)) or data.dtype != np.int64 or data.ndim != 1 + ) + + indexer = clib.IntIndexer(context.native_context) + indexer.map_locations(np.arange(0, 100, dtype=np.int64)) + with pytest.raises(Exception): + indexer.get_indexer_general(data) diff --git a/apis/python/tests/ht/test_ht_sparsendarray.py b/apis/python/tests/ht/test_ht_sparsendarray.py new file mode 100644 index 0000000000..c56c5f9a3c --- /dev/null +++ b/apis/python/tests/ht/test_ht_sparsendarray.py @@ -0,0 +1,298 @@ +"""Hypothesis tests for SparseNDArray.""" + +from __future__ import annotations + +import datetime +import shutil +import typing +from typing import Any + +import hypothesis as ht +import numpy as np +import pyarrow as pa +import pytest +from hypothesis import given, settings +from hypothesis import strategies as st +from hypothesis.stateful import ( + initialize, + invariant, + precondition, + rule, +) + +import tiledbsoma +import tiledbsoma as soma +import tiledbsoma._sparse_nd_array + +from tests.ht._array_state_machine import SOMANDArrayStateMachine +from tests.ht._ht_util import ( + arrow_array, + ndarray_datatype, + posix_filename, + tables_equal, + tiledb_timestamps, +) +from tests.ht._ledger import ArrowTableLedgerEntry, Ledger, get_entries + + +@st.composite +def sparse_array_shape( + draw: st.DrawFn, + *, + min_shape: tuple[int, ...] | None = None, + max_shape: tuple[int, ...] | None = None, + allow_none: bool = False, +) -> tuple[int | None, ...]: + """Strategy to generate nd array shapes.""" + + MAX_DIMS = 7 + if min_shape is not None: + ndim = len(min_shape) + elif max_shape is not None: + ndim = len(max_shape) + else: + ndim = draw(st.integers(min_value=1, max_value=MAX_DIMS)) + + min_values = [1] * ndim if min_shape is None else min_shape + + # due to how we draw random coordinates in sparse_array() strategy, the product + # of the shape must fit in an int64. SOMA/TileDB has some weird internal restrictions + # that force the max of any one dimension to be a bit smaller. Set our per-dim + # limit to the min of the nth root of int64.max or the tiledb limit. 
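+    # (For ndim == 1 the TileDB cap of 2**63 - 2050 is the binding limit; for
+    # ndim >= 2 the nth root dominates, e.g. roughly 3.04e9 per dimension at ndim == 2.)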
+ shape_limit = min(int((2**63 - 1) ** (1 / ndim)), (2**63 - 2050)) + if max_shape is None: + max_values = [shape_limit] * ndim + else: + max_values = [min(shape_limit, s) for s in max_shape] + + if allow_none: + elements = [ + draw( + st.one_of( + st.none(), + st.integers(min_value=min_values[i], max_value=max_values[i]), + ) + ) + for i in range(ndim) + ] + else: + elements = [ + draw( + st.integers(min_value=min_values[i], max_value=max_values[i]) + if min_values[i] < max_values[i] + else st.just(min_values[i]) + ) + for i in range(ndim) + ] + + new_shape = tuple(elements) + + if min_shape is not None: + assert len(new_shape) == len(min_shape) + assert new_shape >= tuple((s or 1) for s in min_shape) + assert np.prod(new_shape) <= 2**63 - 1 + + return new_shape + + +@st.composite +def sparse_array( + draw: st.DrawFn, + shape: tuple[int, ...], + schema: pa.Schema, + *, + density: float | None = None, +) -> pa.Table: + """ + Draw a sparse array with ndim, in SOMA Table format, with types matching + schema, and dimensions within the given shape. + """ + MAX_NNZ = 1 * 1024**2 # caps memory use + + shape_prod = np.prod(shape) + if shape_prod == 0: + return schema.empty_table() + + if density is None: + max_density = MAX_NNZ / shape_prod if shape_prod > MAX_NNZ else 1.0 + assert 0 <= max_density <= 1 + density = draw(st.floats(min_value=0, max_value=max_density)) + + nnz = max(1, int(density * shape_prod)) + assert nnz <= MAX_NNZ, "Sparse array is too large." + + rng = np.random.default_rng(seed=draw(st.integers(min_value=0))) + coords = rng.choice(shape_prod, size=nnz, replace=False) + tbl_dict = {} + for n, n_len in reversed(list(enumerate(shape))): + coords, tbl_dict[f"soma_dim_{n}"] = np.divmod(coords, n_len) + assert np.all(coords == 0) + tbl_dict = dict(reversed(tbl_dict.items())) + + type = schema.field("soma_data").type + tbl_dict["soma_data"] = draw(arrow_array(type, shape=nnz)) + return pa.Table.from_pydict(tbl_dict, schema=schema) + + +@given( + uri=posix_filename(), + type=st.from_type(pa.DataType).filter( + lambda t: ( + pa.types.is_primitive(t) + and not (pa.types.is_timestamp(t) and t.tz is not None) + and not pa.types.is_time(t) + and not pa.types.is_date(t) + and t + not in [ + pa.float16(), + ] + ) + ), + shape=st.lists( + st.one_of(st.none(), st.integers(min_value=1, max_value=2**31 - 1)), + min_size=1, + max_size=10, + ), + platform_config=st.from_type(dict[str, str] | None), + context=st.from_type(soma.SOMATileDBContext | None), + tiledb_timestamp=tiledb_timestamps(), +) +@settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) +def test_fuzz_SparseNDArray_create( + tmp_path, + uri: str, + type: pa.DataType, + shape: typing.Sequence[typing.Optional[int]], + platform_config: typing.Dict[str, typing.Mapping[str, Any]] | object | None, + context: tiledbsoma.SOMATileDBContext | None, + tiledb_timestamp: int | datetime.datetime | None, +) -> None: + + try: + fname = (tmp_path / uri).as_posix() + A = soma.SparseNDArray.create( + uri=fname, + type=type, + shape=shape, + platform_config=platform_config, + context=context, + tiledb_timestamp=tiledb_timestamp, + ) + A.close() + + with soma.open(fname, context=context) as A: + assert len(A.schema.types) == len(shape) + 1 + assert A.schema.field("soma_data").type == type + assert A.shape == tuple((s or 1) for s in shape) + assert A.soma_type == "SOMASparseNDArray" + + finally: + shutil.rmtree(tmp_path / uri, ignore_errors=True) + + +class SOMASparseNDArrayStateMachine(SOMANDArrayStateMachine): + + def __init__(self) 
-> None: + super().__init__(shapes_factory=sparse_array_shape) + + @initialize(type=ndarray_datatype(), shape=sparse_array_shape(allow_none=True)) + def setup(self, type, shape) -> None: + super().setup( + type, + shape, + soma.SparseNDArray.create( + self.uri, + type=type, + shape=shape, + context=self.context, + tiledb_timestamp=None, # no time-travel for now + ), + ) + self.data_ledger = Ledger[ArrowTableLedgerEntry]( + initial_entry=ArrowTableLedgerEntry( + data=self.schema.empty_table(), + timestamp_ms=self.A.tiledb_timestamp_ms, + name="initial entry", + index_columns=[f"soma_dim_{n}" for n in range(len(shape))], + ), + allows_duplicates=False, + ) + + def _array_exists( + uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None + ) -> bool: + return soma.SparseNDArray.exists( + uri, context=context, tiledb_timestamp=tiledb_timestamp + ) + + def _array_open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None: + self.A = soma.SparseNDArray.open( + self.uri, mode=mode, context=self.context, tiledb_timestamp=tiledb_timestamp + ) + + ## + ## --- schema + ## + @precondition(lambda self: not self.closed) + @invariant() + def check_pytypes(self) -> None: + assert isinstance(self.A, soma.SparseNDArray) + assert self.A.soma_type == "SOMASparseNDArray" + assert self.A.is_sparse + + ## + ## --- data + ## + + @precondition(lambda self: not self.closed and self.mode == "r") + @invariant() + def check_read_all(self) -> None: + timestamp_ms = self.A.tiledb_timestamp_ms + sort_order = [(f"soma_dim_{n}", "ascending") for n in range(len(self.shape))] + expected = ( + self.data_ledger.read(timestamp_ms=timestamp_ms) + .to_table() + .sort_by(sort_order) + ) + found = self.A.read().tables().concat().sort_by(sort_order) + assert tables_equal( + found, + expected, + equal_nan=True if pa.types.is_floating(self.type) else False, + ), f"{found}\n is not equal to {expected}" + + @precondition(lambda self: not self.closed and self.mode == "r") + @invariant() + def check_nnz(self) -> None: + expected = len( + self.data_ledger.read(timestamp_ms=self.A.tiledb_timestamp_ms).to_table() + ) + assert expected == self.A.nnz, "NNZ mismatch" + + @precondition(lambda self: not self.closed and self.mode == "w") + @precondition( + lambda self: self.A.tiledb_timestamp_ms not in self.data_ledger.timestamps + ) # only one write per timestamp until sc-61223 and sc-61226 are fixed + @rule(data=st.data()) + def write(self, data: st.DataObject) -> None: + coo_tbl = data.draw(sparse_array(self.shape, self.schema)) + + fragments_before_write = get_entries(f"{self.uri}/__fragments") + self.A.write(coo_tbl) + new_fragments = set(get_entries(f"{self.uri}/__fragments")) - set( + fragments_before_write + ) + assert len(new_fragments) == 1 + self.data_ledger.write( + ArrowTableLedgerEntry( + timestamp_ms=self.A.tiledb_timestamp_ms, + name=new_fragments.pop(), + data=coo_tbl, + index_columns=[f"soma_dim_{n}" for n in range(len(self.shape))], + ) + ) + + +TestSOMASparseNDArray = pytest.mark.usefixtures("make_tmp_dir")( + SOMASparseNDArrayStateMachine.TestCase +) From e767ab27f4ac3dc5644c79b1b2ac43a40e02e028 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 14 Jan 2025 18:24:00 -0800 Subject: [PATCH 02/20] backport to python 3.9 and pandas<2.0 --- apis/python/requirements_dev.txt | 1 + apis/python/tests/ht/_array_state_machine.py | 7 +- apis/python/tests/ht/_ht_util.py | 87 ++++++++++--------- apis/python/tests/ht/conftest.py | 2 + apis/python/tests/ht/test_ht_dataframe.py | 17 ++-- 
apis/python/tests/ht/test_ht_fastercsx.py | 7 +- apis/python/tests/ht/test_ht_indexer.py | 2 +- apis/python/tests/ht/test_ht_sparsendarray.py | 6 +- 8 files changed, 74 insertions(+), 55 deletions(-) diff --git a/apis/python/requirements_dev.txt b/apis/python/requirements_dev.txt index 959beea239..cf313c6601 100644 --- a/apis/python/requirements_dev.txt +++ b/apis/python/requirements_dev.txt @@ -5,4 +5,5 @@ ruff sparse typeguard==4.4.0 types-setuptools +more-itertools hypothesis diff --git a/apis/python/tests/ht/_array_state_machine.py b/apis/python/tests/ht/_array_state_machine.py index e34ad982b1..c49813d3fa 100644 --- a/apis/python/tests/ht/_array_state_machine.py +++ b/apis/python/tests/ht/_array_state_machine.py @@ -7,18 +7,19 @@ import re from abc import abstractmethod -from typing import Any, Literal, Protocol, TypeAlias +from typing import Any, Literal, Protocol, Union import numpy as np import pyarrow as pa from hypothesis import strategies as st from hypothesis.stateful import RuleBasedStateMachine, invariant, precondition, rule +from typing_extensions import TypeAlias import tiledbsoma as soma from tests.ht._ht_test_config import HT_TEST_CONFIG -SOMAArray: TypeAlias = soma.DataFrame | soma.SparseNDArray | soma.DenseNDArray +SOMAArray: TypeAlias = Union[soma.DataFrame, soma.SparseNDArray, soma.DenseNDArray] class SOMAArrayStateMachine(RuleBasedStateMachine): @@ -39,7 +40,7 @@ def __init__(self) -> None: self.initial_metadata_keys: set[str] = set() def setup(self, A: SOMAArray) -> None: - assert isinstance(A, SOMAArray) + assert isinstance(A, (soma.DataFrame, soma.SparseNDArray, soma.DenseNDArray)) assert A.mode == "w" and not A.closed self.A = A self.create_timestamp_ms = self.A.tiledb_timestamp_ms diff --git a/apis/python/tests/ht/_ht_util.py b/apis/python/tests/ht/_ht_util.py index 07b22047e3..226820e213 100644 --- a/apis/python/tests/ht/_ht_util.py +++ b/apis/python/tests/ht/_ht_util.py @@ -3,7 +3,6 @@ from __future__ import annotations import datetime -from itertools import pairwise from typing import Any, Mapping, Sequence import hypothesis.extra.numpy as ht_np @@ -12,6 +11,8 @@ import pandas as pd import pyarrow as pa from hypothesis import strategies as st +from more_itertools import pairwise +from packaging.version import Version from tests.ht._ht_test_config import HT_TEST_CONFIG @@ -119,10 +120,19 @@ def arrow_floating_datatypes(draw: st.DrawFn) -> pa.DataType: return draw(st.sampled_from((pa.float16(), pa.float32(), pa.float64()))) +# pyarrow and pandas timestamp interop lacks support for anything other than `ns` +# units prior to pandas 2. For info, see https://github.com/apache/arrow/issues/33321 +# The simple solution is to just to restrict types to 'ns' for pandas<2. 
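+# (With pandas<2, pyarrow timestamps in 's'/'ms'/'us' units are coerced to
+# datetime64[ns] when converted to pandas, so non-'ns' units do not round-trip.)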
+if Version(pd.__version__) >= Version("2.0.0"): + TIMESTAMP_UNITS = ("s", "ms", "us", "ns") +else: + TIMESTAMP_UNITS = ("ns",) + + @st.composite def arrow_timestamp_datatypes(draw: st.DrawFn) -> pa.DataType: return pa.timestamp( - unit=draw(st.sampled_from(("s", "ms", "us", "ns"))), + unit=draw(st.sampled_from(TIMESTAMP_UNITS)), tz=draw(st.sampled_from((None, "UTC", "Europe/London"))), ) @@ -372,51 +382,48 @@ def gen_unique_floats( length = shape[0] rng = np.random.default_rng(seed=draw(st.integers(min_value=0))) - match dtype.kind: - case "f": - low = min_value if min_value is not None else -np.finfo(dtype).max / 2 - high = max_value if max_value is not None else np.finfo(dtype).max / 2 - if unique: - nparr = gen_unique_floats(rng, low, high, length).astype(dtype) + if dtype.kind == "f": + low = min_value if min_value is not None else -np.finfo(dtype).max / 2 + high = max_value if max_value is not None else np.finfo(dtype).max / 2 + if unique: + nparr = gen_unique_floats(rng, low, high, length).astype(dtype) + else: + nparr = rng.uniform(low, high=high, size=length).astype(dtype) + + elif dtype.kind == "i" or dtype.kind == "u": + # RNG draws max of int64 + low = int(min_value) if min_value is not None else -np.iinfo(dtype).max + high = int(max_value) if max_value is not None else np.iinfo(dtype).max + if (high - low) < np.iinfo(np.int64).max: + if high > low: + nparr = rng.choice(high - low, size=length, replace=(not unique)) else: - nparr = rng.uniform(low, high=high, size=length).astype(dtype) - - case "i" | "u": - # RNG draws max of int64 - low = int(min_value) if min_value is not None else -np.iinfo(dtype).max - high = int(max_value) if max_value is not None else np.iinfo(dtype).max - if (high - low) < np.iinfo(np.int64).max: - if high > low: - nparr = rng.choice(high - low, size=length, replace=(not unique)) - else: - nparr = np.full(shape=shape, fill_value=low, dtype=dtype) + nparr = np.full(shape=shape, fill_value=low, dtype=dtype) + nparr += low + else: + nparr = rng.choice( + np.iinfo(np.int64).max, size=length, replace=(not unique) + ) + if min_value is not None: nparr += low else: - nparr = rng.choice( - np.iinfo(np.int64).max, size=length, replace=(not unique) - ) - if min_value is not None: - nparr += low - else: - nparr -= np.iinfo(dtype).max // 2 + nparr -= np.iinfo(dtype).max // 2 - nparr = nparr.astype(dtype) + nparr = nparr.astype(dtype) - case "M": - # TODO: implement min_value/max_value - assert min_value is None and max_value is None - nparr = rng.choice( - np.iinfo(np.int64).max, size=length, replace=(not unique) - ) - nparr = nparr.astype(dtype) + elif dtype.kind == "M": + # TODO: implement min_value/max_value + assert min_value is None and max_value is None + nparr = rng.choice(np.iinfo(np.int64).max, size=length, replace=(not unique)) + nparr = nparr.astype(dtype) - case "b": - # TODO: implement min_value/max_value - assert min_value is None and max_value is None - nparr = rng.choice([True, False], size=length, replace=(not unique)) + elif dtype.kind == "b": + # TODO: implement min_value/max_value + assert min_value is None and max_value is None + nparr = rng.choice([True, False], size=length, replace=(not unique)) - case _: - raise TypeError(f"Unsupported dtype: {dtype}") + else: + raise TypeError(f"Unsupported dtype: {dtype}") return pad_array(nparr, draw) if padding else pa.array(nparr) diff --git a/apis/python/tests/ht/conftest.py b/apis/python/tests/ht/conftest.py index 6e26a7808e..5d34a2ac6c 100644 --- a/apis/python/tests/ht/conftest.py +++ 
b/apis/python/tests/ht/conftest.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import hypothesis as ht diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index 431e104882..a968a970ee 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -2,16 +2,18 @@ from __future__ import annotations -from itertools import pairwise -from typing import Any, Sequence +from typing import Any, Sequence, Union import numpy as np +import pandas as pd import pyarrow as pa import pytest from hypothesis import strategies as st from hypothesis.extra import numpy as ht_np from hypothesis.extra import pandas as ht_pd from hypothesis.stateful import initialize, invariant, precondition, rule +from more_itertools import pairwise +from packaging.version import Version import tiledbsoma as soma @@ -44,13 +46,16 @@ pa.large_binary(), pa.string(), pa.large_string(), - pa.timestamp("s"), - pa.timestamp("ms"), - pa.timestamp("us"), pa.timestamp("ns"), ] +if Version(pd.__version__) >= Version("2.0.0"): + DataFrameIndexTypes += [ + pa.timestamp("s"), + pa.timestamp("ms"), + pa.timestamp("us"), + ] -AxisDomain = None | tuple[Any, Any] | list[Any] +AxisDomain = Union[None, tuple[Any, Any], list[Any]] Domain = Sequence[AxisDomain] diff --git a/apis/python/tests/ht/test_ht_fastercsx.py b/apis/python/tests/ht/test_ht_fastercsx.py index 03d80439c2..13e67874b3 100644 --- a/apis/python/tests/ht/test_ht_fastercsx.py +++ b/apis/python/tests/ht/test_ht_fastercsx.py @@ -1,6 +1,8 @@ """Hypothesis tests for fastercsx module.""" -from typing import Any, Literal, TypeAlias +from __future__ import annotations + +from typing import Any, Literal, Union import numpy as np import numpy.typing as npt @@ -9,6 +11,7 @@ import scipy.sparse as sparse from hypothesis import given, settings from hypothesis import strategies as st +from typing_extensions import TypeAlias import tiledbsoma as soma import tiledbsoma._fastercsx as fastercsx @@ -44,7 +47,7 @@ ) NDArrayIndex: TypeAlias = npt.NDArray[np.integer[Any]] -NDArrayNumber: TypeAlias = npt.NDArray[np.integer[Any] | np.floating[Any]] +NDArrayNumber: TypeAlias = npt.NDArray[Union[np.integer[Any], np.floating[Any]]] def limit_value_range_element_strategy( diff --git a/apis/python/tests/ht/test_ht_indexer.py b/apis/python/tests/ht/test_ht_indexer.py index f37ec364ea..9f33093241 100644 --- a/apis/python/tests/ht/test_ht_indexer.py +++ b/apis/python/tests/ht/test_ht_indexer.py @@ -105,7 +105,7 @@ def test_pytiledbsoma_IntIndexer_map_locations( ht_np.arrays( dtype=ht_np.array_dtypes(), shape=ht_np.array_shapes(), unique=False ), - st.from_type(float | list | dict | str | bytearray), + st.from_type(Union[float, list, dict, str, bytearray]), ) ) ) diff --git a/apis/python/tests/ht/test_ht_sparsendarray.py b/apis/python/tests/ht/test_ht_sparsendarray.py index c56c5f9a3c..738b5b9051 100644 --- a/apis/python/tests/ht/test_ht_sparsendarray.py +++ b/apis/python/tests/ht/test_ht_sparsendarray.py @@ -5,7 +5,7 @@ import datetime import shutil import typing -from typing import Any +from typing import Any, Union import hypothesis as ht import numpy as np @@ -153,8 +153,8 @@ def sparse_array( min_size=1, max_size=10, ), - platform_config=st.from_type(dict[str, str] | None), - context=st.from_type(soma.SOMATileDBContext | None), + platform_config=st.from_type(Union[dict[str, str], None]), + context=st.from_type(Union[soma.SOMATileDBContext, None]), tiledb_timestamp=tiledb_timestamps(), ) 
@settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) From 2401bce938ff21b83d3cbacf57381929a2d755e3 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 15 Jan 2025 13:04:00 -0800 Subject: [PATCH 03/20] remove metadata work-arounds --- apis/python/tests/ht/_array_state_machine.py | 32 ++++---------------- apis/python/tests/ht/_ht_test_config.py | 6 ---- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/apis/python/tests/ht/_array_state_machine.py b/apis/python/tests/ht/_array_state_machine.py index c49813d3fa..059e1d3b7c 100644 --- a/apis/python/tests/ht/_array_state_machine.py +++ b/apis/python/tests/ht/_array_state_machine.py @@ -140,39 +140,19 @@ def reopen(self, mode: str) -> None: ## ## --- metadata ## - # TODO: sc-61092 causes SOMA to fail on writing a metadata value with a non-ASCII codepoint. - # TODO: due to sc-61093, zero length bytes and strings are mishandled (not written correctly). Remove the `min_size` when fixed. - # TODO: due to sc-61094, strings containing a zero code point also fail. - - METADATA_KEY_ALPHABET = ( - st.characters(codec="utf-8", exclude_characters=["\x00"]) - if HT_TEST_CONFIG["sc-61094_workaround"] - else st.characters(codec="utf-8") - ) - METADATA_KEYS = st.text(min_size=1, max_size=4096, alphabet=METADATA_KEY_ALPHABET) - - METADATA_VALUE_ALPHABET = ( - st.characters(codec="ascii", exclude_characters=["\x00"]) - if ( - HT_TEST_CONFIG["sc-61092_workaround"] - or HT_TEST_CONFIG["sc-61094_workaround"] - ) - else st.characters(codepoint="utf-8") - ) + METADATA_KEY_ALPHABET = st.characters(codec="utf-8", exclude_characters=["\x00"]) + METADATA_KEYS = st.text(min_size=0, max_size=4096, alphabet=METADATA_KEY_ALPHABET) + METADATA_VALUE_ALPHABET = st.characters(codec="utf-8", exclude_characters=["\x00"]) METADATA_VALUES = st.one_of( - st.text( - alphabet=METADATA_VALUE_ALPHABET, - min_size=1 if HT_TEST_CONFIG["sc-61093_workaround"] else 0, - ) + st.text(alphabet=METADATA_VALUE_ALPHABET, min_size=0) | st.integers( min_value=np.iinfo(np.int64).min, max_value=np.iinfo(np.int64).max ) | st.floats( allow_nan=False - ) # FIXME: disabled NaNs make assertions easier (they are supported) + ) # FIXME: disabled NaNs make assertions easier (they are supported and we should test!) 
) - - IGNORE_KEYS = re.compile(r"^soma_dim_[0-9]+_domain_(upper|lower)$") + IGNORE_KEYS = re.compile(r"^soma_dim_.*$") @precondition(lambda self: not self.closed) @invariant() diff --git a/apis/python/tests/ht/_ht_test_config.py b/apis/python/tests/ht/_ht_test_config.py index 3250820a65..ee44d5548f 100644 --- a/apis/python/tests/ht/_ht_test_config.py +++ b/apis/python/tests/ht/_ht_test_config.py @@ -17,12 +17,6 @@ "sc-61123_workaround": True, # reopen w->r loses all metadata modifications "sc-61118_workaround": True, - # metadata VALUES with non-ASCII codepoints generate an error - "sc-61092_workaround": True, - # Zero-length strings as a metadata value are stored incorrectly - "sc-61093_workaround": True, - # metadata keys with a zero codepoint are saved as empty string - "sc-61094_workaround": True, # dataframe column names of \x00 silently mutated to empty Python string "sc-61291_workaround": True, # DataFrame.write creates 1+ fragments (one per table chunk) From a043fe1c1d68f744d4219541b73cf0b01088dbc8 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 16 Jan 2025 07:07:50 -0800 Subject: [PATCH 04/20] add metadata time travel ledger --- apis/python/tests/ht/_array_state_machine.py | 79 +++++++++++++------- apis/python/tests/ht/_ledger.py | 27 ++++++- 2 files changed, 79 insertions(+), 27 deletions(-) diff --git a/apis/python/tests/ht/_array_state_machine.py b/apis/python/tests/ht/_array_state_machine.py index 059e1d3b7c..5340309612 100644 --- a/apis/python/tests/ht/_array_state_machine.py +++ b/apis/python/tests/ht/_array_state_machine.py @@ -5,6 +5,7 @@ from __future__ import annotations +import math import re from abc import abstractmethod from typing import Any, Literal, Protocol, Union @@ -18,6 +19,7 @@ import tiledbsoma as soma from tests.ht._ht_test_config import HT_TEST_CONFIG +from tests.ht._ledger import Ledger, PyDictLedgerEntry SOMAArray: TypeAlias = Union[soma.DataFrame, soma.SparseNDArray, soma.DenseNDArray] @@ -34,10 +36,6 @@ def __init__(self) -> None: self.uri = self.TestCase.tmp_path_factory.mktemp( f"{self.__class__.__name__}-" ).as_posix() - self.metadata: dict[str, Any] = ( - {} - ) # XXX TODO: should be a ledger to allow for time travel - self.initial_metadata_keys: set[str] = set() def setup(self, A: SOMAArray) -> None: assert isinstance(A, (soma.DataFrame, soma.SparseNDArray, soma.DenseNDArray)) @@ -46,8 +44,15 @@ def setup(self, A: SOMAArray) -> None: self.create_timestamp_ms = self.A.tiledb_timestamp_ms self.closed = self.A.closed self.mode = self.A.mode - self.metadata = dict(self.A.metadata) - self.initial_metadata_keys = set(self.metadata) + self.metadata_ledger = Ledger[PyDictLedgerEntry]( + initial_entry=PyDictLedgerEntry( + data=dict(self.A.metadata), + timestamp_ms=self.A.tiledb_timestamp_ms, + name="initial entry", + ), + allows_duplicates=False, + ) + self.pending_metadata: dict[str, Any] | None = None def teardown(self) -> None: if self.A is not None: @@ -81,6 +86,12 @@ def _open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None: def _close(self) -> None: assert not self.A.closed + if self.pending_metadata is not None: + self.metadata_ledger.write( + PyDictLedgerEntry(self.A.tiledb_timestamp_ms, "", self.pending_metadata) + ) + self.pending_metadata = None + self.A.close() self.closed = True self.mode = None @@ -141,56 +152,72 @@ def reopen(self, mode: str) -> None: ## --- metadata ## METADATA_KEY_ALPHABET = st.characters(codec="utf-8", exclude_characters=["\x00"]) - METADATA_KEYS = st.text(min_size=0, max_size=4096, 
alphabet=METADATA_KEY_ALPHABET) + METADATA_KEYS = st.text( + min_size=0, max_size=4096, alphabet=METADATA_KEY_ALPHABET + ).filter(lambda k: not k.startswith("soma_")) METADATA_VALUE_ALPHABET = st.characters(codec="utf-8", exclude_characters=["\x00"]) METADATA_VALUES = st.one_of( st.text(alphabet=METADATA_VALUE_ALPHABET, min_size=0) | st.integers( min_value=np.iinfo(np.int64).min, max_value=np.iinfo(np.int64).max ) - | st.floats( - allow_nan=False - ) # FIXME: disabled NaNs make assertions easier (they are supported and we should test!) + | st.floats() ) - IGNORE_KEYS = re.compile(r"^soma_dim_.*$") + IGNORE_KEYS = re.compile(r"^soma_.*$") + + @classmethod + def filter_metadata(cls, d: dict[str, Any]) -> dict[str, Any]: + """Apply the "ignore" regex to dict keys, returning the filtered dict.""" + return {k: v for k, v in d.items() if not cls.IGNORE_KEYS.match(k)} @precondition(lambda self: not self.closed) @invariant() def check_metadata(self) -> None: - # Prior to tiledbsoma 1.16, the "used domain" keys were still included. Ignore them. - # TODO: we could generalize this by removing _all_ keys that are reserved soma_* keys. - array_metadata = { - k: v for k, v in self.A.metadata.items() if not self.IGNORE_KEYS.match(k) - } - assert array_metadata == self.metadata + array_metadata = self.filter_metadata(dict(self.A.metadata)) + expected_metadata = self.filter_metadata( + self.metadata_ledger.read(timestamp_ms=self.A.tiledb_timestamp_ms).to_dict() + if self.pending_metadata is None + else self.pending_metadata + ) + assert set(array_metadata.keys()) == set(expected_metadata.keys()) + for k in array_metadata.keys(): + if isinstance(array_metadata[k], float) and math.isnan(array_metadata[k]): + assert math.isnan(expected_metadata[k]) + continue + assert array_metadata[k] == expected_metadata[k] @precondition( - lambda self: not self.closed and self.mode == "w" and len(self.metadata) < 100 + lambda self: not self.closed and self.mode == "w" and len(self.A.metadata) < 100 ) @rule(k=METADATA_KEYS, v=METADATA_VALUES) def set_metadata(self, k: str, v: str | int | float) -> None: - self.metadata[k] = v self.A.metadata[k] = v + if self.pending_metadata is None: + self.pending_metadata = self.metadata_ledger.read( + self.A.tiledb_timestamp_ms + ).to_dict() + self.pending_metadata[k] = v @precondition( lambda self: not self.closed and self.mode == "w" - and len(self.metadata) > len(self.initial_metadata_keys) + and len(self.filter_metadata(self.A.metadata)) ) @precondition(lambda self: not self.closed) @rule(data=st.data()) def del_metadata(self, data: st.DataObject) -> None: + if self.pending_metadata is None: + self.pending_metadata = self.metadata_ledger.read( + self.A.tiledb_timestamp_ms + ).to_dict() + k = data.draw( st.sampled_from( - [ - kn - for kn in self.metadata.keys() - if kn not in self.initial_metadata_keys - ] + sorted(list(self.filter_metadata(self.pending_metadata).keys())) ) ) - del self.metadata[k] del self.A.metadata[k] + del self.pending_metadata[k] class ShapesFactory(Protocol): diff --git a/apis/python/tests/ht/_ledger.py b/apis/python/tests/ht/_ledger.py index 921ff6aa61..9a49a20b2a 100644 --- a/apis/python/tests/ht/_ledger.py +++ b/apis/python/tests/ht/_ledger.py @@ -5,7 +5,7 @@ import pathlib import re from abc import ABCMeta, abstractmethod -from typing import Generic, Sequence, TypeVar +from typing import Any, Generic, Sequence, TypeVar import numpy as np import pandas as pd @@ -186,6 +186,31 @@ def to_numpy(self) -> np.ndarray: return self.data.to_numpy() +class 
PyDictLedgerEntry(LedgerEntry[dict[str, Any]]): + """Ledger entry based upon a Python dictionary.""" + + def __init__( + self, + timestamp_ms: int, + name: str, + data: dict[str, Any], + ) -> None: + super().__init__(timestamp_ms, name, data) + + def __repr__(self) -> str: + return f"PyDictLedgerEntry(timestamp_ms={self.timestamp_ms}):\n{self.data}" + + def consolidate_with( + self, other: PyDictLedgerEntry, allow_duplicates: bool + ) -> PyDictLedgerEntry: + assert not allow_duplicates, "Unsupported" + assert (self.timestamp_ms, self.name) < (other.timestamp_ms, other.name) + return other + + def to_dict(self) -> dict[str, Any]: + return self.data + + def combine_first(first: pd.DataFrame, second: pd.DataFrame) -> pd.DataFrame: """Combine dataframes - similar to pandas.DataFrame.combine_first, except fixes pandas#60128 and ignores NA values (they are copied as is). From b7e5014d8beeaf16d6f55bc3b5b676ec7995bb67 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 16 Jan 2025 07:10:03 -0800 Subject: [PATCH 05/20] add readme --- apis/python/tests/ht/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 apis/python/tests/ht/README.md diff --git a/apis/python/tests/ht/README.md b/apis/python/tests/ht/README.md new file mode 100644 index 0000000000..9a230219f5 --- /dev/null +++ b/apis/python/tests/ht/README.md @@ -0,0 +1,19 @@ +# SOMA Hypothesis-based tests + +This folder contains Hypothesis-based tests and supporting code. All will run within the standard pytest +framework and will run in the course of normal pytest execution. + +## Configuration + +The default configuration is suitable for use in CI, i.e., run fairly quickly. Please do not +change this behavior. + +In the course of development, it is often useful to more exhaustively search for test cases. +A Hypothesis profile has been defined for this case called `expensive`. You can run the tests in this +mode: + +> pytest tests/ --hypothesis-profile=expensive + +## For More Information + +See the [Hypothesis documentation](https://hypothesis.readthedocs.io/) From 9e708c7600196de169fd246ecd7f925e1aeb670a Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 16 Jan 2025 11:21:05 -0800 Subject: [PATCH 06/20] fix numeric overflow in fastercsx test --- apis/python/tests/ht/README.md | 2 + apis/python/tests/ht/test_ht_fastercsx.py | 62 +++++------------------ apis/python/tests/ht/test_ht_indexer.py | 17 ++----- 3 files changed, 19 insertions(+), 62 deletions(-) diff --git a/apis/python/tests/ht/README.md b/apis/python/tests/ht/README.md index 9a230219f5..def3e892f0 100644 --- a/apis/python/tests/ht/README.md +++ b/apis/python/tests/ht/README.md @@ -14,6 +14,8 @@ mode: > pytest tests/ --hypothesis-profile=expensive +In this mode, tests will run signicantly longer (very roughly, 100X longer than the default). 
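+
+The `expensive` profile must be registered before it can be selected; this is normally
+done in `conftest.py` via Hypothesis' standard profile mechanism. A minimal sketch (the
+settings shown here are illustrative, not necessarily what this suite registers):
+
+    # illustrative sketch only -- see conftest.py for the profile actually registered
+    import hypothesis as ht
+
+    ht.settings.register_profile("expensive", max_examples=10_000, deadline=None)
+    # then select it at run time: pytest tests/ --hypothesis-profile=expensive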
+ ## For More Information See the [Hypothesis documentation](https://hypothesis.readthedocs.io/) diff --git a/apis/python/tests/ht/test_ht_fastercsx.py b/apis/python/tests/ht/test_ht_fastercsx.py index 13e67874b3..47552bf263 100644 --- a/apis/python/tests/ht/test_ht_fastercsx.py +++ b/apis/python/tests/ht/test_ht_fastercsx.py @@ -9,7 +9,7 @@ import pyarrow as pa import pytest import scipy.sparse as sparse -from hypothesis import given, settings +from hypothesis import given from hypothesis import strategies as st from typing_extensions import TypeAlias @@ -50,15 +50,13 @@ NDArrayNumber: TypeAlias = npt.NDArray[Union[np.integer[Any], np.floating[Any]]] -def limit_value_range_element_strategy( - dtype: np.dtype, divisor: float -) -> dict[str, Any] | None: +def limit_value_range_element_strategy(dtype: np.dtype) -> dict[str, Any] | None: if dtype.kind == "f": info = np.finfo(dtype) - return {"min_value": -info.max / divisor, "max_value": info.max / divisor} + return {"min_value": -1.0, "max_value": 1.0} if dtype.kind in ["i", "u"]: info = np.iinfo(dtype) - return {"min_value": info.min // divisor, "max_value": info.max // divisor} + return {"min_value": info.min // 128, "max_value": info.max // 128} return None @@ -92,6 +90,14 @@ def coo_ijd( can easily cause some types to overflow, and others to lose precision, which makes equality comparisons tricky. + In addition, there is no (known) guarantee in scipy.sparse as to the + order of operations when summing dups. For floating point values, this + can result in cases where the sum is dependent on the order of data. + A concrete example might be a case where there are three dups at a single + coordinate: + max(float64) + 1.0 - max(float64) -> 0.0 + max(float64) - max(float64) + 1.0 -> 1.0 + To avoid this, ONLY when `not unique`, constrain the range of generated values to a very limited range (currently 1/128th of the full range). 
This is extremely unlikely to overflow as it would require 128 identical @@ -124,7 +130,7 @@ def coo_ijd( arrow_array( dtype=dtype, shape=nnz, - elements=limit_value_range_element_strategy(dtype, 128), + elements=limit_value_range_element_strategy(dtype), ) ) @@ -160,7 +166,6 @@ def coo_ijd( ), context=st.from_type(soma.SOMATileDBContext), ) -@settings(max_examples=500) def test_fastercsx_clib_compress_coo( do: st.DataObject, value_dtype: np.dtype, @@ -199,28 +204,6 @@ def test_fastercsx_clib_compress_coo( assert np.array_equal(csr.indptr, scipy_csr.indptr) assert np.array_equal(csr.indices, scipy_csr.indices) - - # XXX the non-unique case has several issues: - # - dups are added, which can overflow - # - dups are added, which for floats may not be exactly eq in some situations - # XXX cleanup debug code - if not ( - np.allclose( - csr.data, - scipy_csr.data, - equal_nan=True if value_dtype.kind == "f" else False, - ) - if not unique - else np.array_equal( - csr.data, - scipy_csr.data, - equal_nan=True if value_dtype.kind == "f" else False, - ) - ): - print(csr.data.dtype, scipy_csr.data.dtype) - print(csr.data) - print(scipy_csr.data) - print(csr.data - scipy_csr.data) assert ( np.allclose( csr.data, @@ -273,7 +256,6 @@ def test_fuzz_fastercsx_clib_compress_coo( data=st.from_type(npt.NDArray[Any]).filter(lambda a: a.dtype not in ValueTypes), context=st.from_type(soma.SOMATileDBContext), ) -@settings(max_examples=250) def test_fuzz_fastercsx_clib_sort_csx_indices( indptr: npt.NDArray[Any], indices: npt.NDArray[Any], @@ -296,7 +278,6 @@ def test_fuzz_fastercsx_clib_sort_csx_indices( out=st.from_type(npt.NDArray[Any]), context=st.from_type(soma.SOMATileDBContext), ) -@settings(max_examples=250) def test_fuzz_fastercsx_clib_copy_csx_to_dense( major_idx_start: int, major_idx_end: int, @@ -335,7 +316,6 @@ def test_fuzz_fastercsx_clib_copy_csx_to_dense( format=st.sampled_from(["csc", "csr"]), context=st.from_type(soma.SOMATileDBContext), ) -@settings(max_examples=500) def test_fastercsx_from_ijd( do: st.DataObject, value_dtype: np.dtype, @@ -366,21 +346,6 @@ def test_fastercsx_from_ijd( assert np.array_equal(cm.indptr, scipy_cm.indptr) assert np.array_equal(cm.indices, scipy_cm.indices) - - # XXX cleanup debug code - assert cm.data.dtype == scipy_cm.data.dtype - if not ( - np.allclose( - cm.data, scipy_cm.data, equal_nan=True if value_dtype.kind == "f" else False - ) - if not unique - else np.array_equal( - cm.data, scipy_cm.data, equal_nan=True if value_dtype.kind == "f" else False - ) - ): - print(cm.data.dtype, scipy_cm.data.dtype) - print(cm.data) - print(scipy_cm.data) assert ( np.allclose( cm.data, scipy_cm.data, equal_nan=True if value_dtype.kind == "f" else False @@ -404,7 +369,6 @@ def test_fastercsx_from_ijd( format=st.sampled_from(["csc", "csr"]), context=st.from_type(soma.SOMATileDBContext), ) -@settings(max_examples=500) def test_fastercsx_to_scipy( do: st.DataObject, value_dtype: np.dtype, diff --git a/apis/python/tests/ht/test_ht_indexer.py b/apis/python/tests/ht/test_ht_indexer.py index 9f33093241..1b4541c9c3 100644 --- a/apis/python/tests/ht/test_ht_indexer.py +++ b/apis/python/tests/ht/test_ht_indexer.py @@ -53,9 +53,7 @@ def test_IntIndexer_ndarray_lookup( ) ) ) -@settings( - max_examples=500, suppress_health_check=(ht.HealthCheck.function_scoped_fixture,) -) +@settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) def test_IntIndexer_arrow_lookup( data: pa.ChunkedArray, context: soma.SOMATileDBContext ) -> None: @@ -66,10 +64,7 @@ def 
test_IntIndexer_arrow_lookup( @given(data=st.from_type(Union[np.ndarray[Any, Any], List[int]])) -@settings( - max_examples=500, - suppress_health_check=(ht.HealthCheck.function_scoped_fixture,), -) +@settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) def test_fuzz_IntIndexer( data: npt.NDArray[Any], context: soma.SOMATileDBContext ) -> None: @@ -109,9 +104,7 @@ def test_pytiledbsoma_IntIndexer_map_locations( ) ) ) -@settings( - max_examples=250, suppress_health_check=(ht.HealthCheck.function_scoped_fixture,) -) +@settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) def test_fuzz_pytiledbsoma_IntIndexer_map_locations( data: npt.NDArray[Any], context: soma.SOMATileDBContext ) -> None: @@ -134,9 +127,7 @@ def test_fuzz_pytiledbsoma_IntIndexer_map_locations( ) ) ) -@settings( - max_examples=250, suppress_health_check=(ht.HealthCheck.function_scoped_fixture,) -) +@settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) def test_fuzz_pytiledbsoma_Indexer_get_indexer_general( data: Any, context: soma.SOMATileDBContext ) -> None: From 4ee1235a925c2cbddfc384e08d6e218fc15725ab Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Thu, 16 Jan 2025 17:45:21 -0800 Subject: [PATCH 07/20] remove sensitivity to another numeric precision corner case --- apis/python/tests/ht/_ht_test_config.py | 1 + apis/python/tests/ht/test_ht_fastercsx.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/apis/python/tests/ht/_ht_test_config.py b/apis/python/tests/ht/_ht_test_config.py index ee44d5548f..2f5210dca4 100644 --- a/apis/python/tests/ht/_ht_test_config.py +++ b/apis/python/tests/ht/_ht_test_config.py @@ -6,6 +6,7 @@ # Defect work-arounds, while awaiting a fix # # data corruption due to incorrect Arrow array offset handling + # See also sc-62104 "sc-61239_workaround": True, # creating array with timestamp==0 fails in 1.15 (regression) "sc-61054_workaround": True, diff --git a/apis/python/tests/ht/test_ht_fastercsx.py b/apis/python/tests/ht/test_ht_fastercsx.py index 47552bf263..03ab72344d 100644 --- a/apis/python/tests/ht/test_ht_fastercsx.py +++ b/apis/python/tests/ht/test_ht_fastercsx.py @@ -158,8 +158,8 @@ def coo_ijd( @given( do=st.data(), - value_dtype=st.sampled_from(ValueTypes), - unique=st.booleans(), + value_dtype=st.just(np.dtype(np.float32)), # st.sampled_from(ValueTypes), + unique=st.just(False), # st.booleans(), shape=st.tuples( st.integers(min_value=0, max_value=1024), st.integers(min_value=0, max_value=1024), @@ -209,6 +209,8 @@ def test_fastercsx_clib_compress_coo( csr.data, scipy_csr.data, equal_nan=True if value_dtype.kind == "f" else False, + atol=1e-07, + rtol=1e-05, ) if not unique else np.array_equal( @@ -348,7 +350,11 @@ def test_fastercsx_from_ijd( assert np.array_equal(cm.indices, scipy_cm.indices) assert ( np.allclose( - cm.data, scipy_cm.data, equal_nan=True if value_dtype.kind == "f" else False + cm.data, + scipy_cm.data, + equal_nan=True if value_dtype.kind == "f" else False, + atol=1e-07, + rtol=1e-05, ) if not unique else np.array_equal( @@ -411,6 +417,8 @@ def test_fastercsx_to_scipy( cm_slc.data, scipy_slc.data, equal_nan=True if value_dtype.kind == "f" else False, + atol=1e-07, + rtol=1e-05, ) if not unique else np.array_equal( From 9bcdce98d28d8e9f95e7fe47301d128821229de2 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 17 Jan 2025 15:32:50 -0800 Subject: [PATCH 08/20] increase scope of fastercsx coords tested --- apis/python/tests/ht/test_ht_fastercsx.py | 67 +++++++++++++---------- 1 
file changed, 37 insertions(+), 30 deletions(-) diff --git a/apis/python/tests/ht/test_ht_fastercsx.py b/apis/python/tests/ht/test_ht_fastercsx.py index 03ab72344d..b636b47f0a 100644 --- a/apis/python/tests/ht/test_ht_fastercsx.py +++ b/apis/python/tests/ht/test_ht_fastercsx.py @@ -66,7 +66,7 @@ def coo_ijd( dtype: npt.DTypeLike | pa.DataType | st.SearchStrategy[npt.DTypeLike | pa.DataType], shape: tuple[int, int] | st.SearchStrategy[tuple[int, int]], *, - density: float | st.SearchStrategy[float] = 0.01, + density: float | st.SearchStrategy[float] | None = None, unique: bool = False, ) -> tuple[ tuple[npt.NDArray[Any], ...], @@ -77,33 +77,41 @@ def coo_ijd( dtype = resolve_dtype(draw, dtype) shape = draw(shape) if isinstance(shape, st.SearchStrategy) else shape assert isinstance(shape, tuple) and len(shape) == 2 + if density is None: + nnz = draw(st.integers(min_value=0, max_value=min(np.prod(shape), 2**18))) + elif isinstance(density, st.SearchStrategy): + density = draw(density) + assert isinstance(density, float) and 0 < density <= 1 + nnz = int(shape[0] * shape[1] * density) + else: + assert isinstance(density, float) and 0 < density <= 1 + nnz = int(shape[0] * shape[1] * density) - density = draw(density) if isinstance(density, st.SearchStrategy) else density - assert isinstance(density, float) and 0 < density <= 1 - - nnz = int(shape[0] * shape[1] * density) coord_dtype = draw(st.sampled_from(CooIndexTypes)) """ - if not unique, we need to be cognizant of the potential to overflow - when duplicates are summed (the default behavior for `to_scipy`). This - can easily cause some types to overflow, and others to lose precision, - which makes equality comparisons tricky. + if not unique, we need to be cognizant of the potential to overflow or + have precision-related issues when duplicates are summed. Most types can + overflow, and floating point types have finite precision, making comparison + of "summed dups" tricky. In addition, there is no (known) guarantee in scipy.sparse as to the order of operations when summing dups. For floating point values, this can result in cases where the sum is dependent on the order of data. - A concrete example might be a case where there are three dups at a single + An extreme example might be a case where there are three dups at a single coordinate: max(float64) + 1.0 - max(float64) -> 0.0 max(float64) - max(float64) + 1.0 -> 1.0 + This can also occur in situations where overflow does not occur (the + difference in sum is due to limitations of precision). To avoid this, ONLY when `not unique`, constrain the range of generated - values to a very limited range (currently 1/128th of the full range). - This is extremely unlikely to overflow as it would require 128 identical - coordinates to be drawn. + values: currently 1/128th of the full range for integral scalars, and + [-1,1] for floating point scalars. This removes the likelihood of overflow + errors, but DOES NOT remove the precision-related issues for floats. - In the case of `unique`, draw from the full range for the type. + In the case of `unique`, draw from the full range for the type as there + will be no dups (no summing). 
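For reference, the scipy oracle these tests compare against sums duplicate COO coordinates on conversion; a minimal sketch of that behaviour, with made-up data:

import numpy as np
import scipy.sparse as sparse

# two entries share coordinate (0, 0); scipy sums them when converting to CSR
i = np.array([0, 0, 1])
j = np.array([0, 0, 2])
d = np.array([1.0, 2.0, 3.0])
csr = sparse.coo_matrix((d, (i, j)), shape=(2, 3)).tocsr()
print(csr[0, 0])  # -> 3.0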
Currently, the only edge case that fails to do the right thing is timestamp generation (datetime64), as the underlying search strategy used does not @@ -158,11 +166,11 @@ def coo_ijd( @given( do=st.data(), - value_dtype=st.just(np.dtype(np.float32)), # st.sampled_from(ValueTypes), - unique=st.just(False), # st.booleans(), + value_dtype=st.sampled_from(ValueTypes), + unique=st.booleans(), shape=st.tuples( - st.integers(min_value=0, max_value=1024), - st.integers(min_value=0, max_value=1024), + st.integers(min_value=0, max_value=2**16), + st.integers(min_value=0, max_value=2**16), ), context=st.from_type(soma.SOMATileDBContext), ) @@ -187,8 +195,7 @@ def test_fastercsx_clib_compress_coo( context.native_context, shape, i, j, d, indptr, indices, data ) - # check with the oracle. Be careful if dups allowed, as summing dups - # in floats will be _approximately_ equal, not exactly equal + # compare to oracle csr = sparse.csr_matrix( (data, indices, indptr), shape=shape, dtype=value_dtype, copy=False ) @@ -209,8 +216,8 @@ def test_fastercsx_clib_compress_coo( csr.data, scipy_csr.data, equal_nan=True if value_dtype.kind == "f" else False, - atol=1e-07, - rtol=1e-05, + atol=1e-06, + rtol=1e-04, ) if not unique else np.array_equal( @@ -311,8 +318,8 @@ def test_fuzz_fastercsx_clib_copy_csx_to_dense( value_dtype=st.sampled_from(ValueTypes), unique=st.booleans(), shape=st.tuples( - st.integers(min_value=0, max_value=1024), - st.integers(min_value=0, max_value=1024), + st.integers(min_value=0, max_value=2**16), + st.integers(min_value=0, max_value=2**16), ), make_sorted=st.booleans(), format=st.sampled_from(["csc", "csr"]), @@ -353,8 +360,8 @@ def test_fastercsx_from_ijd( cm.data, scipy_cm.data, equal_nan=True if value_dtype.kind == "f" else False, - atol=1e-07, - rtol=1e-05, + atol=1e-06, + rtol=1e-04, ) if not unique else np.array_equal( @@ -368,8 +375,8 @@ def test_fastercsx_from_ijd( value_dtype=st.sampled_from(ValueTypes), unique=st.booleans(), shape=st.tuples( - st.integers(min_value=0, max_value=1024), - st.integers(min_value=0, max_value=1024), + st.integers(min_value=0, max_value=2**16), + st.integers(min_value=0, max_value=2**16), ), make_sorted=st.booleans(), format=st.sampled_from(["csc", "csr"]), @@ -417,8 +424,8 @@ def test_fastercsx_to_scipy( cm_slc.data, scipy_slc.data, equal_nan=True if value_dtype.kind == "f" else False, - atol=1e-07, - rtol=1e-05, + atol=1e-06, + rtol=1e-04, ) if not unique else np.array_equal( From a336bfa6db26059503d179d3938a535ea72f605b Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Jan 2025 16:57:55 -0800 Subject: [PATCH 09/20] add string/binary columns to dataframe tests --- apis/python/tests/ht/_ht_test_config.py | 4 + apis/python/tests/ht/_ht_util.py | 120 ++++++++++++++++++---- apis/python/tests/ht/conftest.py | 2 +- apis/python/tests/ht/test_ht_dataframe.py | 101 +++++++++++++----- 4 files changed, 179 insertions(+), 48 deletions(-) diff --git a/apis/python/tests/ht/_ht_test_config.py b/apis/python/tests/ht/_ht_test_config.py index 2f5210dca4..4af962e509 100644 --- a/apis/python/tests/ht/_ht_test_config.py +++ b/apis/python/tests/ht/_ht_test_config.py @@ -28,6 +28,10 @@ "sc-61743_workaround": True, # Read of new array returns incorrect info "sc-61676_workaround": True, + # index columns of type binary/large_binary are reported as large_string + "sc-62236_workaround": True, + # string index values starting with 0x7F barf + "sc-62265_workaround": True, # # Enable/disable partially implemented features # diff --git a/apis/python/tests/ht/_ht_util.py 
b/apis/python/tests/ht/_ht_util.py index 226820e213..55a305353d 100644 --- a/apis/python/tests/ht/_ht_util.py +++ b/apis/python/tests/ht/_ht_util.py @@ -10,6 +10,7 @@ import numpy.typing as npt import pandas as pd import pyarrow as pa +from hypothesis import note from hypothesis import strategies as st from more_itertools import pairwise from packaging.version import Version @@ -194,16 +195,30 @@ def arrow_datatypes(draw: st.DrawFn) -> pa.DataType: def ndarray_datatype() -> st.SearchStrategy: + """Return a type that can be stored in a SOMA NDArray.""" return st.from_type(pa.DataType).filter( lambda t: ( pa.types.is_primitive(t) and not (pa.types.is_timestamp(t) and t.tz is not None) and not pa.types.is_time(t) and not pa.types.is_date(t) - and t - not in [ - pa.float16(), - ] + and t not in [pa.float16()] + ) + ) + + +def dataframe_datatype() -> st.SearchStrategy: + """Return type that can be stored in a DataFrame column.""" + return st.from_type(pa.DataType).filter( + lambda t: ( + ( + pa.types.is_primitive(t) + or t in [pa.string(), pa.large_string(), pa.binary(), pa.large_binary()] + ) + and not (pa.types.is_timestamp(t) and t.tz is not None) + and not pa.types.is_time(t) + and not pa.types.is_date(t) + and t not in [pa.float16()] ) ) @@ -539,16 +554,45 @@ def contiguous_slices(draw: Any, size: int) -> slice: return slice(start, stop + 1 if stop is not None else stop, step) -def schemas_equal(s1: pa.Schema, s2: pa.Schema, ignore_field_order=False) -> bool: - """NB: assumes all field names are unique! Raises if not.""" - if not ignore_field_order: - return s1 == s2 - else: - if len(s2) != len(s1): +def schemas_equal( + s1: pa.Schema, + s2: pa.Schema, + *, + ignore_field_order=False, + large_type_equivalence=False, +) -> bool: + """NB: assumes all field names are unique! Raises if not. + + Compares schema, returns true if "equal" - defined as: + * string/binary can be upcast to large equivalent + * ignore_field_order option + """ + + def _to_large_type_equivalent(f: pa.Field): + if pa.types.is_string(f.type): + return f.with_type(pa.large_string()) + if pa.types.is_binary(f.type): + return f.with_type(pa.large_binary()) + return f + + s1_names = sorted(s1.names) if ignore_field_order else s1.names + s2_names = sorted(s2.names) if ignore_field_order else s2.names + if s1_names != s2_names: + return False + if len(s1) != len(s2): + return False + + for field_name in s1.names: + f1 = s1.field(field_name) + f2 = s2.field(field_name) + if large_type_equivalence: + f1 = _to_large_type_equivalent(f1) + f2 = _to_large_type_equivalent(f2) + + if f1 != f2: return False - return all( - s1.field(field_name) == s2.field(field_name) for field_name in s1.names - ) + + return True def arrays_equal( @@ -558,17 +602,28 @@ def arrays_equal( # TODO: handle nullable arrays - if (read.type != expected.type) or (len(read) != len(expected)): + if read.type != expected.type: + note("arrays_equal: types not eq {read.type} != {expected.type}") + return False + + if len(read) != len(expected): + note(f"arrays_equal: length not eq {len(read)} != {len(expected)}") return False if not pa.types.is_floating(expected.type): - return expected.equals(read) + is_eq = expected.equals(read) + if not is_eq: + note("arrays_equal: contents not eq (non-float)") + else: + # Floating point path, to allow for NaN. 
Implemented with NumPy for convenience only + is_eq = all( + np.array_equal(r.to_numpy(), e.to_numpy(), equal_nan=equal_nan) + for r, e in zip(read.chunks, expected.chunks) + ) + if not is_eq: + note("arrays_equal: contents not eq (float)") - # Floating point path, to allow for NaN. Implemented with NumPy for convenience only - return all( - np.array_equal(r.to_numpy(), e.to_numpy(), equal_nan=equal_nan) - for r, e in zip(read.chunks, expected.chunks) - ) + return is_eq def tables_equal( @@ -583,6 +638,7 @@ def tables_equal( # checking field order and length up front simplifies code below if [f.name for f in read_schema] != [f.name for f in expected_schema]: + note(f"tables_equal: field names not eq: {read_schema} != {expected_schema}") return False if HT_TEST_CONFIG["sc-61222_workaround"]: @@ -602,12 +658,34 @@ def tables_equal( fidx, read_schema.field(fidx).with_type(field.type) ) - if (read_schema != expected_schema) or len(read) != len(expected): + def _upcast_to_large(schema: pa.Schema) -> pa.Schema: + for fidx, field in enumerate(schema): + if pa.types.is_string(field.type): + schema = schema.set( + fidx, schema.field(fidx).with_type(pa.large_string()) + ) + if pa.types.is_binary(field.type): + schema = schema.set( + fidx, schema.field(fidx).with_type(pa.large_binary()) + ) + return schema + + # TileDB upcasts variable length types to large - so treat as equivalent + read_schema = _upcast_to_large(read_schema) + expected_schema = _upcast_to_large(expected_schema) + if not schemas_equal(read_schema, expected_schema, large_type_equivalence=True): + note(f"tables_equal: not eq: {read_schema} != {expected_schema}") + return False + + if len(read) != len(expected): + note(f"tables_equal: length not eq: {len(read)} != {len(expected)}") return False expected = expected.cast(expected_schema) read = read.cast(read_schema) is_eq = all(arrays_equal(r, e, equal_nan=equal_nan) for r, e in zip(read, expected)) + if not is_eq: + note(f"tables_equal: contents not eq: {read} != {expected}") return is_eq diff --git a/apis/python/tests/ht/conftest.py b/apis/python/tests/ht/conftest.py index 5d34a2ac6c..d8ddfdf8cb 100644 --- a/apis/python/tests/ht/conftest.py +++ b/apis/python/tests/ht/conftest.py @@ -66,4 +66,4 @@ def context(concurrency: int | None) -> soma.SOMATileDBContext: # Register hypothesis profile for extensive/expensive test runs -ht.settings.register_profile("expensive", max_examples=10000) +ht.settings.register_profile("expensive", max_examples=10000, print_blob=True) diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index a968a970ee..d511dc8115 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -8,7 +8,7 @@ import pandas as pd import pyarrow as pa import pytest -from hypothesis import strategies as st +from hypothesis import strategies as st, reproduce_failure from hypothesis.extra import numpy as ht_np from hypothesis.extra import pandas as ht_pd from hypothesis.stateful import initialize, invariant, precondition, rule @@ -21,6 +21,7 @@ from tests.ht._ht_test_config import HT_TEST_CONFIG from tests.ht._ht_util import ( arrow_schema, + dataframe_datatype, df_to_table, from_datatype, pad_array, @@ -42,12 +43,16 @@ pa.uint64(), pa.float32(), pa.float64(), - pa.binary(), - pa.large_binary(), pa.string(), pa.large_string(), pa.timestamp("ns"), ] +if not HT_TEST_CONFIG["sc-62236_workaround"]: + DataFrameIndexTypes += [ + pa.binary(), + pa.large_binary(), + ] + if Version(pd.__version__) >= 
Version("2.0.0"): DataFrameIndexTypes += [ pa.timestamp("s"), @@ -75,18 +80,7 @@ def dataframe_schema(draw: st.DrawFn) -> tuple[Sequence[str], pa.Schema]: arrow_schema( required_fields=(pa.field("soma_joinid", pa.int64(), nullable=False),), unique_field_names=True, - elements=st.from_type(pa.DataType).filter( - lambda t: ( - pa.types.is_primitive(t) - and not (pa.types.is_timestamp(t) and t.tz is not None) - and not pa.types.is_time(t) - and not pa.types.is_date(t) - and t - not in [ - pa.float16(), - ] - ) - ), + elements=dataframe_datatype(), ) ) assert len(schema) > 1 @@ -181,6 +175,7 @@ def dataframe_domain( index_column_names: Sequence[str], max_domain: Domain | None = None, current_domain: Domain | None = None, + apply_defaults: bool = False, ) -> Domain: """Strategy to generate DataFrame domains. @@ -200,9 +195,7 @@ def dataframe_domain( new_domain = [] for field_index, field_name in enumerate(index_column_names): field = schema.field(field_name) - if not pa.types.is_primitive(field.type): - new_domain.append(None) # i.e., noop, use default - else: + if pa.types.is_primitive(field.type): zero = ( np.datetime64(0, field.type.unit) if pa.types.is_timestamp(field.type) @@ -264,6 +257,10 @@ def dataframe_domain( assert max_upper >= upper >= current_upper new_domain.append((lower, upper)) + else: + # no idea what this is, so specify default + new_domain.append(None) + assert len(new_domain) == len(index_column_names) return tuple(new_domain) @@ -282,17 +279,24 @@ def arrow_table( * have unique values in the index columns * have values within the domain for the index columns """ - index_domains = {k: v for k, v in zip(index_column_names, domain)} + index_domains = { + k: v if v is not None else (None, None) + for k, v in zip(index_column_names, domain) + } columns = [] for field in schema: name = field.name - dtype = np.dtype(field.type.to_pandas_dtype()) unique = name in index_column_names or name == "soma_joinid" elements = None min_value, max_value = index_domains.get(name, (None, None)) assert name in index_domains or (min_value is None and max_value is None) + # special case - limit even if it isn't an index + if name == "soma_joinid" and min_value is None: + min_value = 0 + max_value = 2**56 - 1 + if pa.types.is_timestamp(field.type): # don't generate NaT. ht_np.from_dtype doesn't obey min/max value # params, so draw ints, and then convert. NEB-7 says NaT is -2**63. @@ -306,6 +310,7 @@ def arrow_table( if max_value is None else min(2**63 - 1, int(max_value.astype(np.int64))) ) + dtype = np.dtype(field.type.to_pandas_dtype()) elements = st.builds( dtype.type, st.integers(min_value=min_value, max_value=max_value), @@ -313,6 +318,7 @@ def arrow_table( ) elif pa.types.is_primitive(field.type): + dtype = np.dtype(field.type.to_pandas_dtype()) elements = ht_np.from_dtype(dtype, min_value=min_value, max_value=max_value) # Array dimensions do not de-dup -0. and 0. as the same. Disable any generation # of negative zero until this is resolved. NB: ledger de-dup treats them a equivalent @@ -322,7 +328,28 @@ def arrow_table( ): elements = elements.filter(lambda x: not (x == 0 and np.signbit(x))) - # else, use default + elif field.type in [pa.string(), pa.large_string()]: + dtype = np.dtype(str) + if name in index_column_names: + # TileDB string index columns are restricted to "ASCII", and in + # actuality to [0,128). These tests use Pandas indexing, which are + # foobared on anything containing a null. 
So in practice, use [1,126] + if HT_TEST_CONFIG["sc-62265_workaround"]: + elements = st.text( + alphabet=st.characters(codec="ascii", min_codepoint=1, max_codepoint=126) + ) + else: + elements = st.text(alphabet=st.characters(codec="ascii", min_codepoint=1)) + else: + # Disallow surrogate codepoints. Arrow doesn't implement them in the + # encoder/decoder, and will throw if they are present. + elements = st.text(alphabet=st.characters(exclude_categories=["C"])) + + elif field.type in [pa.binary(), pa.large_binary()]: + dtype = np.dtype(bytes) + + else: # use default + dtype = np.dtype(field.type.to_pandas_dtype()) columns.append( ht_pd.column(name=name, dtype=dtype, unique=unique, elements=elements) @@ -395,8 +422,6 @@ def setup( self.domain = self.A.domain assert not self.A.closed assert self.A.mode == "w" - assert schemas_equal(self.schema, self.A.schema, ignore_field_order=True) - self.data_ledger = Ledger[ArrowTableLedgerEntry]( initial_entry=ArrowTableLedgerEntry( data=self.schema.empty_table(), @@ -428,7 +453,12 @@ def _array_open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None def check_schema(self) -> None: assert isinstance(self.A, soma.DataFrame) assert self.A.soma_type == "SOMADataFrame" - assert schemas_equal(self.schema, self.A.schema, ignore_field_order=True) + assert schemas_equal( + self.schema, + self.A.schema, + ignore_field_order=True, + large_type_equivalence=True, + ) assert sorted(self.schema.names) == sorted(self.A.keys()) assert self.index_column_names == self.A.index_column_names @@ -439,8 +469,26 @@ def check_schema(self) -> None: @precondition(lambda self: not self.closed) @invariant() def check_domain(self) -> None: - assert ( - self.A.domain == self.domain + domain = [] + for iname, idomain in zip(self.index_column_names, self.domain): + if idomain is not None: + domain.append(idomain) + else: + type = self.schema.field(iname).type + if type in [ + pa.string(), + pa.large_string(), + pa.binary(), + pa.large_binary(), + ]: + domain.append(("", "")) + elif pa.type.is_primitive(type): + domain.append((0, 0)) + else: + domain.append(None) + + assert self.A.domain == tuple( + domain ), f"Unexpected domain in {self.A}: had {self.A.domain}, expected {self.domain}" @precondition(lambda self: self.closed or self.mode == "w") @@ -453,6 +501,7 @@ def expand_domain(self, data: st.DataObject) -> None: index_column_names=self.index_column_names, current_domain=self.domain, max_domain=self.A.maxdomain, + apply_defaults=True, ) ) if self.closed: From d94a07e663b01f76aab4ca0c7f867b150bc8622b Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Wed, 22 Jan 2025 17:01:37 -0800 Subject: [PATCH 10/20] lint --- apis/python/tests/ht/test_ht_dataframe.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index d511dc8115..4aab91fcab 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -8,7 +8,7 @@ import pandas as pd import pyarrow as pa import pytest -from hypothesis import strategies as st, reproduce_failure +from hypothesis import strategies as st from hypothesis.extra import numpy as ht_np from hypothesis.extra import pandas as ht_pd from hypothesis.stateful import initialize, invariant, precondition, rule @@ -336,10 +336,14 @@ def arrow_table( # foobared on anything containing a null. 
So in practice, use [1,126] if HT_TEST_CONFIG["sc-62265_workaround"]: elements = st.text( - alphabet=st.characters(codec="ascii", min_codepoint=1, max_codepoint=126) + alphabet=st.characters( + codec="ascii", min_codepoint=1, max_codepoint=126 + ) ) else: - elements = st.text(alphabet=st.characters(codec="ascii", min_codepoint=1)) + elements = st.text( + alphabet=st.characters(codec="ascii", min_codepoint=1) + ) else: # Disallow surrogate codepoints. Arrow doesn't implement them in the # encoder/decoder, and will throw if they are present. From 3c70e438b6ac6d2d3e3c9ced8a0b9e8b33051173 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Sat, 25 Jan 2025 21:18:06 -0800 Subject: [PATCH 11/20] add enum/dict --- apis/python/tests/ht/_arrow_util.py | 47 +++ apis/python/tests/ht/_ht_test_config.py | 6 + apis/python/tests/ht/_ht_util.py | 161 +++++--- apis/python/tests/ht/_ledger.py | 35 +- apis/python/tests/ht/test_ht_dataframe.py | 419 ++++++++++++++++----- apis/python/tests/ht/test_ht_io_anndata.py | 60 +++ 6 files changed, 582 insertions(+), 146 deletions(-) create mode 100644 apis/python/tests/ht/_arrow_util.py create mode 100644 apis/python/tests/ht/test_ht_io_anndata.py diff --git a/apis/python/tests/ht/_arrow_util.py b/apis/python/tests/ht/_arrow_util.py new file mode 100644 index 0000000000..9609a49bf7 --- /dev/null +++ b/apis/python/tests/ht/_arrow_util.py @@ -0,0 +1,47 @@ +""" Various utilities for dealing with Arrow data.""" + +from __future__ import annotations + +import pyarrow as pa + + +def combine_chunks(a: pa.ChunkedArray) -> pa.Array: + """Semantically identical to pa.ChunkedArray.combine_chunks, but handles the + `large_` types which are unimplemented by pyarrow. + """ + type = a.type + + if pa.types.is_large_string(type): + return a.cast(pa.string()).combine_chunks().cast(type) + + if pa.types.is_large_binary(type): + return a.cast(pa.binary()).combine_chunks().cast(type) + + if pa.types.is_dictionary(type): + if pa.types.is_large_string(type.value_type): + return ( + a.cast( + pa.dictionary( + index_type=type.index_type, + value_type=pa.string(), + ordered=type.ordered, + ) + ) + .combine_chunks() + .cast(type) + ) + + if pa.types.is_large_binary(type.value_type): + return ( + a.cast( + pa.dictionary( + index_type=type.index_type, + value_type=pa.binary(), + ordered=type.ordered, + ) + ) + .combine_chunks() + .cast(type) + ) + + return a.combine_chunks() diff --git a/apis/python/tests/ht/_ht_test_config.py b/apis/python/tests/ht/_ht_test_config.py index 4af962e509..b489b06ae9 100644 --- a/apis/python/tests/ht/_ht_test_config.py +++ b/apis/python/tests/ht/_ht_test_config.py @@ -32,6 +32,12 @@ "sc-62236_workaround": True, # string index values starting with 0x7F barf "sc-62265_workaround": True, + # dictionary of timestamps is not working + "sc-62364_workaround": True, + # string categories of value '' fail in write + "sc-62447_workaround": True, + # float categoricals fail with NaN + "sc-62449_workaround": True, # # Enable/disable partially implemented features # diff --git a/apis/python/tests/ht/_ht_util.py b/apis/python/tests/ht/_ht_util.py index 55a305353d..c0efa83e65 100644 --- a/apis/python/tests/ht/_ht_util.py +++ b/apis/python/tests/ht/_ht_util.py @@ -15,6 +15,7 @@ from more_itertools import pairwise from packaging.version import Version +from tests.ht._arrow_util import combine_chunks from tests.ht._ht_test_config import HT_TEST_CONFIG Shape = tuple[int, ...] 
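A minimal usage sketch of the combine_chunks helper added above, imported the same way _ht_util.py does; the example data is made up:

import pyarrow as pa

from tests.ht._arrow_util import combine_chunks

chunks = [
    pa.array(["a", "b"], type=pa.large_string()),
    pa.array(["c"], type=pa.large_string()),
]
arr = combine_chunks(pa.chunked_array(chunks))

# a single contiguous Array is returned, with the original large_ type preserved
assert isinstance(arr, pa.Array)
assert arr.type == pa.large_string()
assert arr.to_pylist() == ["a", "b", "c"]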
@@ -108,6 +109,7 @@ def tiledb_timestamps(from_future: bool = False): @st.composite def arrow_integer_datatypes(draw: st.DrawFn) -> pa.DataType: + """Strategy returns an arrow integer datatype.""" return draw(st.sampled_from((pa.int8(), pa.int16(), pa.int32(), pa.int64()))) @@ -171,7 +173,7 @@ def arrow_decimal_datatypes(draw: st.DrawFn) -> pa.DataType: @st.composite -def arrow_datatypes(draw: st.DrawFn) -> pa.DataType: +def arrow_nondict_datatypes(draw: st.DrawFn) -> pa.DataType: return draw( st.one_of( arrow_integer_datatypes(), @@ -194,6 +196,26 @@ def arrow_datatypes(draw: st.DrawFn) -> pa.DataType: ) +@st.composite +def arrow_dictionary_datatypes(draw: st.DraFn) -> pa.DataType: + index_type = draw( + st.one_of((arrow_integer_datatypes(), arrow_unsigned_integer_datatypes())) + ) + value_type = draw(arrow_nondict_datatypes()) + ordered = draw(st.booleans()) + return pa.dictionary(index_type=index_type, value_type=value_type, ordered=ordered) + + +@st.composite +def arrow_datatypes(draw: st.DrawFn) -> pa.DataType: + return draw( + st.one_of( + arrow_nondict_datatypes(), + arrow_dictionary_datatypes(), + ) + ) + + def ndarray_datatype() -> st.SearchStrategy: """Return a type that can be stored in a SOMA NDArray.""" return st.from_type(pa.DataType).filter( @@ -209,18 +231,39 @@ def ndarray_datatype() -> st.SearchStrategy: def dataframe_datatype() -> st.SearchStrategy: """Return type that can be stored in a DataFrame column.""" - return st.from_type(pa.DataType).filter( - lambda t: ( + + def is_dataframe_value_type(dt: pa.DataType) -> bool: + return ( ( - pa.types.is_primitive(t) - or t in [pa.string(), pa.large_string(), pa.binary(), pa.large_binary()] + pa.types.is_primitive(dt) + or dt + in [pa.string(), pa.large_string(), pa.binary(), pa.large_binary()] ) - and not (pa.types.is_timestamp(t) and t.tz is not None) - and not pa.types.is_time(t) - and not pa.types.is_date(t) - and t not in [pa.float16()] + and not (pa.types.is_timestamp(dt) and dt.tz is not None) + and not pa.types.is_time(dt) + and not pa.types.is_date(dt) + and dt not in [pa.float16()] ) - ) + + def is_dataframe_column_type(dt: pa.DataType) -> bool: + if is_dataframe_value_type(dt): + return True + + if pa.types.is_dictionary(dt): + # Arrow can't convert unsigned index types into Pandas + if pa.types.is_unsigned_integer(dt.index_type): + return False + + if HT_TEST_CONFIG["sc-62364_workaround"] and pa.types.is_timestamp( + dt.value_type + ): + return False + + return is_dataframe_value_type(dt.value_type) + + return False + + return st.from_type(pa.DataType).filter(is_dataframe_column_type) @st.composite @@ -554,6 +597,36 @@ def contiguous_slices(draw: Any, size: int) -> slice: return slice(start, stop + 1 if stop is not None else stop, step) +def field_to_large_type_equivalent(f: pa.Field) -> pa.Field: + """Upcast string and binary to large equivalents.""" + + if pa.types.is_dictionary(f.type): + if pa.types.is_string(f.type.value_type): + return f.with_type( + pa.dictionary( + index_type=f.type.index_type, + value_type=pa.large_string(), + ordered=f.type.ordered, + ) + ) + elif pa.types.is_binary(f.type.value_type): + return f.with_type( + pa.dictionary( + index_type=f.type.index_type, + value_type=pa.large_binary(), + ordered=f.type.ordered, + ) + ) + else: + return f + elif pa.types.is_string(f.type): + return f.with_type(pa.large_string()) + elif pa.types.is_binary(f.type): + return f.with_type(pa.large_binary()) + else: + return f + + def schemas_equal( s1: pa.Schema, s2: pa.Schema, @@ -567,29 +640,24 @@ def 
schemas_equal( * string/binary can be upcast to large equivalent * ignore_field_order option """ - - def _to_large_type_equivalent(f: pa.Field): - if pa.types.is_string(f.type): - return f.with_type(pa.large_string()) - if pa.types.is_binary(f.type): - return f.with_type(pa.large_binary()) - return f - s1_names = sorted(s1.names) if ignore_field_order else s1.names s2_names = sorted(s2.names) if ignore_field_order else s2.names if s1_names != s2_names: + note(f"Schema names not eq, {s1_names} != {s2_names}") return False if len(s1) != len(s2): + note(f"Schema length not eq, {len(s1)} != {len(s2)}") return False for field_name in s1.names: f1 = s1.field(field_name) f2 = s2.field(field_name) if large_type_equivalence: - f1 = _to_large_type_equivalent(f1) - f2 = _to_large_type_equivalent(f2) + f1 = field_to_large_type_equivalent(f1) + f2 = field_to_large_type_equivalent(f2) if f1 != f2: + note(f"Schema fields not eq, {f1} != {f2}") return False return True @@ -603,18 +671,14 @@ def arrays_equal( # TODO: handle nullable arrays if read.type != expected.type: - note("arrays_equal: types not eq {read.type} != {expected.type}") + note(f"arrays_equal: types not eq {read.type} != {expected.type}") return False if len(read) != len(expected): note(f"arrays_equal: length not eq {len(read)} != {len(expected)}") return False - if not pa.types.is_floating(expected.type): - is_eq = expected.equals(read) - if not is_eq: - note("arrays_equal: contents not eq (non-float)") - else: + if pa.types.is_floating(expected.type): # Floating point path, to allow for NaN. Implemented with NumPy for convenience only is_eq = all( np.array_equal(r.to_numpy(), e.to_numpy(), equal_nan=equal_nan) @@ -623,6 +687,23 @@ def arrays_equal( if not is_eq: note("arrays_equal: contents not eq (float)") + elif pa.types.is_dictionary(expected.type): + # weak equivalence for dictionary encoded arrays. Just check that values, + # regardless of dictionary, are equal. + is_eq = pa.compute.all( + pa.compute.equal( + combine_chunks(read).dictionary_decode(), + combine_chunks(expected).dictionary_decode(), + ) + ) + if not is_eq: + note("arrays_equal: dictionary arrays not equal") + + else: + is_eq = expected.equals(read) + if not is_eq: + note("arrays_equal: contents not eq (non-float)") + return is_eq @@ -660,14 +741,10 @@ def tables_equal( def _upcast_to_large(schema: pa.Schema) -> pa.Schema: for fidx, field in enumerate(schema): - if pa.types.is_string(field.type): - schema = schema.set( - fidx, schema.field(fidx).with_type(pa.large_string()) - ) - if pa.types.is_binary(field.type): - schema = schema.set( - fidx, schema.field(fidx).with_type(pa.large_binary()) - ) + f_large = field_to_large_type_equivalent(field) + if f_large != field: + schema = schema.set(fidx, f_large) + return schema # TileDB upcasts variable length types to large - so treat as equivalent @@ -689,22 +766,6 @@ def _upcast_to_large(schema: pa.Schema) -> pa.Schema: return is_eq -def df_to_table(df: pd.DataFrame, *, schema: pa.Schema | None = None) -> pa.Table: - - # Table.from_pandas attempts to infer nulled values (e.g., NaN->null, NaT->null). - # We often do not want this behavior, so explicitly override it with `from_pandas=False` - # paramter of pa.array(). - - # NB: this doesn't work with NaT/timestamp64. We could pass a `mask` param to `to_numpy`, - # but NaT is such a strange beast, leaving as is for now. 
- - schema = pa.Schema.from_pandas(df).remove_metadata() if schema is None else schema - tbl = pa.Table.from_pydict( - {k: pa.array(v, from_pandas=False) for k, v in df.items()}, schema=schema - ) - return tbl - - def posix_filename() -> st.SearchStrategy: return st.text( alphabet=st.characters( diff --git a/apis/python/tests/ht/_ledger.py b/apis/python/tests/ht/_ledger.py index 9a49a20b2a..ded0f4b4a9 100644 --- a/apis/python/tests/ht/_ledger.py +++ b/apis/python/tests/ht/_ledger.py @@ -217,10 +217,21 @@ def combine_first(first: pd.DataFrame, second: pd.DataFrame) -> pd.DataFrame: NB: the two dataframes MUST have the same structure, and we aren't too careful about checking for that. + + NB: there is a modest attempt to combine categories for categorical/dictionary + types, but there are corner cases where the "right" answer is not obvious (e.g., + conflicting ordering of ordered categories). """ assert first.columns.equals(second.columns) - assert first.dtypes.equals(second.dtypes) + assert len(first.dtypes) == len(second.dtypes) + for (_1, s1), (_2, s2) in zip(first.items(), second.items()): + if s1.dtype == "category": + assert s2.dtype == "category" + assert s1.cat.ordered == s2.cat.ordered + assert s1.cat.categories.dtype == s2.cat.categories.dtype + else: + assert s1.dtype == s2.dtype assert first.index.nlevels == second.index.nlevels new_index = first.index.union(second.index) @@ -238,8 +249,26 @@ def combine_first(first: pd.DataFrame, second: pd.DataFrame) -> pd.DataFrame: if first_series.dtype.kind == "M" and second_series.dtype.kind == "M": second_series = pd.to_datetime(second_series) - combined_series = pd.concat([first_series, second_series]) - combined_series = combined_series.reindex(new_index, copy=False) + if not len(first_series): + combined_series = second_series + elif not len(second_series): + combined_series = first_series + else: + combined_series = pd.concat([first_series, second_series]) + combined_series = combined_series.reindex(new_index, copy=False) + + # Pandas concat will drop categorical information. Re-categoricalize. 
+ if first_series.dtype == "category": + assert first_series.cat.ordered == second_series.cat.ordered + merged_categories = list( + dict.fromkeys(first_series.cat.categories) + | dict.fromkeys(second_series.cat.categories) + ) + combined_series = combined_series.astype( + pd.CategoricalDtype( + categories=merged_categories, ordered=first_series.cat.ordered + ) + ) new_data[col] = combined_series diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index 4aab91fcab..1132c86d9d 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -2,8 +2,9 @@ from __future__ import annotations -from typing import Any, Sequence, Union +from typing import Any, Generic, Sequence, TypeVar, Union +import attrs import numpy as np import pandas as pd import pyarrow as pa @@ -20,9 +21,10 @@ from tests.ht._array_state_machine import SOMAArrayStateMachine from tests.ht._ht_test_config import HT_TEST_CONFIG from tests.ht._ht_util import ( + arrow_array, arrow_schema, dataframe_datatype, - df_to_table, + # df_to_table, from_datatype, pad_array, schemas_equal, @@ -64,8 +66,55 @@ Domain = Sequence[AxisDomain] +T = TypeVar("T") + + +@attrs.define(kw_only=True, frozen=True) +class EnumerationMetadata(Generic[T]): + type: pa.DictionaryType + max_categories: int = attrs.field(init=False) + categories: tuple[T] = attrs.field(factory=tuple) + + def __attrs_post_init__(self): + # we are frozen, so use __setattr__ to bypass. + max_categories = np.iinfo(self.type.index_type.to_pandas_dtype()).max + + # catch the corner case where the cardinality of the value type + # is smaller than the index type. + if pa.types.is_integer(self.type.value_type): + max_categories = min( + max_categories, np.iinfo(self.type.value_type.to_pandas_dtype()).max + ) + object.__setattr__(self, "max_categories", max_categories) + + @property + def ordered(self) -> bool: + return self.type.ordered != 0 + + @property + def index_type(self) -> pa.DataType: + return self.type.index_type + + @property + def value_type(self) -> pa.DataType: + return self.type.value_type + + @property + def num_categories(self) -> int: + return len(self.categories) + + def extend_categories( + self, additional_categories: Sequence[T] + ) -> EnumerationMetadata[T]: + return attrs.evolve( + self, categories=tuple(list(self.categories) + list(additional_categories)) + ) + + @st.composite -def dataframe_schema(draw: st.DrawFn) -> tuple[Sequence[str], pa.Schema]: +def dataframe_schema( + draw: st.DrawFn, +) -> tuple[tuple[str], pa.Schema, dict[str, EnumerationMetadata[Any]]]: """Strategy will generate a legal DataFrame schema and accompanying index names. 
Will comply with SOMA/TileDB conventions: @@ -123,11 +172,20 @@ def dataframe_schema(draw: st.DrawFn) -> tuple[Sequence[str], pa.Schema]: ] schema = pa.schema(reordered_fields) + # define enumerations metadata + enumeration_metadata: dict[str, EnumerationMetadata[Any]] = {} + for field_idx, field in enumerate(schema): + if field.name in index_column_names: + continue + if not pa.types.is_dictionary(field.type): + continue + enumeration_metadata[field.name] = EnumerationMetadata(type=field.type) + assert len(schema) > 1 assert len(index_column_names) > 0 assert len(index_column_names) < len(schema) - return index_column_names, schema + return index_column_names, schema, enumeration_metadata def default_max_domain(datatype: pa.DataType) -> AxisDomain: @@ -266,107 +324,269 @@ def dataframe_domain( @st.composite -def arrow_table( +def column_values( + draw: st.DrawFn, + type: pa.DataType, + size: int, + is_index: bool, + unique: bool, + domain: tuple[int, int] | tuple[None, None], + is_dict_value: bool = False, # only used for bug workarounds +) -> pa.Array: + + min_value, max_value = domain + + if pa.types.is_timestamp(type): + # don't generate NaT. ht_np.from_dtype doesn't obey min/max value + # params, so draw ints, and then convert. NEB-7 says NaT is -2**63. + min_value = ( + -(2**63) + 1 + if min_value is None + else max(-(2**63) + 1, int(min_value.astype(np.int64))) + ) + max_value = ( + 2**63 - 1 + if max_value is None + else min(2**63 - 1, int(max_value.astype(np.int64))) + ) + dtype = np.dtype(type.to_pandas_dtype()) + elements = st.builds( + dtype.type, + st.integers(min_value=min_value, max_value=max_value), + st.just(type.unit), + ) + return draw( + arrow_array(type, size, elements=elements, unique=unique, padding=False) + ) + + elif pa.types.is_floating(type) and ( + HT_TEST_CONFIG["sc-61506_workaround"] or HT_TEST_CONFIG["sc-62449_workaround"] + ): + dtype = np.dtype(type.to_pandas_dtype()) + elements = ht_np.from_dtype(dtype, min_value=min_value, max_value=max_value) + if HT_TEST_CONFIG["sc-61506_workaround"]: + # Array dimensions do not de-dup -0. and 0. as the same. Disable any generation + # of negative zero until this is resolved. NB: ledger de-dup treats them a equivalent + # per IEEE 754 semantics. + elements = elements.filter(lambda x: not (x == 0 and np.signbit(x))) + + if HT_TEST_CONFIG["sc-62449_workaround"] and is_dict_value: + # NaN as categorical values fails to evolve the enum correctly, do disable NaN values + # ONLY when generating cat values. + elements = elements.filter(lambda x: not np.isnan(x)) + + return draw( + arrow_array(type, size, elements=elements, unique=unique, padding=False) + ) + + elif pa.types.is_primitive(type): + dtype = np.dtype(type.to_pandas_dtype()) + elements = ht_np.from_dtype(dtype, min_value=min_value, max_value=max_value) + return draw( + arrow_array(type, size, elements=elements, unique=unique, padding=False) + ) + + elif type in [pa.binary(), pa.large_binary()]: + if HT_TEST_CONFIG["sc-62447_workaround"]: + return draw( + arrow_array( + np.dtype(bytes), + size, + elements=st.binary(min_size=1).filter(lambda b: b"\x00" not in b), + unique=unique, + padding=False, + ) + ) + else: + return draw( + arrow_array(np.dtype(bytes), size, unique=unique, padding=False) + ) + + elif type in [pa.string(), pa.large_string()]: + dtype = np.dtype(str) + if is_index: + # TileDB string index columns are restricted to "7 bit ASCII". These tests use + # Pandas indexing, which are foobared on anything containing a null. 
+ # So in practice, use [1,127] + if HT_TEST_CONFIG["sc-62265_workaround"]: + elements = st.text( + alphabet=st.characters( + codec="ascii", min_codepoint=1, max_codepoint=126 + ) + ) + else: + elements = st.text( + alphabet=st.characters(codec="ascii", min_codepoint=1) + ) + else: + # Disallow surrogate codepoints. Arrow doesn't implement them in the + # encoder/decoder, and will throw if they are present. + if is_dict_value and HT_TEST_CONFIG["sc-62447_workaround"]: + # don't allow empty string due to sc-62447 + elements = st.text( + alphabet=st.characters(exclude_categories=["C"]), min_size=1 + ) + else: + elements = st.text(alphabet=st.characters(exclude_categories=["C"])) + + return draw( + arrow_array(dtype, size, elements=elements, unique=unique, padding=False) + ) + + assert False, f"Unknown type: no arrow_table strategy for this type {type}" + + +def setdiff(a: set[Any], b: set[Any]) -> set[Any]: + """Set diff (a-b) with nan equivalence.""" + + def wo_nan(s): + return {v for v in s if v == v} # v!=v means Nan + + a_wo_nan, b_wo_nan = wo_nan(a), wo_nan(b) + if a_wo_nan != a and b_wo_nan != b: + # both had a Nan, so diff the wo_nan sets + return a_wo_nan - b_wo_nan + elif a_wo_nan == a and b_wo_nan == b: + # neither had a NaN, so diff the original sets + return a - b + elif a_wo_nan != a and b_wo_nan == b: + # a had a NaN, b did not, diff the wo sets and add a NaN. + # this handles the case where set a had multiple (different) + # NaNs + return (a_wo_nan - b_wo_nan) | {np.nan} + else: + # b had a NaN, a did not, so just diff the wo sets + return a_wo_nan - b_wo_nan + + +@st.composite +def arrow_table2( draw: st.DrawFn, schema: pa.Schema, index_column_names: Sequence[str], + enumeration_metadata: dict[str, EnumerationMetadata[Any]], domain: Domain, *, - min_size: int | None = None, -) -> pa.Table: - """Strategy to generate Arrow Tables which: - * match the schema - * have unique values in the index columns - * have values within the domain for the index columns - """ + min_size: int = 0, +) -> tuple[pa.Table, dict[str, EnumerationMetadata[Any]]]: + index_domains = { k: v if v is not None else (None, None) for k, v in zip(index_column_names, domain) } - columns = [] - for field in schema: - name = field.name - unique = name in index_column_names or name == "soma_joinid" - elements = None - - min_value, max_value = index_domains.get(name, (None, None)) - assert name in index_domains or (min_value is None and max_value is None) - - # special case - limit even if it isn't an index - if name == "soma_joinid" and min_value is None: - min_value = 0 - max_value = 2**56 - 1 - - if pa.types.is_timestamp(field.type): - # don't generate NaT. ht_np.from_dtype doesn't obey min/max value - # params, so draw ints, and then convert. NEB-7 says NaT is -2**63. - min_value = ( - -(2**63) + 1 - if min_value is None - else max(-(2**63) + 1, int(min_value.astype(np.int64))) - ) - max_value = ( - 2**63 - 1 - if max_value is None - else min(2**63 - 1, int(max_value.astype(np.int64))) - ) - dtype = np.dtype(field.type.to_pandas_dtype()) - elements = st.builds( - dtype.type, - st.integers(min_value=min_value, max_value=max_value), - st.just(field.type.unit), - ) + is_unique = { + f.name: (f.name in index_domains or f.name == "soma_joinid") for f in schema + } - elif pa.types.is_primitive(field.type): - dtype = np.dtype(field.type.to_pandas_dtype()) - elements = ht_np.from_dtype(dtype, min_value=min_value, max_value=max_value) - # Array dimensions do not de-dup -0. and 0. as the same. 
Disable any generation - # of negative zero until this is resolved. NB: ledger de-dup treats them a equivalent - # per IEEE 754 semantics. - if HT_TEST_CONFIG["sc-61506_workaround"] and pa.types.is_floating( - field.type + # First, decide if we have any dictionary/categoricals that we want to extend + for field in schema: + field_name = field.name + if pa.types.is_dictionary(field.type): + assert field_name not in index_domains + enmr = enumeration_metadata[field_name] + assert enmr.type == field.type + + # extend enum categories if it is len == 0 or draw says to do it + if enmr.num_categories == 0 or ( + (enmr.num_categories < enmr.max_categories) and draw(st.booleans()) ): - elements = elements.filter(lambda x: not (x == 0 and np.signbit(x))) - - elif field.type in [pa.string(), pa.large_string()]: - dtype = np.dtype(str) - if name in index_column_names: - # TileDB string index columns are restricted to "ASCII", and in - # actuality to [0,128). These tests use Pandas indexing, which are - # foobared on anything containing a null. So in practice, use [1,126] - if HT_TEST_CONFIG["sc-62265_workaround"]: - elements = st.text( - alphabet=st.characters( - codec="ascii", min_codepoint=1, max_codepoint=126 + MAX_CATEGORIES = 129 + new_cat_count = enmr.num_categories + draw( + st.integers( + min_value=1, + max_value=min( + MAX_CATEGORIES, enmr.max_categories - enmr.num_categories + ), + ) + ) + assert new_cat_count <= enmr.max_categories + + # draw until we have sufficient unique values + while enmr.num_categories < new_cat_count: + new_cats = draw( + column_values( + field.type.value_type, + new_cat_count - enmr.num_categories, + is_index=False, + unique=True, + domain=(None, None), + is_dict_value=True, ) ) - else: - elements = st.text( - alphabet=st.characters(codec="ascii", min_codepoint=1) + new_unique_cats = setdiff( + set(new_cats.to_pylist()), set(enmr.categories) ) - else: - # Disallow surrogate codepoints. Arrow doesn't implement them in the - # encoder/decoder, and will throw if they are present. 
- elements = st.text(alphabet=st.characters(exclude_categories=["C"])) + enmr = enmr.extend_categories(new_unique_cats) + + enumeration_metadata[field_name] = enmr + + # Second, calculate size of table based upon uniqueness requirements + def get_max_size() -> int: + """max_size is mininimum of: + * domain range of all int/uint/ts index domains + * number of categories for any column with a unique draw + """ + max_size = 1024 # default max + for f in schema: + if not is_unique[f.name]: + continue + if f.name in index_domains: + d = index_domains[f.name] + if pa.types.is_integer(f.type): + max_size = min(max_size, d[1] - d[0] + 1) + elif pa.types.is_floating(f.type): + max_size = int( + min( + max_size, + (d[1] - d[0]) / np.finfo(f.type.to_pandas_dtype()).tiny + 1, + ) + ) + elif pa.types.is_timestamp(f.type): + delta = int(d[1].astype(np.int64)) - int(d[0].astype(np.int64)) + assert delta >= 0 + max_size = min(max_size, delta + 1) + elif pa.types.is_dictionary(f.type): + max_size = min(max_size, enumeration_metadata[f.name].num_categories) - elif field.type in [pa.binary(), pa.large_binary()]: - dtype = np.dtype(bytes) + return max_size - else: # use default - dtype = np.dtype(field.type.to_pandas_dtype()) + size = draw(st.integers(min_value=min_size, max_value=get_max_size())) - columns.append( - ht_pd.column(name=name, dtype=dtype, unique=unique, elements=elements) - ) + # Third, draw table columns + columns = {} + for field in schema: + field_name = field.name + is_index = field_name in index_domains + + if pa.types.is_dictionary(field.type): + assert not is_index + enmr = enumeration_metadata[field_name] + + dictionary = pa.array(enmr.categories, type=field.type.value_type) + indices = draw( + ht_np.arrays( + dtype=field.type.index_type.to_pandas_dtype(), + shape=(size,), + unique=is_unique[field_name], + elements=st.integers( + min_value=0, max_value=enmr.num_categories - 1 + ), + ) + ) + columns[field_name] = pa.DictionaryArray.from_arrays( + indices, dictionary, ordered=field.type.ordered + ) + else: + domain = index_domains.get(field_name, (None, None)) + if field_name == "soma_joinid" and domain == (None, None): + domain = (0, 2**56 - 1) + columns[field_name] = draw( + column_values(field.type, size, is_index, is_unique[field_name], domain) + ) - df = draw( - ht_pd.data_frames(columns=columns, index=ht_pd.range_indexes(min_size=min_size)) - ) - assert min_size is None or len(df) >= min_size - tbl = df_to_table(df, schema=schema) - assert schemas_equal(schema, tbl.schema) - if len(tbl) == 0: - return tbl + assert all(len(columns[k]) == size for k in columns) + tbl = pa.Table.from_pydict(columns, schema) + assert tbl.schema == schema # split, sometimes if ( @@ -393,7 +613,7 @@ def arrow_table( ) tbl = pa.Table.from_batches(batches) - return tbl + return tbl, enumeration_metadata class SOMADataFrameStateMachine(SOMAArrayStateMachine): @@ -401,13 +621,20 @@ class SOMADataFrameStateMachine(SOMAArrayStateMachine): def __init__(self) -> None: super().__init__() - @initialize(data=st.data(), index_cols_and_schema=dataframe_schema()) + @initialize(data=st.data(), dataframe_schema=dataframe_schema()) def setup( self, data: st.DataObject, - index_cols_and_schema: tuple[Sequence[str], pa.Schema], + dataframe_schema: tuple[ + Sequence[str], pa.Schema, dict[str, EnumerationMetadata[Any]] + ], ) -> None: - self.index_column_names, self.schema = index_cols_and_schema + # Schema in total includes: arrow schema, index column names and current enumerations for + # any dictionary columns. 
These must be evolved as a unit, as there are lots of cross- + # dependencies (e.g, some types may not be an index column). + self.index_column_names, self.schema, self.enumeration_metadata = ( + dataframe_schema + ) self.domain = data.draw( # TODO XXX: should be a ledger dataframe_domain( schema=self.schema, index_column_names=self.index_column_names @@ -549,8 +776,14 @@ def check_count(self) -> None: ) # only one write per timestamp until sc-61223 and sc-61226 are fixed @rule(data=st.data()) def write(self, data: st.DataObject) -> None: - df_tbl = data.draw( - arrow_table(self.schema, self.index_column_names, self.domain, min_size=1) + df_tbl, self.enumeration_metadata = data.draw( + arrow_table2( + self.schema, + self.index_column_names, + self.enumeration_metadata, + self.domain, + min_size=1, + ) ) fragments_before_write = get_entries(f"{self.uri}/__fragments") self.A.write(df_tbl) diff --git a/apis/python/tests/ht/test_ht_io_anndata.py b/apis/python/tests/ht/test_ht_io_anndata.py new file mode 100644 index 0000000000..e5b4878e01 --- /dev/null +++ b/apis/python/tests/ht/test_ht_io_anndata.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +import pathlib + +import anndata +import hypothesis as ht +from hypothesis import given, settings +from hypothesis import strategies as st +from hypothesis.extra import numpy as ht_np +from hypothesis.extra import pandas as ht_pd + +import tiledbsoma as soma +import tiledbsoma.io + +from tests.ht._ht_util import posix_filename + +# from_anndata(experiment_uri: 'str', anndata: 'ad.AnnData', measurement_name: 'str', *, context: 'SOMATileDBContext | None' = None, platform_config: 'PlatformConfig | None' = None, obs_id_name: 'str' = 'obs_id', var_id_name: 'str' = 'var_id', X_layer_name: 'str' = 'data', raw_X_layer_name: 'str' = 'data', ingest_mode: 'IngestMode' = 'write', use_relative_uri: 'bool | None' = None, X_kind: 'Union[Type[SparseNDArray], Type[DenseNDArray]]' = , registration_mapping: 'ExperimentAmbientLabelMapping | None' = None, uns_keys: 'Sequence[str] | None' = None, additional_metadata: 'AdditionalMetadata' = None) -> 'str' + + +@st.composite +def anndatas(draw: st.DrawFn) -> anndata.AnnData: + """ + is empty OK? + is empty obs/var OK? 
+ etc + """ + + obs = draw( + ht_pd.data_frames(columns=columns, index=ht_pd.range_indexes(min_size=min_size)) + ) + + return anndata.AnnData() + + +@settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) +@given( + data=st.data(), + adata=st.from_type(anndata.AnnData), + measurement_name=st.text(min_size=1), + X_layer_name=st.text(min_size=1), + raw_X_layer_name=st.text(min_size=1), +) +def test_roundtrip_from_anndata_to_anndata( + data: st.DataFn, + adata: anndata.AnnData, + measurement_name: str, + X_layer_name: str, + raw_X_layer_name: str, + tmp_path_factory, # fixure +) -> None: + + experiment_uri = tmp_path_factory.mktemp("anndata-").as_posix() + + print("from_anndata(") + print(adata) + print(experiment_uri) + print(measurement_name) + print(X_layer_name) + print(raw_X_layer_name) + print(")") From b3cdcf8d3d105233d5b2e2270bb0298f2391051c Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Sat, 25 Jan 2025 21:19:12 -0800 Subject: [PATCH 12/20] disable io --- apis/python/tests/ht/test_ht_dataframe.py | 1 - apis/python/tests/ht/test_ht_io_anndata.py | 112 ++++++++++----------- 2 files changed, 52 insertions(+), 61 deletions(-) diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index 1132c86d9d..ffee6dbad3 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -11,7 +11,6 @@ import pytest from hypothesis import strategies as st from hypothesis.extra import numpy as ht_np -from hypothesis.extra import pandas as ht_pd from hypothesis.stateful import initialize, invariant, precondition, rule from more_itertools import pairwise from packaging.version import Version diff --git a/apis/python/tests/ht/test_ht_io_anndata.py b/apis/python/tests/ht/test_ht_io_anndata.py index e5b4878e01..3ed3fec5f3 100644 --- a/apis/python/tests/ht/test_ht_io_anndata.py +++ b/apis/python/tests/ht/test_ht_io_anndata.py @@ -1,60 +1,52 @@ -from __future__ import annotations - -import pathlib - -import anndata -import hypothesis as ht -from hypothesis import given, settings -from hypothesis import strategies as st -from hypothesis.extra import numpy as ht_np -from hypothesis.extra import pandas as ht_pd - -import tiledbsoma as soma -import tiledbsoma.io - -from tests.ht._ht_util import posix_filename - -# from_anndata(experiment_uri: 'str', anndata: 'ad.AnnData', measurement_name: 'str', *, context: 'SOMATileDBContext | None' = None, platform_config: 'PlatformConfig | None' = None, obs_id_name: 'str' = 'obs_id', var_id_name: 'str' = 'var_id', X_layer_name: 'str' = 'data', raw_X_layer_name: 'str' = 'data', ingest_mode: 'IngestMode' = 'write', use_relative_uri: 'bool | None' = None, X_kind: 'Union[Type[SparseNDArray], Type[DenseNDArray]]' = , registration_mapping: 'ExperimentAmbientLabelMapping | None' = None, uns_keys: 'Sequence[str] | None' = None, additional_metadata: 'AdditionalMetadata' = None) -> 'str' - - -@st.composite -def anndatas(draw: st.DrawFn) -> anndata.AnnData: - """ - is empty OK? - is empty obs/var OK? 
- etc - """ - - obs = draw( - ht_pd.data_frames(columns=columns, index=ht_pd.range_indexes(min_size=min_size)) - ) - - return anndata.AnnData() - - -@settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) -@given( - data=st.data(), - adata=st.from_type(anndata.AnnData), - measurement_name=st.text(min_size=1), - X_layer_name=st.text(min_size=1), - raw_X_layer_name=st.text(min_size=1), -) -def test_roundtrip_from_anndata_to_anndata( - data: st.DataFn, - adata: anndata.AnnData, - measurement_name: str, - X_layer_name: str, - raw_X_layer_name: str, - tmp_path_factory, # fixure -) -> None: - - experiment_uri = tmp_path_factory.mktemp("anndata-").as_posix() - - print("from_anndata(") - print(adata) - print(experiment_uri) - print(measurement_name) - print(X_layer_name) - print(raw_X_layer_name) - print(")") +# from __future__ import annotations + +# import anndata +# import hypothesis as ht +# from hypothesis import given, settings +# from hypothesis import strategies as st +# from hypothesis.extra import pandas as ht_pd + +# # from_anndata(experiment_uri: 'str', anndata: 'ad.AnnData', measurement_name: 'str', *, context: 'SOMATileDBContext | None' = None, platform_config: 'PlatformConfig | None' = None, obs_id_name: 'str' = 'obs_id', var_id_name: 'str' = 'var_id', X_layer_name: 'str' = 'data', raw_X_layer_name: 'str' = 'data', ingest_mode: 'IngestMode' = 'write', use_relative_uri: 'bool | None' = None, X_kind: 'Union[Type[SparseNDArray], Type[DenseNDArray]]' = , registration_mapping: 'ExperimentAmbientLabelMapping | None' = None, uns_keys: 'Sequence[str] | None' = None, additional_metadata: 'AdditionalMetadata' = None) -> 'str' + + +# @st.composite +# def anndatas(draw: st.DrawFn) -> anndata.AnnData: +# """ +# is empty OK? +# is empty obs/var OK? 
+# etc +# """ + +# obs = draw( +# ht_pd.data_frames(columns=columns, index=ht_pd.range_indexes(min_size=min_size)) +# ) + +# return anndata.AnnData() + + +# @settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) +# @given( +# data=st.data(), +# adata=st.from_type(anndata.AnnData), +# measurement_name=st.text(min_size=1), +# X_layer_name=st.text(min_size=1), +# raw_X_layer_name=st.text(min_size=1), +# ) +# def test_roundtrip_from_anndata_to_anndata( +# data: st.DataFn, +# adata: anndata.AnnData, +# measurement_name: str, +# X_layer_name: str, +# raw_X_layer_name: str, +# tmp_path_factory, # fixure +# ) -> None: + +# experiment_uri = tmp_path_factory.mktemp("anndata-").as_posix() + +# print("from_anndata(") +# print(adata) +# print(experiment_uri) +# print(measurement_name) +# print(X_layer_name) +# print(raw_X_layer_name) +# print(")") From 006140a37313d63b92a6693e808344205681d82f Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Sun, 26 Jan 2025 14:32:32 -0800 Subject: [PATCH 13/20] fix padding required for arrow offset testing --- apis/python/tests/ht/_ht_test_config.py | 2 +- apis/python/tests/ht/_ht_util.py | 46 ++++++++++++++++++----- apis/python/tests/ht/test_ht_dataframe.py | 5 +-- 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/apis/python/tests/ht/_ht_test_config.py b/apis/python/tests/ht/_ht_test_config.py index b489b06ae9..4364a34ee7 100644 --- a/apis/python/tests/ht/_ht_test_config.py +++ b/apis/python/tests/ht/_ht_test_config.py @@ -7,7 +7,7 @@ # # data corruption due to incorrect Arrow array offset handling # See also sc-62104 - "sc-61239_workaround": True, + "sc-61239_workaround": False, # creating array with timestamp==0 fails in 1.15 (regression) "sc-61054_workaround": True, # Tables returned by SparseNDArray.read have incorrect nullability in schema fields diff --git a/apis/python/tests/ht/_ht_util.py b/apis/python/tests/ht/_ht_util.py index c0efa83e65..828a65336e 100644 --- a/apis/python/tests/ht/_ht_util.py +++ b/apis/python/tests/ht/_ht_util.py @@ -342,25 +342,51 @@ def arrow_slice(draw: st.DrawFn, size: int) -> ArrowSlice: return (offset, length) -def pad_array(arr: npt.NDArray[Any], draw: st.DrawFn) -> pa.Array: - """Strategy helper: add padding to one or both ends of the array. This tests for Arrow array "offset" handling.""" +def pad_array(arr: pa.Array | npt.NDArray[Any], draw: st.DrawFn) -> pa.Array: + """Strategy helper: add padding to one or both ends of the array. 
This tests for Arrow array + offset & length handling.""" if HT_TEST_CONFIG.get("sc-61239_workaround", False): return pa.array(arr) + if not isinstance(arr, pa.Array): + arr = pa.array(arr) + head = draw(st.integers(min_value=0, max_value=16)) tail = draw(st.integers(min_value=0, max_value=16)) if not bool(head or tail): - return pa.array(arr) + return arr + + if pa.types.is_dictionary(arr.type): + padding = draw(st.integers(min_value=0, max_value=len(arr.dictionary) - 1)) + head_arr = pa.DictionaryArray.from_arrays( + indices=pa.array([padding] * head, type=arr.type.index_type), + dictionary=arr.dictionary, + ordered=arr.type.ordered, + ) + tail_arr = pa.DictionaryArray.from_arrays( + indices=pa.array([padding] * tail, type=arr.type.index_type), + dictionary=arr.dictionary, + ordered=arr.type.ordered, + ) + + else: + if pa.types.is_large_string(arr.type) or pa.types.is_string(arr.type): + pad_type = str + elif pa.types.is_large_binary(arr.type) or pa.types.is_binary(arr.type): + pad_type = bytes + elif pa.types.is_timestamp(arr.type): + pad_type = np.int64 + else: + pad_type = np.dtype(arr.type.to_pandas_dtype()).type - padding = draw(st.from_type(arr.dtype.type)) + padding = draw(st.from_type(pad_type)) + head_arr = pa.array([padding] * head).cast(arr.type) + tail_arr = pa.array([padding] * tail).cast(arr.type) - shape = (arr.shape[0] + head + tail, *arr.shape[1:]) - padded_arr = np.empty_like(arr, shape=shape) - padded_arr[0:head] = padding - padded_arr[head : head + len(arr)] = arr - padded_arr[head + len(arr) :] = padding - return pa.array(padded_arr)[head : head + len(arr)] + assert arr.type == head_arr.type == tail_arr.type + padded_arr = pa.chunked_array([head_arr, arr, tail_arr]).combine_chunks() + return padded_arr.slice(head, len(arr)) @st.composite diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index ffee6dbad3..288c3fa1a3 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -603,10 +603,7 @@ def get_max_size() -> int: if draw(st.booleans()) and not HT_TEST_CONFIG["sc-61239_workaround"]: batches = tbl.to_batches() batch_to_pad = draw(st.integers(min_value=0, max_value=len(batches) - 1)) - batch_arrays = [ - pad_array(arr.to_numpy(zero_copy_only=(arr.type != pa.bool_())), draw) - for arr in batches[batch_to_pad].columns - ] + batch_arrays = [pad_array(arr, draw) for arr in batches[batch_to_pad].columns] batches[batch_to_pad] = pa.RecordBatch.from_arrays( batch_arrays, schema=tbl.schema ) From 143737f108a58fcb3a4491e89d25672dbd92e4bc Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Mon, 27 Jan 2025 16:45:25 -0800 Subject: [PATCH 14/20] remove obsolete comment --- apis/python/tests/ht/test_ht_densendarray.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/apis/python/tests/ht/test_ht_densendarray.py b/apis/python/tests/ht/test_ht_densendarray.py index b4e7c52416..8dca2eefc6 100644 --- a/apis/python/tests/ht/test_ht_densendarray.py +++ b/apis/python/tests/ht/test_ht_densendarray.py @@ -209,8 +209,6 @@ def expand_shape(self, data: st.DataObject) -> None: ## --- data ## @precondition(lambda self: not self.closed and self.mode == "r") - # sc-61920 -- while the API accepts `auto` as a result_order, the read result - # is then nondeterministic. 
For now, don't do `auto` @rule(result_order=st.sampled_from(["row-major", "column-major"])) def check_read_all(self, result_order: str) -> None: tensor = self.A.read(result_order=result_order) From 67634d4950ed5eb400e05ccf2006c1f96a1fe127 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Tue, 28 Jan 2025 15:38:46 -0800 Subject: [PATCH 15/20] fix overflow warning --- apis/python/tests/ht/test_ht_dataframe.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index 288c3fa1a3..54fcbde04f 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -534,12 +534,13 @@ def get_max_size() -> int: if pa.types.is_integer(f.type): max_size = min(max_size, d[1] - d[0] + 1) elif pa.types.is_floating(f.type): - max_size = int( - min( - max_size, - (d[1] - d[0]) / np.finfo(f.type.to_pandas_dtype()).tiny + 1, + with np.errstate(over="ignore"): + max_size = int( + min( + max_size, + (d[1] - d[0]) / np.finfo(f.type.to_pandas_dtype()).tiny + 1, + ) ) - ) elif pa.types.is_timestamp(f.type): delta = int(d[1].astype(np.int64)) - int(d[0].astype(np.int64)) assert delta >= 0 From fa8303f77b95d1e0a890d718d6f414fefd8436f5 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 31 Jan 2025 17:16:07 -0800 Subject: [PATCH 16/20] update readme --- apis/python/tests/ht/README.md | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/apis/python/tests/ht/README.md b/apis/python/tests/ht/README.md index def3e892f0..01906d2002 100644 --- a/apis/python/tests/ht/README.md +++ b/apis/python/tests/ht/README.md @@ -1,20 +1,40 @@ # SOMA Hypothesis-based tests This folder contains Hypothesis-based tests and supporting code. All will run within the standard pytest -framework and will run in the course of normal pytest execution. +framework and will run in the course of normal pytest execution (including during CI). -## Configuration +## Recreating test failures + +Property-based tests generate unique test invocations each time they are run. As such, test failures in CI +may not immediately reproduce if you re-run a test. The test log files should contain enough +information to reproduce the error, including the random number seed used for test selection, and the +so-called "blob" that allows Hypothesis to recreate a particular test. + +The [Hypothesis documentation](https://hypothesis.readthedocs.io/) has information on reproducing +test failures. + +## Exhaustive testing -The default configuration is suitable for use in CI, i.e., run fairly quickly. Please do not +The more test cases Hypothesis generates, the more likely it is to find a bug. This is at odds with +the need for our CI pipeline to complete in a "reasonable" amount of time. + +The default configuration is suitable for use in CI, i.e., all tests will complete fairly quickly. Please do not change this behavior. In the course of development, it is often useful to more exhaustively search for test cases. A Hypothesis profile has been defined for this case called `expensive`. You can run the tests in this mode: -> pytest tests/ --hypothesis-profile=expensive +> pytest apis/python/tests/ --hypothesis-profile=expensive + +In this mode, tests will run significantly longer (very roughly, 100X longer than the default) and cover +many more test conditions. 
Because each invocation of the test starts with a unique random seed, you +can repeat this invocation until you are satisfied with your test coverage. + +## Configuration -In this mode, tests will run signicantly longer (very roughly, 100X longer than the default). +The `_ht_test_config.py` file is used to configure the tests. The most common use case is a config flag +which enables a defect work-around, while the issue is being resolved. ## For More Information From f3098e8fb9f32dd4707175178399da66b18f383e Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 31 Jan 2025 17:16:23 -0800 Subject: [PATCH 17/20] lint --- apis/python/tests/ht/test_ht_dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index 54fcbde04f..30429eb202 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -538,7 +538,8 @@ def get_max_size() -> int: max_size = int( min( max_size, - (d[1] - d[0]) / np.finfo(f.type.to_pandas_dtype()).tiny + 1, + (d[1] - d[0]) / np.finfo(f.type.to_pandas_dtype()).tiny + + 1, ) ) elif pa.types.is_timestamp(f.type): From 1d86e4c95cda5a83a58cf20352bfadb3fdfb392b Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 31 Jan 2025 17:18:01 -0800 Subject: [PATCH 18/20] remove sc-61239 work-around --- apis/python/tests/ht/_ht_test_config.py | 3 --- apis/python/tests/ht/_ht_util.py | 3 --- apis/python/tests/ht/test_ht_dataframe.py | 8 ++------ 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/apis/python/tests/ht/_ht_test_config.py b/apis/python/tests/ht/_ht_test_config.py index 4364a34ee7..9aacac5903 100644 --- a/apis/python/tests/ht/_ht_test_config.py +++ b/apis/python/tests/ht/_ht_test_config.py @@ -5,9 +5,6 @@ # # Defect work-arounds, while awaiting a fix # - # data corruption due to incorrect Arrow array offset handling - # See also sc-62104 - "sc-61239_workaround": False, # creating array with timestamp==0 fails in 1.15 (regression) "sc-61054_workaround": True, # Tables returned by SparseNDArray.read have incorrect nullability in schema fields diff --git a/apis/python/tests/ht/_ht_util.py b/apis/python/tests/ht/_ht_util.py index 828a65336e..ba6a157f52 100644 --- a/apis/python/tests/ht/_ht_util.py +++ b/apis/python/tests/ht/_ht_util.py @@ -346,9 +346,6 @@ def pad_array(arr: pa.Array | npt.NDArray[Any], draw: st.DrawFn) -> pa.Array: """Strategy helper: add padding to one or both ends of the array. 
This tests for Arrow array offset & length handling.""" - if HT_TEST_CONFIG.get("sc-61239_workaround", False): - return pa.array(arr) - if not isinstance(arr, pa.Array): arr = pa.array(arr) diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index 30429eb202..7ea904ee05 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -590,11 +590,7 @@ def get_max_size() -> int: assert tbl.schema == schema # split, sometimes - if ( - len(tbl) > 3 - and draw(st.booleans()) - and not HT_TEST_CONFIG["sc-61239_workaround"] - ): + if len(tbl) > 3 and draw(st.booleans()): n_splits = draw(st.integers(min_value=0, max_value=max(0, len(tbl) // 10))) if n_splits > 0: split_points = draw(splitss(n_splits=n_splits, max_value=len(tbl))) @@ -602,7 +598,7 @@ def get_max_size() -> int: tbl = pa.concat_tables([tbl[st:sp] for st, sp in pairwise(split_points)]) # pad, sometimes - if draw(st.booleans()) and not HT_TEST_CONFIG["sc-61239_workaround"]: + if draw(st.booleans()): batches = tbl.to_batches() batch_to_pad = draw(st.integers(min_value=0, max_value=len(batches) - 1)) batch_arrays = [pad_array(arr, draw) for arr in batches[batch_to_pad].columns] From 5ad9e59e9d47ee7922393802869aaaad5345d878 Mon Sep 17 00:00:00 2001 From: bkmartinjr Date: Fri, 31 Jan 2025 18:00:15 -0800 Subject: [PATCH 19/20] remove incomplete test stub --- apis/python/tests/ht/test_ht_io_anndata.py | 52 ---------------------- 1 file changed, 52 deletions(-) delete mode 100644 apis/python/tests/ht/test_ht_io_anndata.py diff --git a/apis/python/tests/ht/test_ht_io_anndata.py b/apis/python/tests/ht/test_ht_io_anndata.py deleted file mode 100644 index 3ed3fec5f3..0000000000 --- a/apis/python/tests/ht/test_ht_io_anndata.py +++ /dev/null @@ -1,52 +0,0 @@ -# from __future__ import annotations - -# import anndata -# import hypothesis as ht -# from hypothesis import given, settings -# from hypothesis import strategies as st -# from hypothesis.extra import pandas as ht_pd - -# # from_anndata(experiment_uri: 'str', anndata: 'ad.AnnData', measurement_name: 'str', *, context: 'SOMATileDBContext | None' = None, platform_config: 'PlatformConfig | None' = None, obs_id_name: 'str' = 'obs_id', var_id_name: 'str' = 'var_id', X_layer_name: 'str' = 'data', raw_X_layer_name: 'str' = 'data', ingest_mode: 'IngestMode' = 'write', use_relative_uri: 'bool | None' = None, X_kind: 'Union[Type[SparseNDArray], Type[DenseNDArray]]' = , registration_mapping: 'ExperimentAmbientLabelMapping | None' = None, uns_keys: 'Sequence[str] | None' = None, additional_metadata: 'AdditionalMetadata' = None) -> 'str' - - -# @st.composite -# def anndatas(draw: st.DrawFn) -> anndata.AnnData: -# """ -# is empty OK? -# is empty obs/var OK? 
-# etc -# """ - -# obs = draw( -# ht_pd.data_frames(columns=columns, index=ht_pd.range_indexes(min_size=min_size)) -# ) - -# return anndata.AnnData() - - -# @settings(suppress_health_check=(ht.HealthCheck.function_scoped_fixture,)) -# @given( -# data=st.data(), -# adata=st.from_type(anndata.AnnData), -# measurement_name=st.text(min_size=1), -# X_layer_name=st.text(min_size=1), -# raw_X_layer_name=st.text(min_size=1), -# ) -# def test_roundtrip_from_anndata_to_anndata( -# data: st.DataFn, -# adata: anndata.AnnData, -# measurement_name: str, -# X_layer_name: str, -# raw_X_layer_name: str, -# tmp_path_factory, # fixure -# ) -> None: - -# experiment_uri = tmp_path_factory.mktemp("anndata-").as_posix() - -# print("from_anndata(") -# print(adata) -# print(experiment_uri) -# print(measurement_name) -# print(X_layer_name) -# print(raw_X_layer_name) -# print(")") From 6a8c7466c244a541745a10f6983a1b62b504b12d Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Tue, 4 Feb 2025 11:37:42 -0500 Subject: [PATCH 20/20] #3560 review (#3663) * factor README link * use `mode: OpenMode` instead of `mode: str` * add missing `self` params * OpenMode * rm unused `SOMAArrayStateMachine._reopen` * `s/work-arounds/workarounds/` * add/tweak type annotations * `get_entries`: return `set[str]` * parameterize `Ledger` with a `LedgerEntryType` this allows the type system to understand that e.g. the return value of a `.read()` can be a `PyDictLedgerEntry`, which can then have `to_dict()` invoked * rm unused `concurrency` fixture * rm unused imports * avoid `st` shadowing --- apis/python/tests/ht/README.md | 10 ++++++---- apis/python/tests/ht/_array_state_machine.py | 17 +++++++---------- apis/python/tests/ht/_ht_test_config.py | 4 ++-- apis/python/tests/ht/_ht_util.py | 8 ++++---- apis/python/tests/ht/_ledger.py | 15 +++++++++------ apis/python/tests/ht/conftest.py | 14 ++------------ apis/python/tests/ht/test_ht_dataframe.py | 14 +++++++++----- apis/python/tests/ht/test_ht_densendarray.py | 14 +++++++------- apis/python/tests/ht/test_ht_sparsendarray.py | 17 +++++++++-------- 9 files changed, 55 insertions(+), 58 deletions(-) diff --git a/apis/python/tests/ht/README.md b/apis/python/tests/ht/README.md index 01906d2002..189a260676 100644 --- a/apis/python/tests/ht/README.md +++ b/apis/python/tests/ht/README.md @@ -10,8 +10,7 @@ may not immediately reproduce if you re-run a test. The test log files should co information to reproduce the error, including the random number seed used for test selection, and the so-called "blob" that allows Hypothesis to recreate a particular test. -The [Hypothesis documentation](https://hypothesis.readthedocs.io/) has information on reproducing -test failures. +The [Hypothesis documentation] has information on reproducing test failures. ## Exhaustive testing @@ -34,8 +33,11 @@ can repeat this invocation until you are satisfied with your test coverage. ## Configuration The `_ht_test_config.py` file is used to configure the tests. The most common use case is a config flag -which enables a defect work-around, while the issue is being resolved. +which enables a defect workaround, while the issue is being resolved. ## For More Information -See the [Hypothesis documentation](https://hypothesis.readthedocs.io/) +See the [Hypothesis documentation]. 
+ + +[Hypothesis documentation]: https://hypothesis.readthedocs.io/ diff --git a/apis/python/tests/ht/_array_state_machine.py b/apis/python/tests/ht/_array_state_machine.py index 5340309612..4e636f8bcc 100644 --- a/apis/python/tests/ht/_array_state_machine.py +++ b/apis/python/tests/ht/_array_state_machine.py @@ -14,6 +14,7 @@ import pyarrow as pa from hypothesis import strategies as st from hypothesis.stateful import RuleBasedStateMachine, invariant, precondition, rule +from somacore.options import OpenMode from typing_extensions import TypeAlias import tiledbsoma as soma @@ -68,15 +69,15 @@ def is_initialized(self) -> bool: @abstractmethod def _array_exists( - uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None + self, uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None ) -> bool: pass @abstractmethod - def _array_open(self, mode: str) -> None: + def _array_open(self, mode: OpenMode, tiledb_timestamp: int | None = None) -> None: pass - def _open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None: + def _open(self, *, mode: OpenMode, tiledb_timestamp: int | None = None) -> None: assert self.A.closed tiledb_timestamp = None # TODO/XXX: no time-travel for now. FIXME self._array_open(mode=mode, tiledb_timestamp=tiledb_timestamp) @@ -96,10 +97,6 @@ def _close(self) -> None: self.closed = True self.mode = None - @abstractmethod - def _reopen(self, mode: str) -> None: - pass - ## ## ---- Open/close state ## @@ -125,7 +122,7 @@ def close(self) -> None: @precondition(lambda self: self.closed) @rule(mode=st.sampled_from(["r", "w"])) - def open(self, mode: str) -> None: + def open(self, mode: OpenMode) -> None: # TODO: time travel self._open(mode=mode) @@ -137,7 +134,7 @@ def open(self, mode: str) -> None: lambda self: not HT_TEST_CONFIG["sc-61118_workaround"] or self.mode != "w" ) # TODO - fails due to loss of metadata on reopen from w->r. See sc-61118. Remove when fixed. @rule(mode=st.sampled_from(["r", "w"])) - def reopen(self, mode: str) -> None: + def reopen(self, mode: OpenMode) -> None: assert not self.A.closed assert not self.closed assert self.mode is not None @@ -238,7 +235,7 @@ def __init__(self, shapes_factory: ShapesFactory) -> None: super().__init__() self.shapes_factory = shapes_factory - def setup(self, type, shape, array) -> None: + def setup(self, type: pa.DataType, shape: tuple[int, ...], array) -> None: super().setup(array) self.type = type self.schema = pa.schema( diff --git a/apis/python/tests/ht/_ht_test_config.py b/apis/python/tests/ht/_ht_test_config.py index 9aacac5903..ba06c0fa29 100644 --- a/apis/python/tests/ht/_ht_test_config.py +++ b/apis/python/tests/ht/_ht_test_config.py @@ -1,9 +1,9 @@ -"""Config settings for all Hypothesis tests. Primarily used to toggle bug work-arounds, etc. +"""Config settings for all Hypothesis tests. Primarily used to toggle bug workarounds, etc. 
""" HT_TEST_CONFIG = { # - # Defect work-arounds, while awaiting a fix + # Defect workarounds, while awaiting fixes # # creating array with timestamp==0 fails in 1.15 (regression) "sc-61054_workaround": True, diff --git a/apis/python/tests/ht/_ht_util.py b/apis/python/tests/ht/_ht_util.py index ba6a157f52..91f8fdc8d3 100644 --- a/apis/python/tests/ht/_ht_util.py +++ b/apis/python/tests/ht/_ht_util.py @@ -216,7 +216,7 @@ def arrow_datatypes(draw: st.DrawFn) -> pa.DataType: ) -def ndarray_datatype() -> st.SearchStrategy: +def ndarray_datatype() -> st.SearchStrategy[pa.DataType]: """Return a type that can be stored in a SOMA NDArray.""" return st.from_type(pa.DataType).filter( lambda t: ( @@ -229,7 +229,7 @@ def ndarray_datatype() -> st.SearchStrategy: ) -def dataframe_datatype() -> st.SearchStrategy: +def dataframe_datatype() -> st.SearchStrategy[pa.DataType]: """Return type that can be stored in a DataFrame column.""" def is_dataframe_value_type(dt: pa.DataType) -> bool: @@ -654,8 +654,8 @@ def schemas_equal( s1: pa.Schema, s2: pa.Schema, *, - ignore_field_order=False, - large_type_equivalence=False, + ignore_field_order: bool = False, + large_type_equivalence: bool = False, ) -> bool: """NB: assumes all field names are unique! Raises if not. diff --git a/apis/python/tests/ht/_ledger.py b/apis/python/tests/ht/_ledger.py index ded0f4b4a9..fbd6419dae 100644 --- a/apis/python/tests/ht/_ledger.py +++ b/apis/python/tests/ht/_ledger.py @@ -24,7 +24,7 @@ def get_entries(path: str | pathlib.Path) -> set[str]: children = [p.relative_to(dir).as_posix() for p in dir.iterdir()] entries = [c for c in children if re.match(r"__[0-9]+_[0-9]+_[0-9a-fA-F]+", c)] entries.sort() - return entries + return set(entries) LedgerEntryDataType = TypeVar("LedgerEntryDataType") @@ -45,14 +45,17 @@ def consolidate_with( pass -class Ledger(Generic[LedgerEntryDataType]): +LedgerEntryType = TypeVar("LedgerEntryType", bound="LedgerEntry") + + +class Ledger(Generic[LedgerEntryType]): def __init__( self, - initial_entry: LedgerEntry[LedgerEntryDataType], + initial_entry: LedgerEntryType, *, allows_duplicates: bool = False, ) -> None: - self.entries: list[LedgerEntry[LedgerEntryDataType]] = [initial_entry] + self.entries: list[LedgerEntryType] = [initial_entry] self.initial_entry = ( initial_entry # XXX: do we need this or can we use entries[0]? 
) @@ -72,7 +75,7 @@ def __repr__(self) -> str: + "\n" ) - def read(self, timestamp_ms: int) -> LedgerEntry[LedgerEntryDataType]: + def read(self, timestamp_ms: int) -> LedgerEntryType: """Return a single ledger entry representing all writes <= timestamp""" assert len(self.entries) > 0 entries_to_consolidate = sorted( @@ -86,7 +89,7 @@ def read(self, timestamp_ms: int) -> LedgerEntry[LedgerEntryDataType]: ) return consolidated_result - def write(self, entry: LedgerEntry[LedgerEntryDataType]) -> None: + def write(self, entry: LedgerEntryType) -> None: """Write new entry to the ledger.""" assert entry.timestamp_ms >= 0 assert type(entry) is type(self.initial_entry) diff --git a/apis/python/tests/ht/conftest.py b/apis/python/tests/ht/conftest.py index d8ddfdf8cb..742b2d8bd7 100644 --- a/apis/python/tests/ht/conftest.py +++ b/apis/python/tests/ht/conftest.py @@ -31,18 +31,8 @@ def ht_test_config() -> dict[str, Any]: @pytest.fixture -def concurrency() -> int | None: - return None - - -@pytest.fixture -def context(concurrency: int | None) -> soma.SOMATileDBContext: - if concurrency is None: - return soma.SOMATileDBContext() - else: - return soma.SOMATileDBContext( - tiledb_config={"soma.compute_concurrency_level": f"{concurrency}"} - ) +def context() -> soma.SOMATileDBContext: + return soma.SOMATileDBContext() # Register Hypothesis strategies for use with `strategies.from_type()` diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index 7ea904ee05..f4e1260033 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -14,6 +14,7 @@ from hypothesis.stateful import initialize, invariant, precondition, rule from more_itertools import pairwise from packaging.version import Version +from somacore.options import OpenMode import tiledbsoma as soma @@ -23,7 +24,6 @@ arrow_array, arrow_schema, dataframe_datatype, - # df_to_table, from_datatype, pad_array, schemas_equal, @@ -72,7 +72,7 @@ class EnumerationMetadata(Generic[T]): type: pa.DictionaryType max_categories: int = attrs.field(init=False) - categories: tuple[T] = attrs.field(factory=tuple) + categories: tuple[T, ...] = attrs.field(factory=tuple) def __attrs_post_init__(self): # we are frozen, so use __setattr__ to bypass. 
@@ -595,7 +595,9 @@ def get_max_size() -> int: if n_splits > 0: split_points = draw(splitss(n_splits=n_splits, max_value=len(tbl))) split_points = [0] + split_points + [len(tbl)] - tbl = pa.concat_tables([tbl[st:sp] for st, sp in pairwise(split_points)]) + tbl = pa.concat_tables( + [tbl[start:end] for start, end in pairwise(split_points)] + ) # pad, sometimes if draw(st.booleans()): @@ -658,13 +660,15 @@ def setup( ) def _array_exists( - uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None + self, uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None ) -> bool: return soma.DataFrame.exists( uri, context=context, tiledb_timestamp=tiledb_timestamp ) - def _array_open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None: + def _array_open( + self, *, mode: OpenMode, tiledb_timestamp: int | None = None + ) -> None: self.A = soma.DataFrame.open( self.uri, mode=mode, context=self.context, tiledb_timestamp=tiledb_timestamp ) diff --git a/apis/python/tests/ht/test_ht_densendarray.py b/apis/python/tests/ht/test_ht_densendarray.py index 8dca2eefc6..c67870e71e 100644 --- a/apis/python/tests/ht/test_ht_densendarray.py +++ b/apis/python/tests/ht/test_ht_densendarray.py @@ -4,7 +4,6 @@ from typing import Any -import hypothesis as ht import hypothesis.extra.numpy as ht_np import numpy as np import pyarrow as pa @@ -16,10 +15,9 @@ precondition, rule, ) +from somacore.options import OpenMode -import tiledbsoma import tiledbsoma as soma -import tiledbsoma._sparse_nd_array from tests.ht._array_state_machine import SOMANDArrayStateMachine from tests.ht._ht_test_config import HT_TEST_CONFIG @@ -71,7 +69,7 @@ def dense_array_shape( @st.composite -def dense_indices(draw: st.DrawFn, shape: tuple[int, ...]) -> tuple[int | slice]: +def dense_indices(draw: st.DrawFn, shape: tuple[int, ...]) -> tuple[int | slice, ...]: """Strategy to return DenseNDArray slicing, which currently allows: * None - synonym for slice(None) * slice - with step == 1 ONLY @@ -118,7 +116,7 @@ def fill_value_for_type(type: pa.DataType) -> Any: return DEFAULT_FILL_VALUE[type] -def densendarray_datatype() -> ht.SearchStrategy[pa.DataType]: +def densendarray_datatype() -> st.SearchStrategy[pa.DataType]: # Arrow Tensor doesn't support bool_ or timestamp, and that is the only # read accessor we have. So for now, don't test those types. 
if HT_TEST_CONFIG["sc-61743_workaround"]: @@ -177,13 +175,15 @@ def setup(self, type, shape) -> None: ) def _array_exists( - uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None + self, uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None ) -> bool: return soma.DenseNDArray.exists( uri, context=context, tiledb_timestamp=tiledb_timestamp ) - def _array_open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None: + def _array_open( + self, *, mode: OpenMode, tiledb_timestamp: int | None = None + ) -> None: self.A = soma.DenseNDArray.open( self.uri, mode=mode, context=self.context, tiledb_timestamp=tiledb_timestamp ) diff --git a/apis/python/tests/ht/test_ht_sparsendarray.py b/apis/python/tests/ht/test_ht_sparsendarray.py index 738b5b9051..7e7ea7c05a 100644 --- a/apis/python/tests/ht/test_ht_sparsendarray.py +++ b/apis/python/tests/ht/test_ht_sparsendarray.py @@ -4,8 +4,7 @@ import datetime import shutil -import typing -from typing import Any, Union +from typing import Any, Dict, Mapping, Optional, Sequence, Tuple, Union import hypothesis as ht import numpy as np @@ -19,8 +18,8 @@ precondition, rule, ) +from somacore.options import OpenMode -import tiledbsoma import tiledbsoma as soma import tiledbsoma._sparse_nd_array @@ -162,8 +161,8 @@ def test_fuzz_SparseNDArray_create( tmp_path, uri: str, type: pa.DataType, - shape: typing.Sequence[typing.Optional[int]], - platform_config: typing.Dict[str, typing.Mapping[str, Any]] | object | None, + shape: Sequence[Optional[int]], + platform_config: Dict[str, Mapping[str, Any]] | object | None, context: tiledbsoma.SOMATileDBContext | None, tiledb_timestamp: int | datetime.datetime | None, ) -> None: @@ -196,7 +195,7 @@ def __init__(self) -> None: super().__init__(shapes_factory=sparse_array_shape) @initialize(type=ndarray_datatype(), shape=sparse_array_shape(allow_none=True)) - def setup(self, type, shape) -> None: + def setup(self, type: pa.DataType, shape: Tuple[int | None, ...]) -> None: super().setup( type, shape, @@ -219,13 +218,15 @@ def setup(self, type, shape) -> None: ) def _array_exists( - uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None + self, uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None ) -> bool: return soma.SparseNDArray.exists( uri, context=context, tiledb_timestamp=tiledb_timestamp ) - def _array_open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None: + def _array_open( + self, *, mode: OpenMode, tiledb_timestamp: int | None = None + ) -> None: self.A = soma.SparseNDArray.open( self.uri, mode=mode, context=self.context, tiledb_timestamp=tiledb_timestamp )