[python] Add hypothesis tests #3560

Open
bkmartinjr wants to merge 33 commits into main from bkm/hypothesis
Changes from 31 commits

Commits (33)
b28686a
hypothesis draft
bkmartinjr Jan 14, 2025
e767ab2
backport to python 3.9 and pandas<2.0
bkmartinjr Jan 15, 2025
e692285
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 15, 2025
2401bce
remove metadata work-arounds
bkmartinjr Jan 15, 2025
a043fe1
add metadata time travel ledger
bkmartinjr Jan 16, 2025
b7e5014
add readme
bkmartinjr Jan 16, 2025
3f69d14
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 16, 2025
9e708c7
fix numeric overflow in fastercsx test
bkmartinjr Jan 16, 2025
a8e36b7
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 16, 2025
4ee1235
remove sensitivity to another numeric precision corner case
bkmartinjr Jan 17, 2025
9bcdce9
increase scope of fastercsx coords tested
bkmartinjr Jan 17, 2025
9a151e8
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 17, 2025
b25d4ec
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 21, 2025
a336bfa
add string/binary columns to dataframe tests
bkmartinjr Jan 23, 2025
72f9014
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 23, 2025
d94a07e
lint
bkmartinjr Jan 23, 2025
3c70e43
add enum/dict
bkmartinjr Jan 26, 2025
e5f779f
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 26, 2025
b3cdcf8
disable io
bkmartinjr Jan 26, 2025
006140a
fix padding required for arrow offset testing
bkmartinjr Jan 26, 2025
143737f
remove obsolete comment
bkmartinjr Jan 28, 2025
45396be
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 28, 2025
67634d4
fix overflow warning
bkmartinjr Jan 28, 2025
3cd7292
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 28, 2025
7ec9e81
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 31, 2025
f792448
Merge branch 'main' into bkm/hypothesis
bkmartinjr Jan 31, 2025
fa8303f
update readme
bkmartinjr Feb 1, 2025
f3098e8
lint
bkmartinjr Feb 1, 2025
1d86e4c
remove sc-61239 work-around
bkmartinjr Feb 1, 2025
5ad9e59
remove incomplete test stub
bkmartinjr Feb 1, 2025
89bc61c
Merge branch 'main' into bkm/hypothesis
bkmartinjr Feb 3, 2025
6a8c746
#3560 review (#3663)
ryan-williams Feb 4, 2025
374356d
Merge branch 'main' into bkm/hypothesis
bkmartinjr Feb 4, 2025
2 changes: 2 additions & 0 deletions apis/python/requirements_dev.txt
@@ -5,3 +5,5 @@ ruff
sparse
typeguard==4.4.0
types-setuptools
more-itertools
hypothesis
41 changes: 41 additions & 0 deletions apis/python/tests/ht/README.md
@@ -0,0 +1,41 @@
# SOMA Hypothesis-based tests

This folder contains Hypothesis-based tests and supporting code. All of them run within the standard
pytest framework and execute as part of normal pytest runs (including in CI).

## Recreating test failures

Property-based tests generate unique test invocations each time they are run, so a test failure in CI
may not immediately reproduce when you re-run the test. The test log files should contain enough
information to reproduce the error, including the random number seed used for test selection and the
so-called "blob" that allows Hypothesis to recreate the particular failing case.

The [Hypothesis documentation](https://hypothesis.readthedocs.io/) has information on reproducing
test failures.
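
For example, when a test fails, Hypothesis prints a `@reproduce_failure` decorator in its output;
copying it onto the failing test replays the exact case. A sketch (the version, blob, and test name
below are placeholders, not real values):

```python
from hypothesis import reproduce_failure

# Placeholder version and blob: copy the real ones from the failure log.
@reproduce_failure("6.98.0", b"AXicY2BgYAQAAAsAAw==")
def test_fastercsx_roundtrip():
    ...
```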

## Exhaustive testing

The more test cases Hypothesis generates, the more likely it is to find a bug. This is at odds with
the need for our CI pipeline to complete in a "reasonable" amount of time.

The default configuration is suitable for use in CI, i.e., all tests will complete fairly quickly. Please do not
change this behavior.

In the course of development, it is often useful to search for test cases more exhaustively.
A Hypothesis profile named `expensive` has been defined for this purpose. You can run the tests in this
mode:

> pytest apis/python/tests/ --hypothesis-profile=expensive

In this mode, tests run significantly longer (very roughly, 100X longer than the default) and cover
many more test conditions. Because each test invocation starts with a unique random seed, you
can repeat this invocation until you are satisfied with your test coverage.
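
For reference, here is a minimal sketch of how such a profile is typically registered (e.g., in
`conftest.py`); the exact settings used by this repo may differ. The values below assume Hypothesis's
default budget of 100 examples per test:

```python
from hypothesis import HealthCheck, settings

# ~100X the default example budget, per the "very roughly 100X" note above.
settings.register_profile(
    "expensive",
    max_examples=10_000,
    suppress_health_check=[HealthCheck.too_slow],
)
```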

## Configuration

The `_ht_test_config.py` file is used to configure the tests. The most common use case is a config flag
that enables a work-around for a known defect while the issue is being resolved.
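
A minimal sketch of the shape such a config might take; the exact flags and values may differ, though
the `sc-61118`/`sc-61123` flags below do appear in the state machine tests in this PR:

```python
# Flags gate work-arounds for known defects until the underlying issue is fixed.
HT_TEST_CONFIG = {
    # sc-61118: metadata is lost on reopen from "w" to "r".
    "sc-61118_workaround": True,
    # sc-61123: reopen-related defect; the reopen rule is disabled while set.
    "sc-61123_workaround": True,
}
```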

## For More Information

See the [Hypothesis documentation](https://hypothesis.readthedocs.io/).
Empty file.
297 changes: 297 additions & 0 deletions apis/python/tests/ht/_array_state_machine.py
@@ -0,0 +1,297 @@
"""Hypothesis rule-based statemachine ABC for SOMAArray.

Intended to be specialized for SparseNDArray, et al.
"""

from __future__ import annotations

import math
import re
from abc import abstractmethod
from typing import Any, Literal, Protocol, Union

import numpy as np
import pyarrow as pa
from hypothesis import strategies as st
from hypothesis.stateful import RuleBasedStateMachine, invariant, precondition, rule
from typing_extensions import TypeAlias

import tiledbsoma as soma

from tests.ht._ht_test_config import HT_TEST_CONFIG
from tests.ht._ledger import Ledger, PyDictLedgerEntry

SOMAArray: TypeAlias = Union[soma.DataFrame, soma.SparseNDArray, soma.DenseNDArray]


class SOMAArrayStateMachine(RuleBasedStateMachine):
"""Abstract base class for a soma array Hypothesis state machine"""

def __init__(self) -> None:
super().__init__()
self.context = soma.SOMATileDBContext()
self.closed: bool = True
self.mode: Literal["r", "w"] | None = None
self.A: SOMAArray | None = None
self.uri = self.TestCase.tmp_path_factory.mktemp(
f"{self.__class__.__name__}-"
).as_posix()

def setup(self, A: SOMAArray) -> None:
assert isinstance(A, (soma.DataFrame, soma.SparseNDArray, soma.DenseNDArray))
assert A.mode == "w" and not A.closed
self.A = A
self.create_timestamp_ms = self.A.tiledb_timestamp_ms
self.closed = self.A.closed
self.mode = self.A.mode
self.metadata_ledger = Ledger[PyDictLedgerEntry](
initial_entry=PyDictLedgerEntry(
data=dict(self.A.metadata),
timestamp_ms=self.A.tiledb_timestamp_ms,
name="initial entry",
),
allows_duplicates=False,
)
self.pending_metadata: dict[str, Any] | None = None

def teardown(self) -> None:
if self.A is not None:
if not self.closed:
self.A.close()
self.A = None

super().teardown()

@property
def is_initialized(self) -> bool:
return self.A is not None

@staticmethod
@abstractmethod
def _array_exists(
uri: str, context: soma.SOMATileDBContext, tiledb_timestamp: int | None
) -> bool:
pass

@abstractmethod
def _array_open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None:
pass

def _open(self, *, mode: str, tiledb_timestamp: int | None = None) -> None:
assert self.A.closed
tiledb_timestamp = None # TODO/XXX: no time-travel for now. FIXME
self._array_open(mode=mode, tiledb_timestamp=tiledb_timestamp)
assert self.A is not None
self.closed = False
self.mode = mode

def _close(self) -> None:
assert not self.A.closed
if self.pending_metadata is not None:
self.metadata_ledger.write(
PyDictLedgerEntry(self.A.tiledb_timestamp_ms, "", self.pending_metadata)
)
self.pending_metadata = None

self.A.close()
self.closed = True
self.mode = None

@abstractmethod
def _reopen(self, mode: str) -> None:
pass

##
## ---- Open/close state
##

@precondition(lambda self: self.is_initialized)
@invariant()
def check_exists(self) -> None:
assert self._array_exists(self.uri, self.context, None)

@precondition(lambda self: self.is_initialized)
@invariant()
def check_mode(self) -> None:
assert self.closed or self.mode == self.A.mode

@precondition(lambda self: self.is_initialized)
@invariant()
def check_closed(self) -> None:
assert self.closed == self.A.closed

@precondition(lambda self: not self.closed)
@rule()
def close(self) -> None:
self._close()

@precondition(lambda self: self.closed)
@rule(mode=st.sampled_from(["r", "w"]))
def open(self, mode: str) -> None:
# TODO: time travel
self._open(mode=mode)

@precondition(
lambda self: not HT_TEST_CONFIG["sc-61123_workaround"]
) # TODO: this entire rule disabled until sc-61123 fixed.
@precondition(lambda self: not self.closed)
@precondition(
lambda self: not HT_TEST_CONFIG["sc-61118_workaround"] or self.mode != "w"
) # TODO - fails due to loss of metadata on reopen from w->r. See sc-61118. Remove when fixed.
@rule(mode=st.sampled_from(["r", "w"]))
def reopen(self, mode: str) -> None:
assert not self.A.closed
assert not self.closed
assert self.mode is not None
self.A = self.A.reopen(
mode,
tiledb_timestamp=None, # no time-travel for now
)
self.mode = mode
assert self.A.mode == mode and not self.A.closed

##
## --- metadata
##
METADATA_KEY_ALPHABET = st.characters(codec="utf-8", exclude_characters=["\x00"])
METADATA_KEYS = st.text(
min_size=0, max_size=4096, alphabet=METADATA_KEY_ALPHABET
).filter(lambda k: not k.startswith("soma_"))
METADATA_VALUE_ALPHABET = st.characters(codec="utf-8", exclude_characters=["\x00"])
METADATA_VALUES = st.one_of(
st.text(alphabet=METADATA_VALUE_ALPHABET, min_size=0)
| st.integers(
min_value=np.iinfo(np.int64).min, max_value=np.iinfo(np.int64).max
)
| st.floats()
)
IGNORE_KEYS = re.compile(r"^soma_.*$")

@classmethod
def filter_metadata(cls, d: dict[str, Any]) -> dict[str, Any]:
"""Apply the "ignore" regex to dict keys, returning the filtered dict."""
return {k: v for k, v in d.items() if not cls.IGNORE_KEYS.match(k)}

@precondition(lambda self: not self.closed)
@invariant()
def check_metadata(self) -> None:
array_metadata = self.filter_metadata(dict(self.A.metadata))
expected_metadata = self.filter_metadata(
self.metadata_ledger.read(timestamp_ms=self.A.tiledb_timestamp_ms).to_dict()
if self.pending_metadata is None
else self.pending_metadata
)
assert set(array_metadata.keys()) == set(expected_metadata.keys())
for k in array_metadata.keys():
if isinstance(array_metadata[k], float) and math.isnan(array_metadata[k]):
assert math.isnan(expected_metadata[k])
continue
assert array_metadata[k] == expected_metadata[k]

@precondition(
lambda self: not self.closed and self.mode == "w" and len(self.A.metadata) < 100
)
@rule(k=METADATA_KEYS, v=METADATA_VALUES)
def set_metadata(self, k: str, v: str | int | float) -> None:
self.A.metadata[k] = v
if self.pending_metadata is None:
self.pending_metadata = self.metadata_ledger.read(
self.A.tiledb_timestamp_ms
).to_dict()
self.pending_metadata[k] = v

@precondition(
lambda self: not self.closed
and self.mode == "w"
and len(self.filter_metadata(self.A.metadata))
)
@precondition(lambda self: not self.closed)
@rule(data=st.data())
def del_metadata(self, data: st.DataObject) -> None:
if self.pending_metadata is None:
self.pending_metadata = self.metadata_ledger.read(
self.A.tiledb_timestamp_ms
).to_dict()

k = data.draw(
st.sampled_from(
sorted(list(self.filter_metadata(self.pending_metadata).keys()))
)
)
del self.A.metadata[k]
del self.pending_metadata[k]


class ShapesFactory(Protocol):
"""Factory for a strategy returning ndarray shape."""

def __call__(
self,
*,
min_shape: tuple[int, ...] | None = None,
max_shape: tuple[int, ...] | None = None,
) -> st.SearchStrategy[tuple[int | None, ...]]: ...


class SOMANDArrayStateMachine(SOMAArrayStateMachine):
"""Abstract base class for NDArray Hypothesis state machine."""

def __init__(self, shapes_factory: ShapesFactory) -> None:
super().__init__()
self.shapes_factory = shapes_factory

def setup(self, type, shape, array) -> None:
super().setup(array)
self.type = type
self.schema = pa.schema(
[
pa.field(f"soma_dim_{n}", pa.int64(), nullable=False)
for n in range(len(shape))
]
+ [pa.field("soma_data", self.type, nullable=False)]
)
assert all((shape[i] or 1) == self.A.shape[i] for i in range(len(shape)))
assert self.schema == self.A.schema
self.shape = tuple(
(shape[i] or 1) for i in range(len(shape))
) # XXX TODO: shape should be a ledger

##
## --- schema
##

@precondition(lambda self: not self.closed)
@invariant()
def check_schema(self) -> None:
schema = self.A.schema
assert len(schema.types) == len(self.shape) + 1
assert schema.field("soma_data").type == self.type
for idx in range(len(self.shape)):
assert schema.names[idx] == f"soma_dim_{idx}"
assert schema.types[idx] == pa.int64()
assert schema.field(f"soma_dim_{idx}").type == pa.int64()
assert self.A.schema == self.schema

##
## --- shape
##

@precondition(lambda self: not self.closed)
@invariant()
def check_shape(self) -> None:
assert hasattr(self.A, "shape") # sc-61123
assert self.A.shape == tuple(
(s or 1) for s in self.shape
), f"Unexpected shape in {self.A}: had {self.A.shape}, expected {self.shape}"
assert self.A.ndim == len(self.shape)

@precondition(lambda self: self.closed or self.mode == "w")
@rule(data=st.data())
def expand_shape(self, data: st.DataObject) -> None:
if self.closed:
self._open(mode="w")
assert self.mode == "w"
new_shape = data.draw(
self.shapes_factory(min_shape=self.shape, max_shape=self.A.maxshape)
)
self.A.resize(new_shape)
self.shape = new_shape
self._close() # resize is committed upon close
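
# Usage sketch (editorial; not part of the module above): a concrete subclass
# implements the abstract hooks, and pytest collects the auto-generated
# TestCase attribute provided by RuleBasedStateMachine. Names below are
# illustrative, not from this PR.
#
# class SparseNDArrayStateMachineSketch(SOMANDArrayStateMachine):
#     ...  # implement _array_exists, _array_open, _reopen, plus an initialize rule
#
# TestSparseNDArraySketch = SparseNDArrayStateMachineSketch.TestCase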
47 changes: 47 additions & 0 deletions apis/python/tests/ht/_arrow_util.py
@@ -0,0 +1,47 @@
""" Various utilities for dealing with Arrow data."""

from __future__ import annotations

import pyarrow as pa


def combine_chunks(a: pa.ChunkedArray) -> pa.Array:
"""Semantically identical to pa.ChunkedArray.combine_chunks, but handles the
`large_` types which are unimplemented by pyarrow.
"""
type = a.type

if pa.types.is_large_string(type):
return a.cast(pa.string()).combine_chunks().cast(type)

if pa.types.is_large_binary(type):
return a.cast(pa.binary()).combine_chunks().cast(type)

if pa.types.is_dictionary(type):
if pa.types.is_large_string(type.value_type):
return (
a.cast(
pa.dictionary(
index_type=type.index_type,
value_type=pa.string(),
ordered=type.ordered,
)
)
.combine_chunks()
.cast(type)
)

if pa.types.is_large_binary(type.value_type):
return (
a.cast(
pa.dictionary(
index_type=type.index_type,
value_type=pa.binary(),
ordered=type.ordered,
)
)
.combine_chunks()
.cast(type)
)

return a.combine_chunks()
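
# Usage sketch (editorial; hypothetical data): exercises the large_string
# branch that motivates this helper, per the docstring above.
import pyarrow as pa

chunked = pa.chunked_array(
[
pa.array(["a", "b"], type=pa.large_string()),
pa.array(["c"], type=pa.large_string()),
]
)
combined = combine_chunks(chunked)
assert combined.type == pa.large_string()
assert combined.to_pylist() == ["a", "b", "c"]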