From f5301e615d259e400b1d511b968c86880c038fae Mon Sep 17 00:00:00 2001
From: Joseph Abbott <joseph.william.abbott@gmail.com>
Date: Tue, 26 Mar 2024 19:06:28 +0000
Subject: [PATCH 1/2] Port sample/feature selection from equisolve

---
 .../metatensor/learn/selection/_selection.py  | 248 ++++++++++++++++++
 .../selection/feature_selection/__init__.py   |   4 +
 .../feature_selection/feature_selection.py    |  93 +++++++
 .../selection/sample_selection/__init__.py    |   4 +
 .../sample_selection/sample_selection.py      |  94 +++++++
 .../tests/feature_selection.py                | 129 +++++++++
 .../tests/sample_selection.py                 | 129 +++++++++
 .../metatensor-learn/tests/selection_utils.py | 172 ++++++++++++
 tox.ini                                       |   3 +
 9 files changed, 876 insertions(+)
 create mode 100644 python/metatensor-learn/metatensor/learn/selection/_selection.py
 create mode 100644 python/metatensor-learn/metatensor/learn/selection/feature_selection/__init__.py
 create mode 100644 python/metatensor-learn/metatensor/learn/selection/feature_selection/feature_selection.py
 create mode 100644 python/metatensor-learn/metatensor/learn/selection/sample_selection/__init__.py
 create mode 100644 python/metatensor-learn/metatensor/learn/selection/sample_selection/sample_selection.py
 create mode 100644 python/metatensor-learn/tests/feature_selection.py
 create mode 100644 python/metatensor-learn/tests/sample_selection.py
 create mode 100644 python/metatensor-learn/tests/selection_utils.py

diff --git a/python/metatensor-learn/metatensor/learn/selection/_selection.py b/python/metatensor-learn/metatensor/learn/selection/_selection.py
new file mode 100644
index 000000000..670dd8964
--- /dev/null
+++ b/python/metatensor-learn/metatensor/learn/selection/_selection.py
@@ -0,0 +1,248 @@
+from typing import Type, Union
+
+import numpy as np
+import skmatter._selection
+
+import metatensor
+
+from .._backend import Labels, TensorBlock, TensorMap
+
+
+class GreedySelector:
+    """
+    Wraps :py:class:`skmatter._selection.GreedySelector` for a TensorMap.
+
+    The class creates a selector for each block. The selection is based on the
+    values of each :py:class:`TensorBlock`. Gradients will not be considered for the
+    selection.
+    """
+
+    def __init__(
+        self,
+        selector_class: Type[skmatter._selection.GreedySelector],
+        selection_type: str,
+        n_to_select: Union[int, dict],
+        **selector_arguments,
+    ) -> None:
+        self._selector_class = selector_class
+        self._selection_type = selection_type
+        self._n_to_select = n_to_select
+        self._selector_arguments = selector_arguments
+
+        self._selector_arguments["selection_type"] = self._selection_type
+        self._support = None
+        self._select_distance = None
+
+    @property
+    def selector_class(self) -> Type[skmatter._selection.GreedySelector]:
+        """
+        The class to perform the selection. Usually one of 'FPS' or 'CUR'.
+        """
+        return self._selector_class
+
+    @property
+    def selection_type(self) -> str:
+        """
+        Whether to choose a subset of columns ('feature') or rows ('sample').
+        """
+        return self._selection_type
+
+    @property
+    def selector_arguments(self) -> dict:
+        """
+        Arguments passed to the ``selector_class``.
+        """
+        return self._selector_arguments
+
+    @property
+    def support(self) -> TensorMap:
+        """
+        TensorMap containing the support.
+        """
+        if self._support is None:
+            raise ValueError("No selections. Call fit method first.")
+
+        return self._support
+
+    @property
+    def get_select_distance(self) -> TensorMap:
+        """
+        Returns a TensorMap containing the Hausdorff distances.
+
+        For each block, the metadata of the relevant axis (i.e. samples or properties,
+        depending on whether sample or feature selection is being performed) is sorted
+        and returned according to the Hausdorff distance, in descending order.
+        """
+        if self._selector_class == skmatter._selection._CUR:
+            raise ValueError("Hausdorff distances not available for CUR in skmatter.")
+        if self._select_distance is None:
+            raise ValueError("No Hausdorff distances. Call fit method first.")
+
+        return self._select_distance
+
+    def fit(self, X: TensorMap, warm_start: bool = False) -> "GreedySelector":
+        """
+        Select samples or features independently for each block; returns ``self``.
+
+        :param X: the input training vectors to fit.
+        :param warm_start: bool, whether the fit should continue after having already
+            run, after increasing `n_to_select`. Assumes it is called with the same X.
+        """
+        # Check that we have only 0 or 1 component axes
+        if len(X.component_names) == 0:
+            has_components = False
+        elif len(X.component_names) == 1:
+            has_components = True
+        else:
+            assert len(X.component_names) > 1
+            raise ValueError("Can only handle TensorMaps with a single component axis.")
+
+        support_blocks = []
+        if self._selector_class == skmatter._selection._FPS:
+            hausdorff_blocks = []
+        for key, block in X.items():
+            # Parse the n_to_select argument
+            max_n = (
+                len(block.properties)
+                if self._selection_type == "feature"
+                else len(block.samples)
+            )
+            if isinstance(self._n_to_select, int):
+                if (
+                    self._n_to_select == -1
+                ):  # set to the number of samples/features for this block
+                    tmp_n_to_select = max_n
+                else:
+                    tmp_n_to_select = self._n_to_select
+
+            elif isinstance(self._n_to_select, dict):
+                tmp_n_to_select = self._n_to_select[tuple(key.values)]
+            else:
+                raise ValueError("n_to_select must be an int or a dict.")
+
+            if not (0 < tmp_n_to_select <= max_n):
+                raise ValueError(
+                    f"n_to_select ({tmp_n_to_select}) must > 0 and <= the number of "
+                    f"{self._selection_type} for the given block ({max_n})."
+                )
+
+            selector = self.selector_class(
+                n_to_select=tmp_n_to_select, **self.selector_arguments
+            )
+
+            # If the block has components, reshape to a 2D array such that the
+            # components expand along the dimension *not* being selected.
+            block_vals = block.values
+            if has_components:
+                n_components = len(block.components[0])
+                if self._selection_type == "feature":
+                    # Move components into samples
+                    block_vals = block_vals.reshape(
+                        (block_vals.shape[0] * n_components, block_vals.shape[2])
+                    )
+                else:
+                    assert self._selection_type == "sample"
+                    # Move components into features
+                    block_vals = block.values.reshape(
+                        (block_vals.shape[0], block_vals.shape[2] * n_components)
+                    )
+
+            # Fit on the block values
+            selector.fit(block_vals, warm_start=warm_start)
+
+            # Build the support TensorMap. In this case we want the mask to be a
+            # list of bools, such that the original order of the metadata is
+            # preserved.
+            supp_mask = selector.get_support()
+            if self._selection_type == "feature":
+                supp_samples = Labels.single()
+                supp_properties = Labels(
+                    names=block.properties.names,
+                    values=block.properties.values[supp_mask],
+                )
+            elif self._selection_type == "sample":
+                supp_samples = Labels(
+                    names=block.samples.names, values=block.samples.values[supp_mask]
+                )
+                supp_properties = Labels.single()
+
+            supp_vals = np.zeros(
+                [len(supp_samples), len(supp_properties)], dtype=np.int32
+            )
+            support_blocks.append(
+                TensorBlock(
+                    values=supp_vals,
+                    samples=supp_samples,
+                    components=[],
+                    properties=supp_properties,
+                )
+            )
+
+            if self._selector_class == skmatter._selection._FPS:
+                # Build the Hausdorff TensorMap, only for FPS. In this case we want the
+                # mask to be a list of int such that the samples/properties are
+                # reordered according to the Hausdorff distance.
+                haus_mask = selector.get_support(indices=True, ordered=True)
+                if self._selection_type == "feature":
+                    haus_samples = Labels.single()
+                    haus_properties = Labels(
+                        names=block.properties.names,
+                        values=block.properties.values[haus_mask],
+                    )
+                elif self._selection_type == "sample":
+                    haus_samples = Labels(
+                        names=block.samples.names,
+                        values=block.samples.values[haus_mask],
+                    )
+                    haus_properties = Labels.single()
+
+                haus_vals = selector.hausdorff_at_select_[haus_mask].reshape(
+                    len(haus_samples), len(haus_properties)
+                )
+                hausdorff_blocks.append(
+                    TensorBlock(
+                        values=haus_vals,
+                        samples=haus_samples,
+                        components=[],
+                        properties=haus_properties,
+                    )
+                )
+
+        self._support = TensorMap(X.keys, support_blocks)
+        if self._selector_class == skmatter._selection._FPS:
+            self._select_distance = TensorMap(X.keys, hausdorff_blocks)
+
+        return self
+
+    def transform(self, X: TensorMap) -> TensorMap:
+        """
+        Reduce X to the selected samples or features.
+
+        :param X: the input tensor.
+        :returns: the selected subset of the input.
+        """
+        blocks = []
+        for key, block in X.items():
+            block_support = self.support.block(key)
+
+            if self._selection_type == "feature":
+                new_block = metatensor.slice_block(
+                    block, "properties", block_support.properties
+                )
+            elif self._selection_type == "sample":
+                new_block = metatensor.slice_block(
+                    block, "samples", block_support.samples
+                )
+            blocks.append(new_block)
+
+        return TensorMap(X.keys, blocks)
+
+    def fit_transform(self, X: TensorMap, warm_start: bool = False) -> TensorMap:
+        """
+        Fit to data, then transform it.
+
+        :param X: TensorMap of the training vectors.
+        :param warm_start: bool, whether the fit should continue after having already
+            run, after increasing `n_to_select`. Assumes it is called with the same X.
+        """
+        return self.fit(X, warm_start=warm_start).transform(X)
diff --git a/python/metatensor-learn/metatensor/learn/selection/feature_selection/__init__.py b/python/metatensor-learn/metatensor/learn/selection/feature_selection/__init__.py
new file mode 100644
index 000000000..5d981efb2
--- /dev/null
+++ b/python/metatensor-learn/metatensor/learn/selection/feature_selection/__init__.py
@@ -0,0 +1,4 @@
+from .feature_selection import CUR, FPS  # noqa
+
+
+___all__ = ["CUR", "FPS"]
diff --git a/python/metatensor-learn/metatensor/learn/selection/feature_selection/feature_selection.py b/python/metatensor-learn/metatensor/learn/selection/feature_selection/feature_selection.py
new file mode 100644
index 000000000..a82514fb9
--- /dev/null
+++ b/python/metatensor-learn/metatensor/learn/selection/feature_selection/feature_selection.py
@@ -0,0 +1,93 @@
+"""
+Wrappers for the feature selectors of `scikit-matter`_.
+
+.. _`scikit-matter`: https://scikit-matter.readthedocs.io/en/latest/selection.html
+"""
+
+from skmatter._selection import _CUR, _FPS
+
+from .._selection import GreedySelector
+
+
+class FPS(GreedySelector):
+    """
+    Transformer that performs Greedy Feature Selection using Farthest Point Sampling.
+
+    If `n_to_select` is an `int`, all blocks will have this many features selected. In
+    this case, `n_to_select` must be <= the smallest number of features in any block.
+
+    If `n_to_select` is a dict, it must have keys that are tuples corresponding to the
+    key values of each block. In this case, the values of the `n_to_select` dict can be
+    int that specify different number of features to select for each block.
+
+    If `n_to_select` is -1, all features for every block will be selected. This is
+    useful, for instance, for plotting Hausdorff distances, which can be accessed
+    through the `selector.get_select_distance` property after calling the fit() method.
+
+    Refer to :py:class:`skmatter.feature_selection.FPS` for full documentation.
+    """
+
+    def __init__(
+        self,
+        initialize=0,
+        n_to_select=None,
+        score_threshold=None,
+        score_threshold_type="absolute",
+        progress_bar=False,
+        full=False,
+        random_state=0,
+    ):
+        super().__init__(
+            selector_class=_FPS,
+            selection_type="feature",
+            initialize=initialize,
+            n_to_select=n_to_select,
+            score_threshold=score_threshold,
+            score_threshold_type=score_threshold_type,
+            progress_bar=progress_bar,
+            full=full,
+            random_state=random_state,
+        )
+
+
+class CUR(GreedySelector):
+    """
+    Transformer that performs Greedy Feature Selection with CUR.
+
+    If `n_to_select` is an `int`, all blocks will have this many features selected. In
+    this case, `n_to_select` must be <= the smallest number of features in any block.
+
+    If `n_to_select` is a dict, it must have keys that are tuples corresponding to the
+    key values of each block. In this case, the values of the `n_to_select` dict can be
+    int that specify different number of features to select for each block.
+
+    If `n_to_select` is -1, all features for every block will be selected.
+
+    Refer to :py:class:`skmatter.feature_selection.CUR` for full documentation.
+    """
+
+    def __init__(
+        self,
+        recompute_every=1,
+        k=1,
+        tolerance=1e-12,
+        n_to_select=None,
+        score_threshold=None,
+        score_threshold_type="absolute",
+        progress_bar=False,
+        full=False,
+        random_state=0,
+    ):
+        super().__init__(
+            selector_class=_CUR,
+            selection_type="feature",
+            recompute_every=recompute_every,
+            k=k,
+            tolerance=tolerance,
+            n_to_select=n_to_select,
+            score_threshold=score_threshold,
+            score_threshold_type=score_threshold_type,
+            progress_bar=progress_bar,
+            full=full,
+            random_state=random_state,
+        )
diff --git a/python/metatensor-learn/metatensor/learn/selection/sample_selection/__init__.py b/python/metatensor-learn/metatensor/learn/selection/sample_selection/__init__.py
new file mode 100644
index 000000000..8d65c776c
--- /dev/null
+++ b/python/metatensor-learn/metatensor/learn/selection/sample_selection/__init__.py
@@ -0,0 +1,4 @@
+from .sample_selection import CUR, FPS  # noqa
+
+
+___all__ = ["CUR", "FPS"]
diff --git a/python/metatensor-learn/metatensor/learn/selection/sample_selection/sample_selection.py b/python/metatensor-learn/metatensor/learn/selection/sample_selection/sample_selection.py
new file mode 100644
index 000000000..8276ee7db
--- /dev/null
+++ b/python/metatensor-learn/metatensor/learn/selection/sample_selection/sample_selection.py
@@ -0,0 +1,94 @@
+"""
+Wrappers for the sample selectors of `scikit-matter`_.
+
+.. _`scikit-matter`: https://scikit-matter.readthedocs.io/en/latest/selection.html
+"""
+
+from skmatter._selection import _CUR, _FPS
+
+from .._selection import GreedySelector
+
+
+class FPS(GreedySelector):
+    """
+    Transformer that performs Greedy Sample Selection using Farthest Point Sampling.
+
+    If `n_to_select` is an `int`, all blocks will have this many samples selected. In
+    this case, `n_to_select` must be <= the smallest number of samples in any block.
+
+    If `n_to_select` is a dict, it must have keys that are tuples corresponding to the
+    key values of each block. In this case, the values of the `n_to_select` dict can be
+    int that specify different number of samples to select for each block.
+
+    If `n_to_select` is -1, all samples for every block will be selected. This is
+    useful, for instance, for plotting Hausdorff distances, which can be accessed
+    through the `selector.get_select_distance` property after calling the fit()
+    method.
+
+    Refer to :py:class:`skmatter.sample_selection.FPS` for full documentation.
+    """
+
+    def __init__(
+        self,
+        initialize=0,
+        n_to_select=None,
+        score_threshold=None,
+        score_threshold_type="absolute",
+        progress_bar=False,
+        full=False,
+        random_state=0,
+    ):
+        super().__init__(
+            selector_class=_FPS,
+            selection_type="sample",
+            initialize=initialize,
+            n_to_select=n_to_select,
+            score_threshold=score_threshold,
+            score_threshold_type=score_threshold_type,
+            progress_bar=progress_bar,
+            full=full,
+            random_state=random_state,
+        )
+
+
+class CUR(GreedySelector):
+    """
+    Transformer that performs Greedy Sample Selection using CUR.
+
+    If `n_to_select` is an `int`, all blocks will have this many samples selected. In
+    this case, `n_to_select` must be <= the smallest number of samples in any block.
+
+    If `n_to_select` is a dict, it must have keys that are tuples corresponding to the
+    key values of each block. In this case, the values of the `n_to_select` dict can be
+    int that specify different number of samples to select for each block.
+
+    If `n_to_select` is -1, all samples for every block will be selected.
+
+    Refer to :py:class:`skmatter.sample_selection.CUR` for full documentation.
+    """
+
+    def __init__(
+        self,
+        recompute_every=1,
+        k=1,
+        tolerance=1e-12,
+        n_to_select=None,
+        score_threshold=None,
+        score_threshold_type="absolute",
+        progress_bar=False,
+        full=False,
+        random_state=0,
+    ):
+        super().__init__(
+            selector_class=_CUR,
+            selection_type="sample",
+            recompute_every=recompute_every,
+            k=k,
+            tolerance=tolerance,
+            n_to_select=n_to_select,
+            score_threshold=score_threshold,
+            score_threshold_type=score_threshold_type,
+            progress_bar=progress_bar,
+            full=full,
+            random_state=random_state,
+        )
diff --git a/python/metatensor-learn/tests/feature_selection.py b/python/metatensor-learn/tests/feature_selection.py
new file mode 100644
index 000000000..cf8913cb5
--- /dev/null
+++ b/python/metatensor-learn/tests/feature_selection.py
@@ -0,0 +1,129 @@
+"""
+Module to test FPS and CUR selectors in
+metatensor.learn.selection.feature_selection
+"""
+
+import numpy as np
+import pytest
+import skmatter.feature_selection
+from numpy.testing import assert_equal, assert_raises
+
+import metatensor
+from metatensor import Labels
+from metatensor.learn.selection.feature_selection import CUR, FPS
+
+from .selection_utils import (
+    random_single_block_no_components_tensor_map,
+    random_tensor_map_with_components,
+)
+
+
+@pytest.fixture
+def X1():
+    return random_single_block_no_components_tensor_map(
+        use_torch=False, use_metatensor_torch=False
+    )
+
+
+@pytest.fixture
+def X2():
+    return random_tensor_map_with_components(
+        use_torch=False, use_metatensor_torch=False
+    )
+
+
+@pytest.mark.parametrize(
+    "selector_class, skmatter_selector_class",
+    [(FPS, skmatter.feature_selection.FPS), (CUR, skmatter.feature_selection.CUR)],
+)
+def test_fit(X1, selector_class, skmatter_selector_class):
+    selector = selector_class(n_to_select=2)
+    selector.fit(X1)
+    support = selector.support[0].properties
+
+    skmatter_selector = skmatter_selector_class(n_to_select=2)
+    skmatter_selector.fit(X1[0].values)
+    skmatter_support = skmatter_selector.get_support(indices=True)
+    skmatter_support_labels = Labels(
+        names=["properties"],
+        values=np.array(
+            [[support_i] for support_i in skmatter_support], dtype=np.int32
+        ),
+    )
+
+    assert support == skmatter_support_labels
+
+
+@pytest.mark.parametrize(
+    "selector_class, skmatter_selector_class",
+    [(FPS, skmatter.feature_selection.FPS), (CUR, skmatter.feature_selection.CUR)],
+)
+def test_transform(X1, selector_class, skmatter_selector_class):
+    selector = selector_class(n_to_select=2)
+    selector.fit(X1)
+    X_trans = selector.transform(X1)
+
+    skmatter_selector = skmatter_selector_class(n_to_select=2)
+    skmatter_selector.fit(X1[0].values)
+    X_trans_skmatter = skmatter_selector.transform(X1[0].values)
+
+    assert_equal(X_trans[0].values, X_trans_skmatter)
+
+
+@pytest.mark.parametrize("selector_class", [FPS, CUR])
+def test_fit_transform(X1, selector_class):
+    selector = selector_class(n_to_select=2)
+
+    X_ft = selector.fit(X1).transform(X1)
+    metatensor.equal_raise(selector.fit_transform(X1), X_ft)
+
+
+@pytest.mark.parametrize("selector_class", [FPS])
+def test_get_select_distance(X2, selector_class):
+    selector = selector_class(n_to_select=3)
+    selector.fit(X2)
+    select_distance = selector.get_select_distance
+
+    assert select_distance is not None
+
+    # Check distances sorted in descending order, with an inf as the first
+    # entry
+    for block in select_distance:
+        assert block.values[0][0] == np.inf
+        for i, val in enumerate(block.values[0][1:], start=1):
+            assert val < block.values[0][i - 1]
+
+
+@pytest.mark.parametrize("selector_class", [FPS])
+def test_get_select_distance_n_to_select(X2, selector_class):
+    # Case 1: select all features for every block (n_to_select = -1)
+    selector = selector_class(n_to_select=-1)
+    selector.fit(X2)
+    select_distance = selector.get_select_distance
+    for block in select_distance:
+        assert len(block.properties) == 5
+
+    # Case 2: select subset of features but same for each block
+    n = 2
+    selector = selector_class(n_to_select=n)
+    selector.fit(X2)
+    select_distance = selector.get_select_distance
+    for block in select_distance:
+        assert len(block.properties) == n
+
+    # Case 3: select subset of features but different for each block
+    keys = X2.keys
+    n = {tuple(key): 2 * i + 1 for i, key in enumerate(keys)}
+    selector = selector_class(n_to_select=n)
+    selector.fit(X2)
+    select_distance = selector.get_select_distance
+    for i, key in enumerate(keys):
+        assert len(select_distance[key].properties) == 2 * i + 1
+
+
+@pytest.mark.parametrize("selector_class", [CUR])
+def test_get_select_distance_raises(X2, selector_class):
+    selector = selector_class(n_to_select=3)
+    selector.fit(X2)
+    with assert_raises(ValueError):
+        selector.get_select_distance
diff --git a/python/metatensor-learn/tests/sample_selection.py b/python/metatensor-learn/tests/sample_selection.py
new file mode 100644
index 000000000..253439797
--- /dev/null
+++ b/python/metatensor-learn/tests/sample_selection.py
@@ -0,0 +1,129 @@
+"""
+Module to test FPS and CUR selectors in
+metatensor.learn.selection.sample_selection
+"""
+
+import numpy as np
+import pytest
+import skmatter.sample_selection
+from numpy.testing import assert_equal, assert_raises
+
+import metatensor
+from metatensor import Labels
+from metatensor.learn.selection.sample_selection import CUR, FPS
+
+from .selection_utils import (
+    random_single_block_no_components_tensor_map,
+    random_tensor_map_with_components,
+)
+
+
+@pytest.fixture
+def X1():
+    return random_single_block_no_components_tensor_map(
+        use_torch=False, use_metatensor_torch=False
+    )
+
+
+@pytest.fixture
+def X2():
+    return random_tensor_map_with_components(
+        use_torch=False, use_metatensor_torch=False
+    )
+
+
+@pytest.mark.parametrize(
+    "selector_class, skmatter_selector_class",
+    [(FPS, skmatter.sample_selection.FPS), (CUR, skmatter.sample_selection.CUR)],
+)
+def test_fit(X1, selector_class, skmatter_selector_class):
+    selector = selector_class(n_to_select=2)
+    selector.fit(X1)
+    support = selector.support[0].samples
+
+    skmatter_selector = skmatter_selector_class(n_to_select=2)
+    skmatter_selector.fit(X1[0].values)
+    skmatter_support = skmatter_selector.get_support(indices=True)
+    skmatter_support_labels = Labels(
+        names=["sample", "structure"],
+        values=np.array(
+            [[support_i, support_i] for support_i in skmatter_support],
+            dtype=np.int32,
+        ),
+    )
+
+    assert support == skmatter_support_labels
+
+
+@pytest.mark.parametrize(
+    "selector_class, skmatter_selector_class",
+    [(FPS, skmatter.sample_selection.FPS), (CUR, skmatter.sample_selection.CUR)],
+)
+def test_transform(X1, selector_class, skmatter_selector_class):
+    selector = selector_class(n_to_select=2, random_state=0)
+    selector.fit(X1)
+    X_trans = selector.transform(X1)
+
+    skmatter_selector = skmatter_selector_class(n_to_select=2, random_state=0)
+    skmatter_selector.fit(X1[0].values)
+    X_trans_skmatter = X1[0].values[skmatter_selector.get_support()]
+    assert_equal(X_trans[0].values, X_trans_skmatter)
+
+
+@pytest.mark.parametrize("selector_class", [FPS, CUR])
+def test_fit_transform(X1, selector_class):
+    selector = selector_class(n_to_select=2)
+
+    X_ft = selector.fit(X1).transform(X1)
+    metatensor.equal_raise(selector.fit_transform(X1), X_ft)
+
+
+@pytest.mark.parametrize("selector_class", [FPS])
+def test_get_select_distance(X2, selector_class):
+    selector = selector_class(n_to_select=3)
+    selector.fit(X2)
+    select_distance = selector.get_select_distance
+
+    assert select_distance is not None
+
+    # Check distances sorted in descending order, with an inf as the first
+    # entry
+    for block in select_distance:
+        assert block.values[0][0] == np.inf
+        for i, val in enumerate(block.values[0][1:], start=1):
+            assert val < block.values[0][i - 1]
+
+
+@pytest.mark.parametrize("selector_class", [FPS])
+def test_get_select_distance_n_to_select(X2, selector_class):
+    # Case 1: select all samples for every block (n_to_select = -1)
+    selector = selector_class(n_to_select=-1)
+    selector.fit(X2)
+    select_distance = selector.get_select_distance
+    for block in select_distance:
+        assert len(block.samples) == 4
+
+    # Case 2: select subset of samples but same for each block
+    n = 2
+    selector = selector_class(n_to_select=n)
+    selector.fit(X2)
+    select_distance = selector.get_select_distance
+    for block in select_distance:
+        assert len(block.samples) == n
+
+    # Case 3: select subset of samples but different for each block
+    keys = X2.keys
+    n = {tuple(key): i for i, key in enumerate(keys, start=1)}
+    selector = selector_class(n_to_select=n)
+    selector.fit(X2)
+    select_distance = selector.get_select_distance
+    for i, key in enumerate(keys, start=1):
+        assert len(select_distance[key].samples) == i
+
+
+@pytest.mark.parametrize("selector_class", [CUR])
+def test_get_select_distance_raises(X2, selector_class):
+    selector = selector_class(n_to_select=3)
+    selector.fit(X2)
+    with assert_raises(ValueError):
+        selector.get_select_distance
diff --git a/python/metatensor-learn/tests/selection_utils.py b/python/metatensor-learn/tests/selection_utils.py
new file mode 100644
index 000000000..47860d48d
--- /dev/null
+++ b/python/metatensor-learn/tests/selection_utils.py
@@ -0,0 +1,172 @@
+import functools
+
+
+def random_single_block_no_components_tensor_map(use_torch, use_metatensor_torch):
+    """
+    Create a dummy tensor map to be used in tests. This is the same one as the
+    tensor map used in `tensor.rs` tests.
+    """
+    if not use_torch and use_metatensor_torch:
+        raise ValueError(
+            "torch.TensorMap cannot be created without torch.Tensor block values."
+        )
+    if use_metatensor_torch:
+        import torch
+
+        from metatensor.torch import Labels, TensorBlock, TensorMap
+
+        create_int32_array = functools.partial(torch.tensor, dtype=torch.int32)
+    else:
+        import numpy as np
+
+        from metatensor import Labels, TensorBlock, TensorMap
+
+        create_int32_array = functools.partial(np.array, dtype=np.int32)
+
+    if use_torch:
+        import torch
+
+        create_random_array = torch.rand
+    else:
+        import numpy as np
+
+        create_random_array = np.random.rand
+
+    block_1 = TensorBlock(
+        values=create_random_array(4, 2),
+        samples=Labels(
+            ["sample", "structure"],
+            create_int32_array([[0, 0], [1, 1], [2, 2], [3, 3]]),
+        ),
+        components=[],
+        properties=Labels(["properties"], create_int32_array([[0], [1]])),
+    )
+    positions_gradient = TensorBlock(
+        values=create_random_array(7, 3, 2),
+        samples=Labels(
+            ["sample", "structure", "center"],
+            create_int32_array(
+                [
+                    [0, 0, 1],
+                    [0, 0, 2],
+                    [1, 1, 0],
+                    [1, 1, 1],
+                    [1, 1, 2],
+                    [2, 2, 0],
+                    [3, 3, 0],
+                ],
+            ),
+        ),
+        components=[Labels(["direction"], create_int32_array([[0], [1], [2]]))],
+        properties=block_1.properties,
+    )
+    block_1.add_gradient("positions", positions_gradient)
+
+    cell_gradient = TensorBlock(
+        values=create_random_array(4, 6, 2),
+        samples=Labels(
+            ["sample", "structure"],
+            create_int32_array([[0, 0], [1, 1], [2, 2], [3, 3]]),
+        ),
+        components=[
+            Labels(
+                ["direction_xx_yy_zz_yz_xz_xy"],
+                create_int32_array([[0], [1], [2], [3], [4], [5]]),
+            )
+        ],
+        properties=block_1.properties,
+    )
+    block_1.add_gradient("cell", cell_gradient)
+
+    return TensorMap(Labels.single(), [block_1])
+
+
+def random_tensor_map_with_components(use_torch, use_metatensor_torch):
+    """
+    Create a dummy tensor map (3 blocks, one component axis each) for tests. All
+    labels are built with the backend selected by ``use_metatensor_torch``.
+    """
+    if not use_torch and use_metatensor_torch:
+        raise ValueError(
+            "torch.TensorMap cannot be created without torch.Tensor block values."
+        )
+    if use_metatensor_torch:
+        import torch
+
+        from metatensor.torch import Labels, TensorBlock, TensorMap
+
+        create_int32_array = functools.partial(torch.tensor, dtype=torch.int32)
+    else:
+        import numpy as np
+
+        from metatensor import Labels, TensorBlock, TensorMap
+
+        create_int32_array = functools.partial(np.array, dtype=np.int32)
+
+    if use_torch:
+        import torch
+
+        create_random_array = torch.rand
+    else:
+        import numpy as np
+
+        create_random_array = np.random.rand
+
+    blocks = []
+    for i in range(3):
+        component_values = create_int32_array([[c] for c in range(2 * i + 1)])
+        component = Labels(["component"], component_values)
+        block = TensorBlock(
+            values=create_random_array(4, 2 * i + 1, 5),
+            samples=Labels(
+                ["sample", "structure"],
+                create_int32_array([[0, 0], [1, 1], [2, 2], [3, 3]]),
+            ),
+            components=[component],
+            properties=Labels(
+                ["properties"], create_int32_array([[0], [1], [2], [5], [10]])
+            ),
+        )
+        positions_gradient = TensorBlock(
+            values=create_random_array(7, 3, 2 * i + 1, 5),
+            samples=Labels(
+                ["sample", "structure", "center"],
+                create_int32_array(
+                    [
+                        [0, 0, 1],
+                        [0, 0, 2],
+                        [1, 1, 0],
+                        [1, 1, 1],
+                        [1, 1, 2],
+                        [2, 2, 0],
+                        [3, 3, 0],
+                    ],
+                ),
+            ),
+            components=[
+                Labels(["direction"], create_int32_array([[0], [1], [2]])),
+                component,
+            ],
+            properties=block.properties,
+        )
+        block.add_gradient("positions", positions_gradient)
+
+        cell_gradient = TensorBlock(
+            values=create_random_array(4, 6, 2 * i + 1, 5),
+            samples=Labels(
+                ["sample", "structure"],
+                create_int32_array([[0, 0], [1, 1], [2, 2], [3, 3]]),
+            ),
+            components=[
+                Labels(
+                    ["direction_xx_yy_zz_yz_xz_xy"],
+                    create_int32_array([[0], [1], [2], [3], [4], [5]]),
+                ),
+                component,
+            ],
+            properties=block.properties,
+        )
+        block.add_gradient("cell", cell_gradient)
+        blocks.append(block)
+
+    return TensorMap(Labels(["key"], create_int32_array([[0], [1], [2]])), blocks)
diff --git a/tox.ini b/tox.ini
index 888459668..4b4f6d7d2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -117,6 +117,7 @@ deps =
     {[testenv]packaging_deps}
     {[testenv]testing_deps}
     numpy <2.0
+    skmatter
 
 changedir = python/metatensor-learn
 commands =
@@ -136,6 +137,7 @@ deps =
     {[testenv]packaging_deps}
     {[testenv]testing_deps}
     torch=={env:METATENSOR_TESTS_TORCH_VERSION:2.2.*}
+    skmatter
 
 changedir = python/metatensor-learn
 commands =
@@ -185,6 +187,7 @@ deps =
     numpy <2.0
     torch=={env:METATENSOR_TESTS_TORCH_VERSION:2.2.*}
     ase
+    skmatter
 
 setenv =
     # ignore the fact that metatensor.torch.operations was loaded from a file

From f1dbb06d99b4dd2b4f858cdb000245ecf5360e60 Mon Sep 17 00:00:00 2001
From: Joseph Abbott <joseph.william.abbott@gmail.com>
Date: Wed, 3 Apr 2024 14:14:05 +0200
Subject: [PATCH 2/2] Fix docs tests

---
 .../metatensor-learn/metatensor/learn/selection/__init__.py   | 0
 .../selection/{feature_selection => }/feature_selection.py    | 2 +-
 .../metatensor/learn/selection/feature_selection/__init__.py  | 4 ----
 .../selection/{sample_selection => }/sample_selection.py      | 2 +-
 .../metatensor/learn/selection/sample_selection/__init__.py   | 4 ----
 5 files changed, 2 insertions(+), 10 deletions(-)
 create mode 100644 python/metatensor-learn/metatensor/learn/selection/__init__.py
 rename python/metatensor-learn/metatensor/learn/selection/{feature_selection => }/feature_selection.py (98%)
 delete mode 100644 python/metatensor-learn/metatensor/learn/selection/feature_selection/__init__.py
 rename python/metatensor-learn/metatensor/learn/selection/{sample_selection => }/sample_selection.py (98%)
 delete mode 100644 python/metatensor-learn/metatensor/learn/selection/sample_selection/__init__.py

diff --git a/python/metatensor-learn/metatensor/learn/selection/__init__.py b/python/metatensor-learn/metatensor/learn/selection/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/metatensor-learn/metatensor/learn/selection/feature_selection/feature_selection.py b/python/metatensor-learn/metatensor/learn/selection/feature_selection.py
similarity index 98%
rename from python/metatensor-learn/metatensor/learn/selection/feature_selection/feature_selection.py
rename to python/metatensor-learn/metatensor/learn/selection/feature_selection.py
index a82514fb9..2fdd5acad 100644
--- a/python/metatensor-learn/metatensor/learn/selection/feature_selection/feature_selection.py
+++ b/python/metatensor-learn/metatensor/learn/selection/feature_selection.py
@@ -6,7 +6,7 @@
 
 from skmatter._selection import _CUR, _FPS
 
-from .._selection import GreedySelector
+from ._selection import GreedySelector
 
 
 class FPS(GreedySelector):
diff --git a/python/metatensor-learn/metatensor/learn/selection/feature_selection/__init__.py b/python/metatensor-learn/metatensor/learn/selection/feature_selection/__init__.py
deleted file mode 100644
index 5d981efb2..000000000
--- a/python/metatensor-learn/metatensor/learn/selection/feature_selection/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .feature_selection import CUR, FPS  # noqa
-
-
-___all__ = ["CUR", "FPS"]
diff --git a/python/metatensor-learn/metatensor/learn/selection/sample_selection/sample_selection.py b/python/metatensor-learn/metatensor/learn/selection/sample_selection.py
similarity index 98%
rename from python/metatensor-learn/metatensor/learn/selection/sample_selection/sample_selection.py
rename to python/metatensor-learn/metatensor/learn/selection/sample_selection.py
index 8276ee7db..b1e72d3e1 100644
--- a/python/metatensor-learn/metatensor/learn/selection/sample_selection/sample_selection.py
+++ b/python/metatensor-learn/metatensor/learn/selection/sample_selection.py
@@ -6,7 +6,7 @@
 
 from skmatter._selection import _CUR, _FPS
 
-from .._selection import GreedySelector
+from ._selection import GreedySelector
 
 
 class FPS(GreedySelector):
diff --git a/python/metatensor-learn/metatensor/learn/selection/sample_selection/__init__.py b/python/metatensor-learn/metatensor/learn/selection/sample_selection/__init__.py
deleted file mode 100644
index 8d65c776c..000000000
--- a/python/metatensor-learn/metatensor/learn/selection/sample_selection/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .sample_selection import CUR, FPS  # noqa
-
-
-___all__ = ["CUR", "FPS"]