Fix anndata warnings (#2779)
flying-sheep authored Dec 15, 2023
1 parent 737da3a commit 86dc4d5
Showing 16 changed files with 95 additions and 40 deletions.
1 change: 1 addition & 0 deletions docs/release-notes/1.9.7.md
@@ -6,3 +6,4 @@
- Specify correct version of `matplotlib` dependency {pr}`2733` {smaller}`P Fisher`
- Fix {func}`scanpy.pl.violin` usage of `seaborn.catplot` {pr}`2739` {smaller}`E Roellin`
- Fix {func}`scanpy.pp.highly_variable_genes` to handle the combinations of `inplace` and `subset` consistently {pr}`2757` {smaller}`E Roellin`
- Replace usage of various deprecated functionality from {mod}`anndata` and {mod}`pandas` {pr}`2678` {pr}`2779` {smaller}`P Angerer`
14 changes: 14 additions & 0 deletions pyproject.toml
@@ -167,6 +167,15 @@ markers = [
"internet: tests which rely on internet resources (enable with `--internet-tests`)",
"gpu: tests that use a GPU (currently unused, but needs to be specified here as we import anndata.tests.helpers, which uses it)",
]
filterwarnings = [
# When calling `.show()` in tests, this is raised
"ignore:FigureCanvasAgg is non-interactive:UserWarning",
# We explicitly handle these errors in tests
"error:`anndata.read` is deprecated:FutureWarning",
"error:Observation names are not unique:UserWarning",
"error:The dtype argument is deprecated and will be removed:FutureWarning",
"error:The behavior of DataFrame\\.sum with axis=None is deprecated:FutureWarning",
]

[tool.coverage.run]
data_file = "test-data/coverage"
@@ -176,6 +185,11 @@ omit = ["*/tests/*"]
output = "test-data/coverage.xml"
[tool.coverage.paths]
source = [".", "**/site-packages"]
[tool.coverage.report]
exclude_also = [
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]

[tool.ruff]
select = [
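Note on the new `filterwarnings` entries: they use pytest's `action:message:category` syntax. The `error:` lines promote those specific anndata and pandas deprecation warnings to test failures, while the single `ignore:` line silences the non-interactive-backend notice triggered by `.show()` calls. Any test that deliberately provokes one of the escalated warnings therefore has to assert it explicitly, roughly like this (a sketch, not taken from this diff):

import warnings
import pytest

def test_deprecated_call_is_asserted():
    # With "error:<message>:FutureWarning" active in pyproject.toml, letting this
    # warning escape would fail the test, so it is captured and checked instead.
    with pytest.warns(FutureWarning, match="deprecated"):
        warnings.warn("`anndata.read` is deprecated", FutureWarning)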
2 changes: 1 addition & 1 deletion scanpy/datasets/_ebi_expression_atlas.py
@@ -123,7 +123,7 @@ def ebi_expression_atlas(
experiment_dir = settings.datasetdir / accession
dataset_path = experiment_dir / f"{accession}.h5ad"
try:
adata = anndata.read(dataset_path)
adata = anndata.read_h5ad(dataset_path)
if filter_boring:
adata.obs = _filter_boring(adata.obs)
return adata
11 changes: 5 additions & 6 deletions scanpy/external/pp/_bbknn.py
@@ -70,18 +70,17 @@ def bbknn(
PyNNDescent supports metrics listed in `pynndescent.distances.named_distances`
and custom functions, including compiled Numba code.
>>> pynndescent.distances.named_distances.keys()
>>> import pynndescent
>>> pynndescent.distances.named_distances.keys() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
dict_keys(['euclidean', 'l2', 'sqeuclidean', 'manhattan', 'taxicab', 'l1', 'chebyshev', 'linfinity',
'linfty', 'linf', 'minkowski', 'seuclidean', 'standardised_euclidean', 'wminkowski', 'weighted_minkowski',
'mahalanobis', 'canberra', 'cosine', 'dot', 'correlation', 'hellinger', 'haversine', 'braycurtis', 'spearmanr',
'kantorovich', 'wasserstein', 'tsss', 'true_angular', 'hamming', 'jaccard', 'dice', 'matching', 'kulsinski',
'rogerstanimoto', 'russellrao', 'sokalsneath', 'sokalmichener', 'yule'])
'linfty', 'linf', 'minkowski', 'seuclidean', 'standardised_euclidean', 'wminkowski', ...])
KDTree supports members of :class:`sklearn.neighbors.KDTree`’s ``valid_metrics`` list, or parameterised
:class:`~sklearn.metrics.DistanceMetric` objects:
>>> import sklearn.neighbors
>>> sklearn.neighbors.KDTree.valid_metrics
['p', 'chebyshev', 'cityblock', 'minkowski', 'infinity', 'l2', 'euclidean', 'manhattan', 'l1']
['euclidean', 'l2', 'minkowski', 'p', 'manhattan', 'cityblock', 'l1', 'chebyshev', 'infinity']
.. note:: check the relevant documentation for up-to-date lists.
copy
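Note: the trimmed expected output above relies on the newly added doctest directives: `+ELLIPSIS` lets the literal `...` stand in for the remaining metric names, and `+NORMALIZE_WHITESPACE` tolerates the reflowed lines. A minimal standalone illustration of the same mechanism (toy data, unrelated to pynndescent):

import doctest

def metrics_demo():
    """
    >>> sorted(["euclidean", "manhattan", "cosine", "chebyshev"])  # doctest: +ELLIPSIS
    ['chebyshev', 'cosine', ...]
    """

# testmod collects and runs the docstring examples in this module; the
# +ELLIPSIS directive lets '...' match the omitted tail of the expected output.
if __name__ == "__main__":
    raise SystemExit(doctest.testmod().failed)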
2 changes: 1 addition & 1 deletion scanpy/external/pp/_hashsolo.py
@@ -324,7 +324,7 @@ def hashsolo(
-------
>>> import anndata
>>> import scanpy.external as sce
>>> adata = anndata.read("data.h5ad")
>>> adata = anndata.read_h5ad("data.h5ad")
>>> sce.pp.hashsolo(adata, ['Hash1', 'Hash2', 'Hash3'])
>>> adata.obs.head()
"""
3 changes: 3 additions & 0 deletions scanpy/preprocessing/_simple.py
@@ -96,6 +96,9 @@ def filter_cells(
--------
>>> import scanpy as sc
>>> adata = sc.datasets.krumsiek11()
UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
utils.warn_names_duplicates("obs")
>>> adata.obs_names_make_unique()
>>> adata.n_obs
640
>>> adata.var_names.tolist() # doctest: +NORMALIZE_WHITESPACE
4 changes: 2 additions & 2 deletions scanpy/readwrite.py
@@ -14,12 +14,12 @@
AnnData,
read_csv,
read_excel,
read_h5ad,
read_hdf,
read_loom,
read_mtx,
read_text,
)
from anndata import read as read_h5ad
from matplotlib.image import imread

from . import logging as logg
@@ -904,7 +904,7 @@ def _read_softgz(filename: str | bytes | Path | BinaryIO) -> AnnData:
X = np.array(X).T
obs = pd.DataFrame({"groups": groups}, index=sample_names)
var = pd.DataFrame(index=gene_names)
return AnnData(X=X, obs=obs, var=var, dtype=X.dtype)
return AnnData(X=X, obs=obs, var=var)


# -------------------------------------------------------------------------------
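Note: both hunks in this file track anndata deprecations. `read_h5ad` is now imported directly instead of aliasing the deprecated module-level `read`, and `AnnData` is no longer given a `dtype=` argument, so the matrix has to carry the intended dtype itself. A hedged sketch of the resulting call pattern (file name and toy data are illustrative only):

import numpy as np
import pandas as pd
from anndata import AnnData, read_h5ad  # preferred over the deprecated `anndata.read`

# Cast the matrix up front instead of passing dtype= to the AnnData constructor.
X = np.arange(12, dtype=np.float32).reshape(3, 4)
adata = AnnData(
    X=X,
    obs=pd.DataFrame(index=[f"cell{i}" for i in range(3)]),
    var=pd.DataFrame(index=[f"gene{i}" for i in range(4)]),
)
adata.write_h5ad("example.h5ad")  # hypothetical path
assert read_h5ad("example.h5ad").shape == (3, 4)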
8 changes: 7 additions & 1 deletion scanpy/testing/_helpers/data.py
@@ -5,6 +5,8 @@

from __future__ import annotations

import warnings

try:
from functools import cache
except ImportError: # Python < 3.9
@@ -47,7 +49,11 @@ def pbmc68k_reduced() -> AnnData:


def krumsiek11() -> AnnData:
return _krumsiek11().copy()
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", "Observation names are not unique", module="anndata"
)
return _krumsiek11().copy()


def paul15() -> AnnData:
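Note: the helper now suppresses the duplicate-name warning only while the cached `krumsiek11` object is copied. `warnings.catch_warnings()` snapshots the filter state and restores it on exit, so the `ignore` filter cannot leak into calling tests, which (per `test_datasets.py` below) are still expected to see the warning from the public dataset function. A small standalone sketch of the pattern, with a stand-in for the warning source:

import warnings

def make_noisy_copy():
    # Stand-in for copying an AnnData object that has duplicate obs names.
    warnings.warn("Observation names are not unique", UserWarning)
    return object()

with warnings.catch_warnings():
    # Ignored only inside this block; the real helper also pins module="anndata".
    warnings.filterwarnings("ignore", "Observation names are not unique")
    obj = make_noisy_copy()

# Outside the block the previous filters apply again, so the warning is visible.
warnings.warn("Observation names are not unique", UserWarning)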
24 changes: 23 additions & 1 deletion scanpy/testing/_pytest/fixtures/__init__.py
@@ -4,6 +4,7 @@
"""
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import numpy as np
@@ -16,6 +17,7 @@
)

if TYPE_CHECKING:
from collections.abc import Generator
from pathlib import Path

__all__ = [
@@ -33,11 +35,31 @@ def float_dtype(request):


@pytest.fixture()
def doctest_env(cache: pytest.Cache, tmp_path: Path) -> None:
def doctest_env(cache: pytest.Cache, tmp_path: Path) -> Generator[None, None, None]:
from scanpy import settings
from scanpy._compat import chdir

showwarning_orig = warnings.showwarning

def showwarning(message, category, filename, lineno, file=None, line=None):
if file is None:
if line is None:
import linecache

line = linecache.getline(filename, lineno)
line = line.strip()
print(f"{category.__name__}: {message}\n {line}")
else:
showwarning_orig(message, category, filename, lineno, file, line)

# make errors visible and the rest ignored
warnings.filters = [
("default", *rest) for action, *rest in warnings.filters if action == "error"
] + [("ignore", None, Warning, None, 0)]

warnings.showwarning = showwarning
old_dd, settings.datasetdir = settings.datasetdir, cache.mkdir("scanpy-data")
with chdir(tmp_path):
yield
warnings.showwarning = showwarning_orig
settings.datasetdir = old_dd
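
Note: two things happen in this fixture. A custom `showwarning` prints warnings as `Category: message` plus the offending source line, which is exactly the format the updated `filter_cells` doctest expects, and `warnings.filters` is rebuilt by hand. Each entry in that list is a 5-tuple `(action, message, category, module, lineno)`, so the comprehension keeps only the `error` entries, downgrades them to `default` (printed, not raised), and appends a catch-all `ignore`. A standalone sketch of the filter manipulation, independent of pytest:

import warnings

# Keep only the filters that escalate warnings to errors, downgrade them to
# "default" so they are printed instead of raised, and ignore everything else.
visible = [("default", *rest) for action, *rest in warnings.filters if action == "error"]
warnings.filters = visible + [("ignore", None, Warning, None, 0)]

warnings.warn("swallowed by the catch-all ignore filter", UserWarning)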
6 changes: 4 additions & 2 deletions scanpy/tests/test_combat.py
@@ -2,6 +2,7 @@

import numpy as np
import pandas as pd
import pytest
from anndata.tests.helpers import assert_equal
from sklearn.metrics import silhouette_score

@@ -59,8 +60,9 @@ def test_combat_obs_names():
{"batch": pd.Categorical(np.random.randint(0, 2, 200))},
index=np.repeat(np.arange(100), 2).astype(str), # Non-unique index
)
a = sc.AnnData(X, obs)
b = a.copy()
with pytest.warns(UserWarning, match="Observation names are not unique"):
a = sc.AnnData(X, obs)
b = a.copy()
b.obs_names_make_unique()

sc.pp.combat(a, "batch")
6 changes: 4 additions & 2 deletions scanpy/tests/test_datasets.py
@@ -64,7 +64,8 @@ def test_ebi_expression_atlas(tmp_dataset_dir):


def test_krumsiek11(tmp_dataset_dir):
adata = sc.datasets.krumsiek11()
with pytest.warns(UserWarning, match=r"Observation names are not unique"):
adata = sc.datasets.krumsiek11()
assert adata.shape == (640, 11)
assert all(
np.unique(adata.obs["cell_type"])
@@ -80,7 +81,8 @@ def test_blobs():


def test_toggleswitch():
sc.datasets.toggleswitch()
with pytest.warns(UserWarning, match=r"Observation names are not unique"):
sc.datasets.toggleswitch()


def test_pbmc68k_reduced():
38 changes: 21 additions & 17 deletions scanpy/tests/test_get.py
@@ -13,19 +13,20 @@
from scanpy.datasets._utils import filter_oldformatwarning
from scanpy.testing._helpers.data import pbmc68k_reduced


# Override so warning gets caught
def transpose_adata(adata: AnnData, *, expect_duplicates: bool = False) -> AnnData:
if not expect_duplicates:
return adata.T
with pytest.warns(UserWarning, match=r"Observation names are not unique"):
return adata.T


TRANSPOSE_PARAMS = pytest.mark.parametrize(
"dim,transform,func",
[
(
"obs",
lambda x: x,
sc.get.obs_df,
),
(
"var",
lambda x: x.T,
sc.get.var_df,
),
("obs", lambda x, expect_duplicates=False: x, sc.get.obs_df),
("var", transpose_adata, sc.get.var_df),
],
ids=["obs_df", "var_df"],
)
@@ -140,11 +141,12 @@ def test_obs_df(adata):
assert all(badkey_err.match(k) for k in badkeys)

# test non unique index
adata = sc.AnnData(
np.arange(16).reshape(4, 4),
obs=pd.DataFrame(index=["a", "a", "b", "c"]),
var=pd.DataFrame(index=[f"gene{i}" for i in range(4)]),
)
with pytest.warns(UserWarning, match=r"Observation names are not unique"):
adata = sc.AnnData(
np.arange(16).reshape(4, 4),
obs=pd.DataFrame(index=["a", "a", "b", "c"]),
var=pd.DataFrame(index=[f"gene{i}" for i in range(4)]),
)
df = sc.get.obs_df(adata, ["gene1"])
pd.testing.assert_index_equal(df.index, adata.obs_names)

@@ -373,16 +375,18 @@ def test_repeated_cols(dim, transform, func):

@TRANSPOSE_PARAMS
def test_repeated_index_vals(dim, transform, func):
# THis one could be reverted, see:
# This one could be reverted, see:
# https://github.com/scverse/scanpy/pull/1583#issuecomment-770641710
alt_dim = ["obs", "var"][dim == "obs"]

adata = transform(
sc.AnnData(
np.ones((5, 10)),
var=pd.DataFrame(
index=["repeated_id"] * 2 + [f"gene-{i}" for i in range(8)]
),
)
),
expect_duplicates=True,
)

with pytest.raises(
4 changes: 2 additions & 2 deletions scanpy/tests/test_rank_genes_groups.py
@@ -215,15 +215,15 @@ def test_emptycat():

def test_log1p_save_restore(tmp_path):
"""tests the sequence log1p→save→load→rank_genes_groups"""
from anndata import read
from anndata import read_h5ad

pbmc = pbmc68k_reduced()
sc.pp.log1p(pbmc)

path = tmp_path / "test.h5ad"
pbmc.write(path)

pbmc = read(path)
pbmc = read_h5ad(path)

sc.tl.rank_genes_groups(pbmc, groupby="bulk_labels", use_raw=True)

2 changes: 1 addition & 1 deletion scanpy/tests/test_scaling.py
@@ -99,7 +99,7 @@ def test_scale(typ, dtype, mask, X, X_centered, X_scaled):
def test_mask_string():
with pytest.raises(ValueError):
sc.pp.scale(np.array(X_original), mask="mask")
adata = AnnData(np.array(X_for_mask), dtype="float32")
adata = AnnData(np.array(X_for_mask, dtype="float32"))
adata.obs["some cells"] = np.array((0, 0, 1, 1, 1, 0, 0), dtype=bool)
sc.pp.scale(adata, mask="some cells")
assert np.array_equal(adata.X, X_centered_for_mask)
6 changes: 4 additions & 2 deletions scanpy/tests/test_sim.py
@@ -1,10 +1,12 @@
from __future__ import annotations

import numpy as np
import pytest

import scanpy as sc


def test_sim_toggleswitch(tmp_write_dir):
adata = sc.tl.sim("toggleswitch")
np.allclose(adata.X, sc.datasets.toggleswitch().X, np.finfo(np.float32).eps)
with pytest.warns(UserWarning, match=r"Observation names are not unique"):
adata = sc.tl.sim("toggleswitch")
np.allclose(adata.X, sc.datasets.toggleswitch().X, np.finfo(np.float32).eps)
4 changes: 2 additions & 2 deletions scanpy/tools/_rank_genes_groups.py
@@ -294,7 +294,7 @@ def wilcoxon(

# Calculate rank sums for each chunk for the current mask
for ranks, left, right in _ranks(self.X, mask, mask_rest):
scores[left:right] = np.sum(ranks.iloc[0:n_active, :])
scores[left:right] = ranks.iloc[0:n_active, :].sum(axis=0)
if tie_correct:
T[left:right] = _tiecorrect(ranks)

@@ -322,7 +322,7 @@ def wilcoxon(
for ranks, left, right in _ranks(self.X):
# sum up adjusted_ranks to calculate W_m,n
for imask, mask in enumerate(self.groups_masks):
scores[imask, left:right] = np.sum(ranks.iloc[mask, :])
scores[imask, left:right] = ranks.iloc[mask, :].sum(axis=0)
if tie_correct:
T[imask, left:right] = _tiecorrect(ranks)

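Note: the two `wilcoxon` hunks address the pandas deprecation listed in `pyproject.toml`. `np.sum(df)` forwards `axis=None` to `DataFrame.sum`, which today still returns per-column sums but is slated to reduce over both axes and return a scalar, so the code now asks for the per-gene column sums explicitly with `axis=0`. A toy illustration (not scanpy's rank matrices):

import numpy as np
import pandas as pd

ranks = pd.DataFrame(np.arange(6.0).reshape(3, 2), columns=["gene0", "gene1"])

# Deprecated path: emits the "DataFrame.sum with axis=None" FutureWarning.
old_style = np.sum(ranks)

# Replacement used in this commit: per-column sums, no warning.
new_style = ranks.sum(axis=0)

assert new_style.tolist() == [6.0, 9.0]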
