Skip to content

Commit

Permalink
(chore): good chunk size
Browse files Browse the repository at this point in the history
  • Loading branch information
ilan-gold committed Nov 28, 2024
1 parent 8b95aff commit dee82a2
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 6 deletions.
17 changes: 12 additions & 5 deletions src/anndata/_core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,20 +1096,28 @@ def make_dask_col_from_extension_dtype(

from anndata._io.specs.lazy_methods import (
compute_chunk_layout_for_axis_size,
get_chunksize,
maybe_open_h5,
)
from anndata.experimental import read_lazy
from anndata.experimental import read_elem_lazy
from anndata.experimental.backed._compat import DataArray
from anndata.experimental.backed._compat import xarray as xr

base_path_or_zarr_group = col.attrs.get("base_path_or_zarr_group")
elem_name = col.attrs.get("elem_name")
dims = col.dims
coords = col.coords.copy()
with maybe_open_h5(base_path_or_zarr_group, elem_name) as f:
maybe_chunk_size = get_chunksize(read_elem_lazy(f))
chunk_size = (
compute_chunk_layout_for_axis_size(
1000 if maybe_chunk_size is None else maybe_chunk_size[0], col.shape[0]
),
)

def get_chunk(block_info=None):
with maybe_open_h5(base_path_or_zarr_group, elem_name) as f:
v = read_lazy(f)
v = read_elem_lazy(f)
variable = xr.Variable(
data=xr.core.indexing.LazilyIndexedArray(v), dims=dims
)
Expand All @@ -1128,10 +1136,9 @@ def get_chunk(block_info=None):
dtype = "object"
else:
dtype = col.dtype.numpy_dtype
# TODO: get good chunk size?
return da.map_blocks(
get_chunk,
chunks=(compute_chunk_layout_for_axis_size(1000, col.shape[0]),),
chunks=chunk_size,
meta=np.array([], dtype=dtype),
dtype=dtype,
)
Expand Down Expand Up @@ -1185,7 +1192,7 @@ def get_attrs(annotations: Iterable[Dataset2D]) -> dict:
"""
index_names = np.unique([a.index.name for a in annotations])
assert len(index_names) == 1, "All annotations must have the same index name."
if any(a.index.dtype == "int64" for a in annotations):
if any(np.issubdtype(a.index.dtype, np.integer) for a in annotations):
msg = "Concatenating with a pandas numeric index among the indices. Index may likely not be unique."
warn(msg, UserWarning)
index_keys = [
Expand Down
9 changes: 8 additions & 1 deletion src/anndata/_io/specs/lazy_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import re
from contextlib import contextmanager
from functools import partial
from functools import partial, singledispatch
from pathlib import Path
from typing import TYPE_CHECKING, overload

Expand Down Expand Up @@ -92,6 +92,13 @@ def make_dask_chunk(
return chunk


@singledispatch
def get_chunksize(obj) -> tuple[int, ...]:
    """Return the chunking of ``obj`` as stored in its ``chunks`` attribute.

    This is the fallback implementation of a ``singledispatch`` function;
    types that keep their chunked storage on an inner array can register
    their own implementation with ``get_chunksize.register``.

    Parameters
    ----------
    obj
        Any object. If it exposes a ``chunks`` attribute, that value is
        returned unmodified (so callers receive whatever the object stores
        there, including ``None`` if that is what the attribute holds).

    Returns
    -------
    The value of ``obj.chunks``.

    Raises
    ------
    ValueError
        If ``obj`` has no ``chunks`` attribute.
    """
    if hasattr(obj, "chunks"):
        return obj.chunks
    # The message must be an f-string so the offending type is interpolated
    # rather than the literal placeholder text being shown to the user.
    raise ValueError(f"object of type {type(obj)} has no recognized chunks")


@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0"))
@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0"))
@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
Expand Down
12 changes: 12 additions & 0 deletions src/anndata/experimental/backed/_lazy_arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from anndata._core.index import _subset
from anndata._core.views import as_view
from anndata._io.specs.lazy_methods import get_chunksize
from anndata.compat import H5Array, ZarrArray

from ..._settings import settings
Expand All @@ -28,6 +29,7 @@

class ZarrOrHDF5Wrapper(ZarrArrayWrapper, Generic[K]):
def __init__(self, array: K):
self.chunks = array.chunks
if isinstance(array, ZarrArray):
return super().__init__(array)
self._array = array
Expand Down Expand Up @@ -152,3 +154,13 @@ def _subset_masked(a: DataArray, subset_idx: Index):
@as_view.register(DataArray)
def _view_pd_boolean_array(a: DataArray, view_args):
return a


@get_chunksize.register(MaskedArray)
def _(a: MaskedArray):
    """Resolve the chunking of a ``MaskedArray`` from its ``_values`` member."""
    values = a._values
    return get_chunksize(values)


@get_chunksize.register(CategoricalArray)
def _(a: CategoricalArray):
    """Resolve the chunking of a ``CategoricalArray`` from its ``_codes`` member."""
    codes = a._codes
    return get_chunksize(codes)

0 comments on commit dee82a2

Please sign in to comment.