diff --git a/.gitignore b/.gitignore index 88e01a343..4053f744f 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,5 @@ test.h5ad # IDEs /.idea/ +/.vscode/ diff --git a/anndata/__init__.py b/anndata/__init__.py index ad7e4c68b..ba7861dad 100644 --- a/anndata/__init__.py +++ b/anndata/__init__.py @@ -18,7 +18,12 @@ read_mtx, read_zarr, ) - from ._warnings import OldFormatWarning, WriteWarning, ImplicitModificationWarning + from ._warnings import ( + OldFormatWarning, + WriteWarning, + ImplicitModificationWarning, + ExperimentalFeatureWarning, + ) # backwards compat / shortcut for default format from ._io import read_h5ad as read diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index 2c8430794..b99325727 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -1,18 +1,22 @@ from abc import ABC, abstractmethod from collections import abc as cabc +from copy import copy from typing import Union, Optional, Type, ClassVar, TypeVar # Special types from typing import Iterator, Mapping, Sequence # ABCs from typing import Tuple, List, Dict # Generic base types +import warnings import numpy as np import pandas as pd from scipy.sparse import spmatrix -from ..utils import deprecated, ensure_df_homogeneous +from ..utils import deprecated, ensure_df_homogeneous, dim_len from . import raw, anndata from .views import as_view from .access import ElementRef from .index import _subset +from anndata.compat import AwkArray +from anndata._warnings import ExperimentalFeatureWarning OneDIdx = Union[Sequence[int], Sequence[bool], slice] @@ -46,15 +50,37 @@ def _ipython_key_completions_(self) -> List[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" + if isinstance(val, AwkArray): + warnings.warn( + "Support for Awkward Arrays is currently experimental. " + "Behavior may change in the future. 
Please report any issues you may encounter!", + ExperimentalFeatureWarning, + # stacklevel=3, + ) + # Prevent from showing up every time an awkward array is used + # You'd think `once` works, but it doesn't at the repl and in notebooks + warnings.filterwarnings( + "ignore", + category=ExperimentalFeatureWarning, + message="Support for Awkward Arrays is currently experimental.*", + ) for i, axis in enumerate(self.axes): - if self.parent.shape[axis] != val.shape[i]: + if self.parent.shape[axis] != dim_len(val, i): right_shape = tuple(self.parent.shape[a] for a in self.axes) - raise ValueError( - f"Value passed for key {key!r} is of incorrect shape. " - f"Values of {self.attrname} must match dimensions " - f"{self.axes} of parent. Value had shape {val.shape} while " - f"it should have had {right_shape}." - ) + actual_shape = tuple(dim_len(val, a) for a, _ in enumerate(self.axes)) + if actual_shape[i] is None and isinstance(val, AwkArray): + raise ValueError( + f"The AwkwardArray is of variable length in dimension {i}. " + f"Try ak.to_regular(array, {i}) before including the array in AnnData", + ) + else: + raise ValueError( + f"Value passed for key {key!r} is of incorrect shape. " + f"Values of {self.attrname} must match dimensions " + f"{self.axes} of parent. Value had shape {actual_shape} while " + f"it should have had {right_shape}." 
+ ) + if not self._allow_df and isinstance(val, pd.DataFrame): name = self.attrname.title().rstrip("s") val = ensure_df_homogeneous(val, f"{name} {key!r}") @@ -84,7 +110,11 @@ def parent(self) -> Union["anndata.AnnData", "raw.Raw"]: def copy(self): d = self._actual_class(self.parent, self._axis) for k, v in self.items(): - d[k] = v.copy() + if isinstance(v, AwkArray): + # Shallow copy since awkward array buffers are immutable + d[k] = copy(v) + else: + d[k] = v.copy() return d def _view(self, parent: "anndata.AnnData", subset_idx: I): diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 76033f23a..69dd0e159 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -45,7 +45,7 @@ ) from .sparse_dataset import SparseDataset from .. import utils -from ..utils import convert_to_dict, ensure_df_homogeneous +from ..utils import convert_to_dict, ensure_df_homogeneous, dim_len from ..logging import anndata_logger as logger from ..compat import ( ZarrArray, @@ -55,6 +55,7 @@ _move_adj_mtx, _overloaded_uns, OverloadedDict, + AwkArray, ) @@ -1861,7 +1862,7 @@ def _check_dimensions(self, key=None): if "obsm" in key: obsm = self._obsm if ( - not all([o.shape[0] == self._n_obs for o in obsm.values()]) + not all([dim_len(o, 0) == self._n_obs for o in obsm.values()]) and len(obsm.dim_names) != self._n_obs ): raise ValueError( @@ -1871,7 +1872,7 @@ def _check_dimensions(self, key=None): if "varm" in key: varm = self._varm if ( - not all([v.shape[0] == self._n_vars for v in varm.values()]) + not all([dim_len(v, 0) == self._n_vars for v in varm.values()]) and len(varm.dim_names) != self._n_vars ): raise ValueError( diff --git a/anndata/_core/file_backing.py b/anndata/_core/file_backing.py index ce161335d..f7f42bb3b 100644 --- a/anndata/_core/file_backing.py +++ b/anndata/_core/file_backing.py @@ -8,7 +8,7 @@ from . 
import anndata from .sparse_dataset import SparseDataset -from ..compat import ZarrArray, DaskArray +from ..compat import ZarrArray, DaskArray, AwkArray class AnnDataFileManager: @@ -123,3 +123,13 @@ def _(x, copy=True): @to_memory.register(Mapping) def _(x: Mapping, copy=True): return {k: to_memory(v, copy=copy) for k, v in x.items()} + + +@to_memory.register(AwkArray) +def _(x, copy=True): + from copy import copy as _copy + + if copy: + return _copy(x) + else: + return x diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 212398058..859c1bcdd 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from scipy.sparse import spmatrix, issparse -from ..compat import DaskArray, Index, Index1D +from ..compat import AwkArray, DaskArray, Index, Index1D def _normalize_indices( @@ -145,6 +145,13 @@ def _subset_df(df: pd.DataFrame, subset_idx: Index): return df.iloc[subset_idx] +@_subset.register(AwkArray) +def _subset_awkarray(a: AwkArray, subset_idx: Index): + if all(isinstance(x, cabc.Iterable) for x in subset_idx): + subset_idx = np.ix_(*subset_idx) + return a[subset_idx] + + # Registration for SparseDataset occurs in sparse_dataset.py @_subset.register(h5py.Dataset) def _subset_dataset(d, subset_idx): diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 5bd3856b0..0f44c2326 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -18,7 +18,7 @@ Literal, ) import typing -from warnings import warn +from warnings import warn, filterwarnings from natsort import natsorted import numpy as np @@ -27,9 +27,10 @@ from scipy.sparse import spmatrix from .anndata import AnnData -from ..utils import asarray -from ..compat import DaskArray +from ..compat import AwkArray, DaskArray +from ..utils import asarray, dim_len from .index import _subset, make_slice +from anndata._warnings import ExperimentalFeatureWarning T = TypeVar("T") @@ -154,6 +155,13 @@ def equal_sparse(a, b) -> bool: return 
False +@equal.register(AwkArray) +def equal_awkward(a, b) -> bool: + from ..compat import awkward as ak + + return ak.almost_equal(a, b) + + def as_sparse(x): if not isinstance(x, sparse.spmatrix): return sparse.csr_matrix(x) @@ -366,12 +374,14 @@ def apply(self, el, *, axis, fill_value=None): Missing values are to be replaced with `fill_value`. """ - if self.no_change and (el.shape[axis] == len(self.old_idx)): + if self.no_change and (dim_len(el, axis) == len(self.old_idx)): return el if isinstance(el, pd.DataFrame): return self._apply_to_df(el, axis=axis, fill_value=fill_value) elif isinstance(el, sparse.spmatrix): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) + elif isinstance(el, AwkArray): + return self._apply_to_awkward(el, axis=axis, fill_value=fill_value) elif isinstance(el, DaskArray): return self._apply_to_dask_array(el, axis=axis, fill_value=fill_value) else: @@ -468,6 +478,22 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: return out + def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None): + import awkward as ak + + if self.no_change: + return el + elif axis == 1: # Indexing by field + if self.new_idx.isin(self.old_idx).all(): # inner join + return el[self.new_idx] + else: # outer join + # TODO: this code isn't actually hit, we should refactor + raise Exception("This should be unreachable, please open an issue.") + else: + if len(self.new_idx) > len(self.old_idx): + el = ak.pad_none(el, 1, axis=axis) # axis == 0 + return el[self.old_idx.get_indexer(self.new_idx)] + def merge_indices( inds: Iterable[pd.Index], join: Literal["inner", "outer"] @@ -534,6 +560,17 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ) df.index = index return df + elif any(isinstance(a, AwkArray) for a in arrays): + from ..compat import awkward as ak + + if not all( + isinstance(a, AwkArray) or a is MissingVal or 0 in a.shape for a in arrays + ): + raise NotImplementedError( + 
"Cannot concatenate an AwkwardArray with other array types." + ) + + return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)], axis=axis) elif any(isinstance(a, sparse.spmatrix) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] return sparse_stack( @@ -579,6 +616,15 @@ def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0): lambda x, y: x.intersection(y), (df_indices(el) for el in els) ) reindexers = [Reindexer(df_indices(el), common_ind) for el in els] + elif any(isinstance(el, AwkArray) for el in els if not_missing(el)): + if not all(isinstance(el, AwkArray) for el in els if not_missing(el)): + raise NotImplementedError( + "Cannot concatenate an AwkwardArray with other array types." + ) + common_keys = intersect_keys(el.fields for el in els) + reindexers = [ + Reindexer(pd.Index(el.fields), pd.Index(list(common_keys))) for el in els + ] else: min_ind = min(el.shape[alt_axis] for el in els) reindexers = [ @@ -596,10 +642,38 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): else (lambda x: pd.DataFrame(index=range(shape))) for el, shape in zip(els, shapes) ] - else: - # if fill_value is None: - # fill_value = default_fill_value(els) + elif any(isinstance(el, AwkArray) for el in els if not_missing(el)): + import awkward as ak + if not all(isinstance(el, AwkArray) for el in els if not_missing(el)): + raise NotImplementedError( + "Cannot concatenate an AwkwardArray with other array types." + ) + warn( + "Outer joins on awkward.Arrays will have different return values in the future. " 
+ "For details, and to offer input, please see:\n\n\t" + "https://github.com/scverse/anndata/issues/898", + ExperimentalFeatureWarning, + ) + filterwarnings( + "ignore", + category=ExperimentalFeatureWarning, + message=r"Outer joins on awkward.Arrays will have different return values.*", + ) + # all_keys = union_keys(el.fields for el in els if not_missing(el)) + reindexers = [] + for el in els: + if not_missing(el): + reindexers.append(lambda x: x) + else: + reindexers.append( + lambda x: ak.pad_none( + ak.Array([]), + len(x), + 0, + ) + ) + else: max_col = max(el.shape[1] for el in els if not_missing(el)) orig_cols = [el.shape[1] if not_missing(el) else 0 for el in els] reindexers = [ diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 5b8433053..514337f83 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -1,5 +1,6 @@ from contextlib import contextmanager from copy import deepcopy +from enum import Enum from functools import reduce, singledispatch, wraps from typing import Any, KeysView, Optional, Sequence, Tuple import warnings @@ -12,7 +13,7 @@ import anndata from anndata._warnings import ImplicitModificationWarning from .access import ElementRef -from ..compat import ZappyArray, DaskArray +from ..compat import ZappyArray, AwkArray, DaskArray class _SetItemMixin: @@ -198,6 +199,67 @@ def as_view_zappy(z, view_args): return z +try: + from ..compat import awkward as ak + import weakref + + # Registry to store weak references from AwkwardArrayViews to their parent AnnData container + _registry = weakref.WeakValueDictionary() + _PARAM_NAME = "_view_args" + + class AwkwardArrayView(_ViewMixin, AwkArray): + @property + def _view_args(self): + """Override _view_args to retrieve the values from awkward arrays parameters. + + Awkward arrays cannot be subclassed like other python objects. Instead subclasses need + to be attached as "behavior". 
These "behaviors" cannot take any additional parameters (as we do + for other data types to store `_view_args`). Therefore, we need to store `_view_args` using awkward's + parameter mechanism. These parameters need to be json-serializable, which is why we can't store + ElementRef directly, but need to replace the reference to the parent AnnDataView container with a weak + reference. + """ + parent_key, attrname, keys = self.layout.parameter(_PARAM_NAME) + parent = _registry[parent_key] + return ElementRef(parent, attrname, keys) + + def __copy__(self) -> AwkArray: + """ + Turn the AwkwardArrayView into an actual AwkwardArray with no special behavior. + + Need to override __copy__ instead of `.copy()` as awkward arrays don't implement `.copy()` + and are copied using python's standard copy mechanism in `aligned_mapping.py`. + """ + array = self + # makes a shallow copy and removes the reference to the original AnnData object + array = ak.with_parameter(self, _PARAM_NAME, None) + array = ak.with_parameter(array, "__array__", None) + return array + + @as_view.register(AwkArray) + def as_view_awkarray(array, view_args): + parent, attrname, keys = view_args + parent_key = f"target-{id(parent)}" + _registry[parent_key] = parent + # TODO: See https://github.com/scverse/anndata/pull/647#discussion_r963494798_ for more details and + # possible strategies to stack behaviors. + # A better solution might be based on xarray-style "attrs", once this is implemented + # https://github.com/scikit-hep/awkward/issues/1391#issuecomment-1412297114 + if type(array).__name__ != "Array": + raise NotImplementedError( + "Cannot create a view of an awkward array with __array__ parameter. " + "Please open an issue in the AnnData repo and describe your use-case." 
+ ) + array = ak.with_parameter(array, _PARAM_NAME, (parent_key, attrname, keys)) + array = ak.with_parameter(array, "__array__", "AwkwardArrayView") + return array + + ak.behavior["AwkwardArrayView"] = AwkwardArrayView + +except ImportError: + pass + + def _resolve_idxs(old, new, adata): t = tuple(_resolve_idx(old[i], new[i], adata.shape[i]) for i in (0, 1)) return t diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 5f6d64f13..e1789a987 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -29,6 +29,7 @@ ) from anndata._io.utils import report_write_key_on_error, check_key, H5PY_V3 from anndata._warnings import OldFormatWarning +from anndata.compat import AwkArray from .registry import ( _REGISTRY, @@ -494,6 +495,42 @@ def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))) return SparseDataset(elem)[indices] +################# +# Awkward array # +################# + + +@_REGISTRY.register_write(H5Group, AwkArray, IOSpec("awkward-array", "0.1.0")) +@_REGISTRY.register_write(ZarrGroup, AwkArray, IOSpec("awkward-array", "0.1.0")) +@_REGISTRY.register_write( + H5Group, views.AwkwardArrayView, IOSpec("awkward-array", "0.1.0") +) +@_REGISTRY.register_write( + ZarrGroup, views.AwkwardArrayView, IOSpec("awkward-array", "0.1.0") +) +def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): + from anndata.compat import awkward as ak + + group = f.create_group(k) + form, length, container = ak.to_buffers(ak.to_packed(v)) + group.attrs["length"] = length + group.attrs["form"] = form.to_json() + for k, v in container.items(): + write_elem(group, k, v, dataset_kwargs=dataset_kwargs) + + +@_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) +@_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) +def read_awkward(elem): + from anndata.compat import awkward as ak + + form = _read_attr(elem.attrs, "form") + length = _read_attr(elem.attrs, "length") + container = {k: 
read_elem(elem[k]) for k in elem.keys()} + + return ak.from_buffers(form, length, container) + + ############## # DataFrames # ############## diff --git a/anndata/_warnings.py b/anndata/_warnings.py index 9409f6aab..5bc0c461c 100644 --- a/anndata/_warnings.py +++ b/anndata/_warnings.py @@ -21,3 +21,9 @@ class ImplicitModificationWarning(UserWarning): """ pass + + +class ExperimentalFeatureWarning(Warning): + """Raised when an unstable experimental feature is used.""" + + pass diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index 652ab65f0..1bedd3493 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -42,6 +42,19 @@ def __repr__(): return "mock zarr.core.Group" +try: + import awkward + + AwkArray = awkward.Array + +except ImportError: + + class AwkArray: + @staticmethod + def __repr__(): + return "mock awkward.highlevel.Array" + + try: from zappy.base import ZappyArray except ImportError: diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index dd91d9cdf..bcd607ce8 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -10,13 +10,16 @@ from pandas.api.types import is_numeric_dtype import pytest from scipy import sparse +from anndata.compat import awkward as ak +import random from anndata import AnnData, Raw from anndata._core.views import ArrayView from anndata._core.sparse_dataset import SparseDataset from anndata._core.aligned_mapping import AlignedMapping -from anndata.utils import asarray -from anndata.compat import DaskArray +from anndata.utils import asarray, dim_len + +from anndata.compat import AwkArray, DaskArray # Give this to gen_adata when dask array support is expected. 
GEN_ADATA_DASK_ARGS = dict( @@ -68,6 +71,63 @@ def gen_typed_df(n, index=None): ) +def _gen_awkward_inner(shape, rng, dtype): + # the maximum length a ragged dimension can take + MAX_RAGGED_DIM_LEN = 20 + if not len(shape): + # abort condition -> no dimension left, return an actual value instead + return dtype(rng.randrange(1000)) + else: + curr_dim_len = shape[0] + lil = [] + if curr_dim_len is None: + # ragged dimension, set random length + curr_dim_len = rng.randrange(MAX_RAGGED_DIM_LEN) + + for _ in range(curr_dim_len): + lil.append(_gen_awkward_inner(shape[1:], rng, dtype)) + + return lil + + +def gen_awkward(shape, dtype=np.int32): + """Function to generate an awkward array with random values. + + Awkward array dimensions can either be fixed-length ("regular") or variable length ("ragged") + (the first dimension is always fixed-length). + + + Parameters + ---------- + shape + shape of the array to be generated. Any dimension specified as `None` will be simulated as ragged. + """ + if shape[0] is None: + raise ValueError("The first dimension must be fixed-length.") + + rng = random.Random(123) + shape = np.array(shape) + + if np.any(shape == 0): + # use empty numpy array for fixed dimensions, then add empty singletons for ragged dimensions + var_dims = [i for i, s in enumerate(shape) if s is None] + shape = [s for s in shape if s is not None] + arr = ak.Array(np.empty(shape, dtype=dtype)) + for d in var_dims: + arr = ak.singletons(arr, axis=d - 1) + return arr + else: + lil = _gen_awkward_inner(shape, rng, dtype) + arr = ak.values_astype(AwkArray(lil), dtype) + + # make fixed-length dimensions regular + for i, d in enumerate(shape): + if d is not None: + arr = ak.to_regular(arr, i) + + return arr + + def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame: s = 0 df = pd.DataFrame() @@ -90,8 +150,18 @@ def gen_adata( X_dtype=np.float32, # obs_dtypes, # var_dtypes, - obsm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), 
- varm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), + obsm_types: "Collection[Type]" = ( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + AwkArray, + ), + varm_types: "Collection[Type]" = ( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + AwkArray, + ), layers_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), ) -> AnnData: """\ @@ -136,6 +206,7 @@ def gen_adata( array=np.random.random((M, 50)), sparse=sparse.random(M, 100, format="csr"), df=gen_typed_df(M, obs_names), + awk_2d_ragged=gen_awkward((M, None)), da=da.random.random((M, 50)), ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} @@ -143,6 +214,7 @@ def gen_adata( array=np.random.random((N, 50)), sparse=sparse.random(N, 100, format="csr"), df=gen_typed_df(N, var_names), + awk_2d_ragged=gen_awkward((N, None)), da=da.random.random((N, 50)), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} @@ -166,6 +238,8 @@ def gen_adata( scalar_float=3.0, nested_further=dict(array=np.arange(5)), ), + awkward_regular=gen_awkward((10, 5)), + awkward_ragged=gen_awkward((12, None, None)), # U_recarray=gen_vstr_recarray(N, 5, "U4") ) adata = AnnData( @@ -366,6 +440,15 @@ def are_equal_dataframe(a, b, exact=False, elem_name=None): ) +@assert_equal.register(AwkArray) +def assert_equal_awkarray(a, b, exact=False, elem_name=None): + from anndata.compat import awkward as ak + + if exact: + assert a.type == b.type, format_msg(elem_name) + assert ak.to_list(a) == ak.to_list(b), format_msg(elem_name) + + @assert_equal.register(Mapping) def assert_equal_mapping(a, b, exact=False, elem_name=None): assert set(a.keys()) == set(b.keys()), format_msg(elem_name) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py new file mode 100644 index 000000000..6df635fa1 --- /dev/null +++ b/anndata/tests/test_awkward.py @@ -0,0 +1,346 @@ +"""Tests related to awkward arrays""" +import pytest +import numpy as np +import numpy.testing as npt 
+ +from anndata.tests.helpers import assert_equal, gen_adata, gen_awkward +from anndata.compat import awkward as ak +from anndata import ImplicitModificationWarning +from anndata.utils import dim_len +from anndata import AnnData, read_h5ad +import anndata +import pandas as pd + + +@pytest.mark.parametrize( + "array,shape", + [ + # numpy array + [ak.Array(np.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))), (2, 3, 4, 5)], + # record + [ak.Array([{"a": 1, "b": 2}, {"a": 1, "b": 3}]), (2,)], + # ListType, variable length + [ak.Array([[1], [2, 3], [4, 5, 6]]), (3, None)], + # ListType, happens to have the same length, but is not regular + [ak.Array([[2], [3], [4]]), (3, None)], + # RegularType + nested ListType + [ak.to_regular(ak.Array([[[1, 2], [3]], [[2], [3, 4, 5]]]), 1), (2, 2, None)], + # nested record + [ + ak.to_regular(ak.Array([[{"a": 0}, {"b": 1}], [{"c": 2}, {"d": 3}]]), 1), + (2, 2), + ], + # mixed types (variable length) + [ak.Array([[1, 2], ["a"]]), (2, None)], + # mixed types (but regular) + [ak.to_regular(ak.Array([[1, 2], ["a", "b"]]), 1), (2, 2)], + # zero-size edge cases + [ak.Array(np.ones((0, 7))), (0, 7)], + [ak.Array(np.ones((7, 0))), (7, 0)], + # UnionType of two regular types with different dimensions + [ + ak.concatenate([ak.Array(np.ones((2, 2))), ak.Array(np.ones((2, 3)))]), + (4, None), + ], + # UnionType of two regular types with same dimension + [ + ak.concatenate( + [ + ak.Array(np.ones((2, 2))), + ak.Array(np.array([["a", "a"], ["a", "a"]])), + ] + ), + (4, 2), + ], + # Array of string types + [ak.Array(["a", "b", "c"]), (3,)], + [ak.Array([["a", "b"], ["c", "d"], ["e", "f"]]), (3, None)], + [ak.to_regular(ak.Array([["a", "b"], ["c", "d"], ["e", "f"]]), 1), (3, 2)], + ], +) +def test_dim_len(array, shape): + """Test that dim_len returns the right value for awkward arrays.""" + for axis, size in enumerate(shape): + assert size == dim_len(array, axis) + + # Requesting the size for an axis higher than the array has dimensions should raise a 
TypeError + with pytest.raises(TypeError): + dim_len(array, len(shape)) + + +@pytest.mark.parametrize( + "field,value,valid", + [ + ["obsm", gen_awkward((10, 5)), True], + ["obsm", gen_awkward((10, None)), True], + ["obsm", gen_awkward((10, None, None)), True], + ["obsm", gen_awkward((10, 5, None)), True], + ["obsm", gen_awkward((8, 10)), False], + ["obsm", gen_awkward((8, None)), False], + ["varm", gen_awkward((20, 5)), True], + ["varm", gen_awkward((20, None)), True], + ["varm", gen_awkward((20, None, None)), True], + ["varm", gen_awkward((20, 5, None)), True], + ["varm", gen_awkward((8, 20)), False], + ["varm", gen_awkward((8, None)), False], + ["uns", gen_awkward((7,)), True], + ["uns", gen_awkward((7, None)), True], + ["uns", gen_awkward((7, None, None)), True], + ], +) +def test_set_awkward(field, value, valid): + """Check if we can set obsm, .varm and .uns with different types + of awkward arrays and if error messages are properly raised when the dimensions do not align. + """ + adata = gen_adata((10, 20), varm_types=(), obsm_types=(), layers_types=()) + + def _assign(): + getattr(adata, field)["test"] = value + + if not valid: + with pytest.raises(ValueError): + _assign() + else: + _assign() + + +@pytest.mark.parametrize("key", ["obsm", "varm", "uns"]) +def test_copy(key): + """Check that modifying a copy does not modify the original""" + adata = gen_adata((3, 3), varm_types=(), obsm_types=(), layers_types=()) + getattr(adata, key)["awk"] = ak.Array([{"a": [1], "b": [2], "c": [3]}] * 3) + adata_copy = adata.copy() + getattr(adata_copy, key)["awk"]["c"] = np.full((3, 1), 4) + getattr(adata_copy, key)["awk"]["d"] = np.full((3, 1), 5) + + # values in copy were correctly set + npt.assert_equal(getattr(adata_copy, key)["awk"]["c"], np.full((3, 1), 4)) + npt.assert_equal(getattr(adata_copy, key)["awk"]["d"], np.full((3, 1), 5)) + + # values in original were not updated + npt.assert_equal(getattr(adata, key)["awk"]["c"], np.full((3, 1), 3)) + with 
pytest.raises(IndexError): + getattr(adata, key)["awk"]["d"] + + +@pytest.mark.parametrize("key", ["obsm", "varm"]) +def test_view(key): + """Check that modifying a view does not modify the original""" + adata = gen_adata((3, 3), varm_types=(), obsm_types=(), layers_types=()) + getattr(adata, key)["awk"] = ak.Array([{"a": [1], "b": [2], "c": [3]}] * 3) + adata_view = adata[:2, :2] + + with pytest.warns(ImplicitModificationWarning, match="initializing view as actual"): + getattr(adata_view, key)["awk"]["c"] = np.full((2, 1), 4) + getattr(adata_view, key)["awk"]["d"] = np.full((2, 1), 5) + + # values in view were correctly set + npt.assert_equal(getattr(adata_view, key)["awk"]["c"], np.full((2, 1), 4)) + npt.assert_equal(getattr(adata_view, key)["awk"]["d"], np.full((2, 1), 5)) + + # values in original were not updated + npt.assert_equal(getattr(adata, key)["awk"]["c"], np.full((3, 1), 3)) + with pytest.raises(IndexError): + getattr(adata, key)["awk"]["d"] + + +def test_view_of_awkward_array_with_custom_behavior(): + """Currently can't create view of arrays with custom __name__ (in this case "string") + See https://github.com/scverse/anndata/pull/647#discussion_r963494798_""" + adata = gen_adata((3, 3), varm_types=(), obsm_types=(), layers_types=()) + adata.obsm["awk_string"] = ak.Array(["AAA", "BBB", "CCC"]) + adata_view = adata[:2] + + with pytest.raises(NotImplementedError): + adata_view.obsm["awk_string"] + + +@pytest.mark.parametrize( + "array", + [ + # numpy array + ak.Array(np.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))), + # record + ak.Array([{"a": 1, "b": 2}, {"a": 1, "b": 3}]), + # ListType, variable length + ak.Array([[1], [2, 3], [4, 5, 6]]), + # RegularType + nested ListType + ak.to_regular(ak.Array([[[1, 2], [3]], [[2], [3, 4, 5]]]), 1), + # nested record + ak.to_regular(ak.Array([[{"a": 0}, {"b": 1}], [{"c": 2}, {"d": 3}]]), 1), + # mixed types (variable length) + ak.Array([[1, 2], ["a"]]), + # zero-size edge cases + ak.Array(np.ones((0, 7))), + 
ak.Array(np.ones((7, 0))), + # UnionType of two regular types with different dimensions + ak.concatenate([ak.Array(np.ones((2, 2))), ak.Array(np.ones((2, 3)))]), + # UnionType of two regular types with same dimension + ak.concatenate( + [ + ak.Array(np.ones((2, 2))), + ak.Array(np.array([["a", "a"], ["a", "a"]])), + ] + ), + # categorical array + ak.to_categorical(ak.Array([["a", "b", "c"], ["a", "b"]])), + ak.to_categorical(ak.Array([[1, 1, 2], [3, 3]])), + # typical record type with AIRR data consisting of different dtypes + ak.Array( + [ + [ + { + "v_call": "TRV1", + "junction_aa": "ADDEEKK", + "productive": True, + "locus": None, + "consensus_count": 3, + }, + { + "v_call": "TRV2", + "productive": False, + "locus": "TRA", + "consensus_count": 4, + }, + ], + [ + { + "v_call": None, + "junction_aa": "ADDEKK", + "productive": None, + "locus": "IGK", + "consensus_count": 3, + } + ], + ] + ), + ], +) +def test_awkward_io(tmp_path, array): + adata = AnnData() + adata.uns["awk"] = array + adata_path = tmp_path / "adata.h5ad" + adata.write_h5ad(adata_path) + + adata2 = read_h5ad(adata_path) + + assert_equal(adata.uns["awk"], adata2.uns["awk"], exact=True) + + +# @pytest.mark.parametrize("join", ["outer", "inner"]) +@pytest.mark.parametrize( + "arrays,join,expected", + [ + pytest.param( + [ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), None], + "inner", + None, + id="awk:recordoflists_null-inner", + ), + pytest.param( + [ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), None], + "outer", + ak.Array( + [{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}, None, None, None] + ), + # maybe should return: ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}, {}, {}, {}]), + id="awk:recordoflists_null-outer", + ), + pytest.param( + [ak.Array([[{"a": 1}, {"a": 2}], []]), None], + "outer", + ak.Array([[{"a": 1}, {"a": 2}], [], None, None, None]), + # maybe should return: ak.Array([[{"a": 1}, {"a": 2}], [], [], []]), + 
id="awk:listofrecords_null-outer", + ), + pytest.param( + [None, ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}])], + "inner", + None, + id="null_awk-inner", + ), + pytest.param( + [None, ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}])], + "outer", + ak.Array( + [None, None, None, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}] + ), + # maybe should return: ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + id="null_awk:recordoflists-outer", + ), + pytest.param( + [ + None, + ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + pd.DataFrame(), + ], + "outer", + NotImplementedError, # TODO: ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + id="null_awk_empty-pd", + ), + pytest.param( + [ + ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + pd.DataFrame(), + ], + "outer", + NotImplementedError, # TODO: ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + id="awk_empty-pd", + ), + pytest.param( + [ + ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + pd.DataFrame().assign(a=[3, 4], b=[5, 6]), + ], + "outer", # TODO: Should try inner too if implemented + NotImplementedError, + ), + pytest.param( + [ + ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + np.ones((3, 2)), + ], + "outer", + NotImplementedError, + ), + ], +) +@pytest.mark.parametrize("key", ["obsm", "varm"]) +def test_concat_mixed_types(key, arrays, expected, join): + """Test that concatenation of AwkwardArrays with arbitrary types, but zero length dimension + or missing values works.""" + axis = 0 if key == "obsm" else 1 + + to_concat = [] + cell_id, gene_id = 0, 0 + for a in arrays: + shape = np.array([3, 3]) # default shape (in case of missing array) + if a is not None: + length = dim_len(a, 0) + shape[axis] = length + + tmp_adata = gen_adata( + tuple(shape), varm_types=(), obsm_types=(), layers_types=() + ) + prev_cell_id, prev_gene_id = cell_id, gene_id + cell_id, 
gene_id = cell_id + shape[0], gene_id + shape[1] + tmp_adata.obs_names = pd.RangeIndex(prev_cell_id, cell_id).astype(str) + tmp_adata.var_names = pd.RangeIndex(prev_gene_id, gene_id).astype(str) + if a is not None: + if isinstance(a, pd.DataFrame): + a.set_index( + tmp_adata.obs_names if key == "obsm" else tmp_adata.var_names, + inplace=True, + ) + getattr(tmp_adata, key)["test"] = a + + to_concat.append(tmp_adata) + + if isinstance(expected, type) and issubclass(expected, Exception): + with pytest.raises(expected): + anndata.concat(to_concat, axis=axis, join=join) + else: + print(to_concat) + result_adata = anndata.concat(to_concat, axis=axis, join=join) + result = getattr(result_adata, key).get("test", None) + assert_equal(expected, result, exact=True) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 687503e52..f3dfb5ed7 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -25,7 +25,7 @@ GEN_ADATA_DASK_ARGS, ) from anndata.utils import asarray -from anndata.compat import DaskArray +from anndata.compat import DaskArray, AwkArray @singledispatch @@ -444,20 +444,27 @@ def get_obs_els(adata): adata1 = gen_adata((10, 10)) adata1.obsm = { - k: v for k, v in adata1.obsm.items() if not isinstance(v, pd.DataFrame) + k: v + for k, v in adata1.obsm.items() + if not isinstance(v, (pd.DataFrame, AwkArray)) } adata2 = gen_adata((10, 5)) adata2.obsm = { k: v[:, : v.shape[1] // 2] for k, v in adata2.obsm.items() - if not isinstance(v, pd.DataFrame) + if not isinstance(v, (pd.DataFrame, AwkArray)) } adata3 = gen_adata((7, 3)) adata3.obsm = { k: v[:, : v.shape[1] // 3] for k, v in adata3.obsm.items() - if not isinstance(v, pd.DataFrame) + if not isinstance(v, (pd.DataFrame, AwkArray)) } + # remove AwkArrays from adata.varm, as outer joins are not yet implemented for them + for tmp_ad in [adata1, adata2, adata3]: + for k in [k for k, v in tmp_ad.varm.items() if isinstance(v, AwkArray)]: + del tmp_ad.varm[k] 
+ joined = adata1.concatenate([adata2, adata3], join="outer", fill_value=fill_val) ptr = 0 @@ -680,6 +687,100 @@ def test_concatenate_with_raw(): assert adata_all.raw is None +def test_concatenate_awkward(join_type): + import awkward as ak + + a = ak.Array([[{"a": 1, "b": "foo"}], [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]]) + b = ak.Array( + [ + [{"a": 4}, {"a": 5}], + [{"a": 6}], + [{"a": 7}], + ] + ) + + adata_a = AnnData(np.zeros((2, 0), dtype=float), obsm={"awk": a}) + adata_b = AnnData(np.zeros((3, 0), dtype=float), obsm={"awk": b}) + + if join_type == "inner": + expected = ak.Array( + [ + [{"a": 1}], + [{"a": 2}, {"a": 3}], + [{"a": 4}, {"a": 5}], + [{"a": 6}], + [{"a": 7}], + ] + ) + elif join_type == "outer": + # TODO: This is what we would like to return, but waiting on: + # * https://github.com/scikit-hep/awkward/issues/2182 and awkward 2.1.0 + # * https://github.com/scikit-hep/awkward/issues/2173 + # expected = ak.Array( + # [ + # [{"a": 1, "b": "foo"}], + # [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}], + # [{"a": 4, "b": None}, {"a": 5, "b": None}], + # [{"a": 6, "b": None}], + # [{"a": 7, "b": None}], + # ] + # ) + expected = ak.concatenate( + [ # I don't think I can construct a UnionArray directly + ak.Array( + [ + [{"a": 1, "b": "foo"}], + [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}], + ] + ), + ak.Array( + [ + [{"a": 4}, {"a": 5}], + [{"a": 6}], + [{"a": 7}], + ] + ), + ] + ) + + result = concat([adata_a, adata_b], join=join_type).obsm["awk"] + + assert_equal(expected, result) + + +@pytest.mark.parametrize( + "other", + [ + pd.DataFrame({"a": [4, 5, 6], "b": ["foo", "bar", "baz"]}, index=list("cde")), + np.ones((3, 2)), + sparse.random(3, 100, format="csr"), + ], +) +def test_awkward_does_not_mix(join_type, other): + import awkward as ak + + awk = ak.Array( + [[{"a": 1, "b": "foo"}], [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]] + ) + + adata_a = AnnData( + np.zeros((2, 3), dtype=float), + obs=pd.DataFrame(index=list("ab")), + obsm={"val": awk}, 
+ ) + adata_b = AnnData( + np.zeros((3, 3), dtype=float), + obs=pd.DataFrame(index=list("cde")), + obsm={"val": other}, + ) + + with pytest.raises( + NotImplementedError, + match="Cannot concatenate an AwkwardArray with other array types", + ): + concat([adata_a, adata_b], join=join_type) + + def test_pairwise_concat(axis, array_type): dim_sizes = [[100, 200, 50], [50, 50, 50]] if axis: @@ -1157,6 +1258,23 @@ def test_concat_size_0_dim(axis, join_type, merge_strategy, shape): alt_axis = 1 - axis dim = ("obs", "var")[axis] + # TODO: Remove, see: https://github.com/scverse/anndata/issues/905 + import awkward as ak + + if ( + (join_type == "inner") + and (merge_strategy in ("same", "unique")) + and ((axis, shape.index(0)) in [(0, 1), (1, 0)]) + and ak.__version__ == "2.0.7" # indicates if a release has happened + ): + aligned_mapping = (b.obsm, b.varm)[1 - axis] + to_remove = [] + for k, v in aligned_mapping.items(): + if isinstance(v, ak.Array): + to_remove.append(k) + for k in to_remove: + aligned_mapping.pop(k) + expected_size = expected_shape(a, b, axis=axis, join=join_type) result = concat( {"a": a, "b": b}, diff --git a/anndata/tests/test_helpers.py b/anndata/tests/test_helpers.py index f111783e8..f0fbcf656 100644 --- a/anndata/tests/test_helpers.py +++ b/anndata/tests/test_helpers.py @@ -6,7 +6,14 @@ from scipy import sparse import anndata as ad -from anndata.tests.helpers import assert_equal, report_name, gen_adata, asarray +from anndata.tests.helpers import ( + assert_equal, + gen_awkward, + report_name, + gen_adata, + asarray, +) +from anndata.utils import dim_len # Testing to see if all error types can have the key name appended. # Currently fails for 22/118 since they have required arguments. Not sure what to do about that. 
@@ -40,6 +47,33 @@ def reusable_adata(): return gen_adata((10, 10)) +@pytest.mark.parametrize( + "shape, datashape", + [ + [(4, 2), "4 * 2 * int32"], + [(100, 200, None), "100 * 200 * var * int32"], + [(4, None), "4 * var * int32"], + [(0, 4), "0 * 4 * int32"], + [(4, 0), "4 * 0 * int32"], + [(8, None, None), "8 * var * var * int32"], + [(8, None, None, None), "8 * var * var * var * int32"], + [(4, None, 8), "4 * var * 8 * int32"], + [(100, 200, 4), "100 * 200 * 4 * int32"], + [(4, 0, 0), "4 * 0 * 0 * int32"], + [(0, 0, 0), "0 * 0 * 0 * int32"], + [(0, None), "0 * var * int32"], + ], +) +def test_gen_awkward(shape, datashape): + import awkward as ak + + arr = gen_awkward(shape) + for i, s in enumerate(shape): + assert dim_len(arr, i) == s + arr_type = ak.types.from_datashape(datashape) + assert arr.type == arr_type + + # Does this work for every warning? def test_report_name(): def raise_error(): diff --git a/anndata/utils.py b/anndata/utils.py index bc00b0218..d9233b2b2 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -59,6 +59,109 @@ def convert_to_dict_nonetype(obj: None): return dict() +@singledispatch +def dim_len(x, axis): + """\ + Return the size of an array in dimension `axis`. + + Returns None if `x` is an awkward array with variable length in the requested dimension. 
+ """ + return x.shape[axis] + + +try: + from .compat import awkward as ak + + def _size_at_depth(layout, depth, lateral_context, **kwargs): + """Callback function for dim_len_awkward, resolving the dim_len for a given level""" + if layout.is_numpy: + # if it's an embedded rectilinear array, we have to deal with its shape + # which might not be 1-dimensional + if layout.is_unknown: + shape = (0,) + else: + shape = layout.shape + numpy_axis = lateral_context["axis"] - depth + 1 + if not (1 <= numpy_axis < len(shape)): + raise TypeError(f"axis={lateral_context['axis']} is too deep") + lateral_context["out"] = shape[numpy_axis] + return ak.contents.EmptyArray() + + elif layout.is_list and depth == lateral_context["axis"]: + if layout.parameter("__array__") in ("string", "bytestring"): + # Strings are implemented like an array of lists of uint8 (ListType(NumpyType(...))) + # which results in an extra hierarchy-level that shouldn't show up in dim_len + # See https://github.com/scikit-hep/awkward/discussions/1654#discussioncomment-3736747 + raise TypeError(f"axis={lateral_context['axis']} is too deep") + + if layout.is_regular: + # if it's a regular list, you want the size + lateral_context["out"] = layout.size + else: + # if it's an irregular list, you want a null token + lateral_context["out"] = -1 + return ak.contents.EmptyArray() + + elif layout.is_record: + # if it's a record, you want to stop descent with an error + raise TypeError( + f"axis={lateral_context['axis']} is too deep, reaches record" + ) + + elif layout.is_union: + # if it's a union, you could get the result of each union branch + # separately and see if they're all the same; if not, it's an error + result = None + for content in layout.contents: + context = {"axis": lateral_context["axis"]} + ak.transform( + _size_at_depth, + content, + lateral_context=context, + ) + if result is None: + result = context["out"] + elif result != context["out"]: + # Union branches have different lengths -> return null 
token + lateral_context["out"] = -1 + return ak.contents.EmptyArray() + lateral_context["out"] = result + return ak.contents.EmptyArray() + + @dim_len.register(ak.Array) + def dim_len_awkward(array, axis): + """Get the length of an awkward array in a given dimension + + Returns None if the dimension is of variable length. + + Code adapted from @jpivarski's solution in https://github.com/scikit-hep/awkward/discussions/1654#discussioncomment-3521574 + """ + if axis < 0: # negative axis is another can of worms... maybe later + raise NotImplementedError("Does not support negative axis") + elif axis == 0: + return len(array) + else: + # communicate with the recursive function using a context (lateral) + context = {"axis": axis} + + # "transform" but we don't care what kind of array it returns + ak.transform( + _size_at_depth, + array, + lateral_context=context, + ) + + # Use `None` as null token. + return None if context["out"] == -1 else context["out"] + + @asarray.register(ak.Array) + def asarray_awkward(x): + return x + +except ImportError: + pass + + def make_index_unique(index: pd.Index, join: str = "-"): """ Makes the index unique by appending a number string to each duplicate index element: diff --git a/docs/fileformat-prose.md b/docs/fileformat-prose.md index 5493afa6c..c459a4329 100644 --- a/docs/fileformat-prose.md +++ b/docs/fileformat-prose.md @@ -315,6 +315,89 @@ values * The group MUST contain an boolean valued array under the key `"mask"` * The `"values"` and `"mask"` arrays MUST be the same shape +## AwkwardArrays + +```{warning} +**Experimental** + +Support for ragged arrays via awkward array is considered experimental under the 0.9.0 release series. +Please direct feedback on it's implementation to [https://github.com/scverse/anndata](https://github.com/scverse/anndata). +``` + +Ragged arrays are supported in `anndata` through the [Awkward +Array](https://awkward-array.org/) library. 
For storage on disk, we +break down the awkward array into it’s constituent arrays using +[`ak.to_buffers`](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_buffers.html) +then writing these arrays using `anndata`’s methods. + +The container of arrays is stored in a group called `"container"` + + +```python +>>> import zarr +>>> z = zarr.open("airr.zarr", "r") +>>> awkward_group = z["obsm/airr"] +>>> awkward_group.tree() +``` + +``` +airr + └── container + ├── node0-offsets (17,) int64 + ├── node2-offsets (40,) int64 + ├── node3-data (117,) uint8 + ├── node4-offsets (40,) int64 + └── node5-data (117,) uint8 +``` + +The length of the array is saved to it’s own `"length"` attribute, +while metadata for the array structure is serialized and saved to the +`“form”` attribute. + +```python +>>> dict(awkward_group.attrs) +``` + + +```python +{ + 'encoding-type': 'awkward-array', + 'encoding-version': '0.1.0', + 'form': '{"class": "ListOffsetArray", "offsets": "i64", "content": {"class": ' + '"RecordArray", "contents": {"locus": {"class": "ListOffsetArray", ' + '"offsets": "i64", "content": {"class": "NumpyArray", "primitive": ' + '"uint8", "inner_shape": [], "has_identifier": false, "parameters": ' + '{"__array__": "char"}, "form_key": "node3"}, "has_identifier": ' + 'false, "parameters": {"__array__": "string"}, "form_key": "node2"}, ' + '"junction_aa": {"class": "ListOffsetArray", "offsets": "i64", ' + '"content": {"class": "NumpyArray", "primitive": "uint8", ' + '"inner_shape": [], "has_identifier": false, "parameters": ' + '{"__array__": "char"}, "form_key": "node5"}, "has_identifier": ' + 'false, "parameters": {"__array__": "string"}, "form_key": "node4"}}, ' + '"has_identifier": false, "parameters": {}, "form_key": "node1"}, ' + '"has_identifier": false, "parameters": {}, "form_key": "node0"}' + 'length': 16 +} +``` + +These can be read back as awkward arrays using the 
+[`ak.from_buffers`](https://awkward-array.readthedocs.io/en/latest/_auto/ak.from_buffers.html) +function: + +```python +>>> import awkward as ak +>>> from anndata.experimental import read_elem +>>> ak.from_buffers( +... awkward_group.attrs["form"], +... awkward_group.attrs["length"], +... {k: read_elem(v) for k, v in awkward_group.items()} +... ) +``` + +``` + +``` + [easy to find]: https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format) [hdf5]: https://en.wikipedia.org/wiki/Hierarchical_Data_Format \ No newline at end of file diff --git a/docs/release-notes/0.9.0.rst b/docs/release-notes/0.9.0.rst index 3bd619dda..82e6f2687 100644 --- a/docs/release-notes/0.9.0.rst +++ b/docs/release-notes/0.9.0.rst @@ -3,8 +3,10 @@ .. rubric:: Features + * Unordered categorical columns are no longer cast to object during :func:`anndata.concat` :pr:`763` :user:`ivirshup` * Added support for dask arrays :pr:`813` :user:`syelman` :user:`rahulbshrestha` +* `obsm`, `varm` and `uns` can now hold `AwkwardArrays <https://awkward-array.org/>`__ :pr:`647` :user:`giovp`, :user:`grst`, :user:`ivirshup` * Better error messages during IO :pr:`734` :user:`flying-sheep`, :user:`ivirshup` .. rubric:: Documentation diff --git a/pyproject.toml b/pyproject.toml index fb716da18..2fa1d10cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,7 @@ doc = [ "nbsphinx", "scanpydoc>=0.7.7", "zarr", + "awkward>=2.0.7", "IPython", # For syntax highlighting in notebooks "myst_parser", ] @@ -86,6 +87,7 @@ test = [ "boltons", "scanpy", "dask[array]", + "awkward>=2.0.6", "pytest_memray", ]