diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 0d9496423..afc1b4153 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -18,14 +18,14 @@ jobs: python.version: "3.12" RUN_COVERAGE: yes TEST_TYPE: "coverage" - Python3.9: - python.version: "3.9" + Python3.10: + python.version: "3.10" PreRelease: python.version: "3.12" DEPENDENCIES_VERSION: "pre-release" TEST_TYPE: "strict-warning" minimum_versions: - python.version: "3.9" + python.version: "3.10" DEPENDENCIES_VERSION: "minimum" TEST_TYPE: "coverage" steps: @@ -88,6 +88,7 @@ jobs: inputs: codeCoverageTool: Cobertura summaryFileLocation: "test-data/coverage.xml" + failIfCoverageEmpty: true condition: eq(variables['TEST_TYPE'], 'coverage') - task: PublishTestResults@2 diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index f46beb094..b7355b6b5 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -25,7 +25,7 @@ jobs: ASV_DIR: "./benchmarks" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -33,18 +33,20 @@ jobs: if: ${{ github.ref_name != 'main' }} # Errors on main branch - - uses: mamba-org/setup-micromamba@v1 + - uses: mamba-org/setup-micromamba@v2 with: environment-name: asv cache-environment: true + # Deps documented in https://asv.readthedocs.io/en/latest/installing.html + # libmambapy upper bound: https://github.com/airspeed-velocity/asv/issues/1438 create-args: >- - python=3.11 + python=${{ matrix.python }} asv - mamba - packaging + libmambapy<2 + conda-build - name: Cache datasets - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ~/.cache diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index 4283ac780..97b2c689c 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -51,10 +51,20 @@ jobs: - name: Nvidia SMI sanity check run: nvidia-smi + - name: Install yq + run: | + sudo snap install yq + + - name: Extract max Python version from classifiers + run: | + classifiers=$(yq .project.classifiers pyproject.toml -oy | grep --only-matching --perl-regexp '(?<=Python :: )(\d\.\d+)') + max_version=$(echo "$classifiers" | sort -V | tail -1) + echo "max_python_version=$max_version" >> $GITHUB_ENV + - name: Install Python uses: actions/setup-python@v5 with: - python-version: "3.x" + python-version: ${{ env.max_python_version }} - name: Install UV uses: hynek/setup-cached-uv@v2 @@ -75,3 +85,10 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: true verbose: true + + - name: Remove 'run-gpu-ci' Label + if: always() + uses: actions-ecosystem/action-remove-labels@v1 + with: + labels: "run-gpu-ci" + github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2fba1b508..8b6fd222d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.2 + rev: v0.7.3 hooks: - id: ruff types_or: [python, pyi, jupyter] @@ -14,7 +14,7 @@ repos: exclude_types: - markdown - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer diff --git a/.readthedocs.yml b/.readthedocs.yml index 764eb57bd..8fa840e28 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -3,6 +3,13 @@ build: os: ubuntu-20.04 tools: python: "3.12" + jobs: + post_checkout: + # unshallow so version can be derived from tag + - git fetch --unshallow || true + pre_build: + # 
run towncrier to preview the next version’s release notes + - ( find docs/release-notes -regex '[^.]+[.][^.]+.md' | grep -q . ) && towncrier build --keep || true sphinx: configuration: docs/conf.py fail_on_warning: true # do not change or you will be fired diff --git a/README.md b/README.md index af784833a..c7ba77866 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![PyPI](https://img.shields.io/pypi/v/anndata.svg)](https://pypi.org/project/anndata) [![Downloads](https://static.pepy.tech/badge/anndata/month)](https://pepy.tech/project/anndata) [![Downloads](https://static.pepy.tech/badge/anndata)](https://pepy.tech/project/anndata) -[![Stars](https://img.shields.io/github/stars/scverse/anndata?logo=GitHub&color=yellow)](https://github.com/scverse/anndata/stargazers) +[![Stars](https://img.shields.io/github/stars/scverse/anndata?style=flat&logo=github&color=yellow)](https://github.com/scverse/anndata/stargazers) [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](http://numfocus.org) +## Public API + +Our public API is documented in the [API section][] of these docs. +We cannot guarantee the stability of our internal APIs, whether it's the location of a function, its arguments, or something else. +In other words, we do not officially support (or encourage users to do) something like `from anndata._core import AnnData` as `_core` is both not documented and contains a [leading underscore][]. +However, we are aware that [many users do use these internal APIs][] and thus encourage them to [open an issue][] or migrate to the public API. +That is, if something is missing from our public API as documented, for example a feature you wish to be exported publicly, please open an issue. + +[api section]: https://anndata.readthedocs.io/en/stable/api.html +[leading underscore]: https://peps.python.org/pep-0008/#public-and-internal-interfaces +[many users do use these internal APIs]: https://github.com/search?q=%22anndata._io%22&type=code +[open an issue]: https://github.com/scverse/anndata/issues/new/choose + + ## Citation -If you use `anndata` in your work, please cite the `anndata` pre-print as follows: +If you use `anndata` in your work, please cite the `anndata` publication as follows: > **anndata: Annotated data** > > Isaac Virshup, Sergei Rybakov, Fabian J. Theis, Philipp Angerer, F. Alexander Wolf > -> _bioRxiv_ 2021 Dec 19. doi: [10.1101/2021.12.16.473007](https://doi.org/10.1101/2021.12.16.473007). +> _JOSS_ 2024 Sep 16. doi: [10.21105/joss.04371](https://doi.org/10.21105/joss.04371). 
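As a companion to the Public API section of the README above, here is a minimal sketch (editorial illustration, not part of the README itself) contrasting the supported, documented entry points with the internal paths the text discourages. It assumes anndata ≥ 0.11 (where the `anndata.io` submodule from this diff exists) plus numpy.

```python
import numpy as np
import anndata as ad

adata = ad.AnnData(np.eye(3))

# Supported: documented, public entry points (see the API section of the docs).
adata.write_h5ad("tmp.h5ad")
loaded = ad.io.read_h5ad("tmp.h5ad")  # anndata.read_h5ad also stays at the top level

# Discouraged: private modules (leading underscore) may move or change without
# notice, e.g. `from anndata._core import AnnData` — prefer `ad.AnnData`.
```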
You can cite the scverse publication as follows: diff --git a/benchmarks/benchmarks/sparse_dataset.py b/benchmarks/benchmarks/sparse_dataset.py index 7d217d159..3a6d0dac6 100644 --- a/benchmarks/benchmarks/sparse_dataset.py +++ b/benchmarks/benchmarks/sparse_dataset.py @@ -5,7 +5,8 @@ from scipy import sparse from anndata import AnnData -from anndata.experimental import sparse_dataset, write_elem +from anndata._core.sparse_dataset import sparse_dataset +from anndata._io.specs import write_elem def make_alternating_mask(n): diff --git a/ci/scripts/min-deps.py b/ci/scripts/min-deps.py index b5b0b980e..c6bac0cf4 100755 --- a/ci/scripts/min-deps.py +++ b/ci/scripts/min-deps.py @@ -1,4 +1,4 @@ -#!python3 +#!/usr/bin/env python3 from __future__ import annotations import argparse diff --git a/ci/scripts/towncrier_automation.py b/ci/scripts/towncrier_automation.py new file mode 100755 index 000000000..17fd10902 --- /dev/null +++ b/ci/scripts/towncrier_automation.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import subprocess +from typing import TYPE_CHECKING + +from packaging.version import Version + +if TYPE_CHECKING: + from collections.abc import Sequence + + +class Args(argparse.Namespace): + version: str + dry_run: bool + + +def parse_args(argv: Sequence[str] | None = None) -> Args: + parser = argparse.ArgumentParser( + prog="towncrier-automation", + description=( + "This script runs towncrier for a given version, " + "creates a branch off of the current one, " + "and then creates a PR into the original branch with the changes. " + "The PR will be backported to main if the current branch is not main." + ), + ) + parser.add_argument( + "version", + type=str, + help=( + "The new version for the release must have at least three parts, like `major.minor.patch` and no `major.minor`. " + "It can have a suffix like `major.minor.patch.dev0` or `major.minor.0rc1`." + ), + ) + parser.add_argument( + "--dry-run", + help="Whether or not to dry-run the actual creation of the pull request", + action="store_true", + ) + args = parser.parse_args(argv, Args()) + # validate the version + if len(Version(args.version).release) != 3: + msg = f"Version argument {args.version} must contain major, minor, and patch version." 
+ raise ValueError(msg) + return args + + +def main(argv: Sequence[str] | None = None) -> None: + args = parse_args(argv) + + # Run towncrier + subprocess.run( + ["towncrier", "build", f"--version={args.version}", "--yes"], check=True + ) + + # Check if we are on the main branch to know if we need to backport + base_branch = subprocess.run( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + capture_output=True, + text=True, + check=True, + ).stdout.strip() + pr_description = "" if base_branch == "main" else "@meeseeksdev backport to main" + branch_name = f"release_notes_{args.version}" + + # Create a new branch + commit + subprocess.run(["git", "switch", "-c", branch_name], check=True) + subprocess.run(["git", "add", "docs/release-notes"], check=True) + pr_title = f"(chore): generate {args.version} release notes" + subprocess.run(["git", "commit", "-m", pr_title], check=True) + + # push + if not args.dry_run: + subprocess.run( + ["git", "push", "--set-upstream", "origin", branch_name], check=True + ) + else: + print("Dry run, not pushing") + + # Create a PR + subprocess.run( + [ + "gh", + "pr", + "create", + f"--base={base_branch}", + f"--title={pr_title}", + f"--body={pr_description}", + "--label=skip-gpu-ci", + *(["--label=no milestone"] if base_branch == "main" else []), + *(["--dry-run"] if args.dry_run else []), + ], + check=True, + ) + + # Enable auto-merge + if not args.dry_run: + subprocess.run( + ["gh", "pr", "merge", branch_name, "--auto", "--squash"], check=True + ) + else: + print("Dry run, not merging") + + +if __name__ == "__main__": + main() diff --git a/docs/api.md b/docs/api.md index 92139fe06..60cbbf61c 100644 --- a/docs/api.md +++ b/docs/api.md @@ -15,7 +15,8 @@ The central class: ## Combining -Combining AnnData objects. See also the section on concatenation. +Combining {class}`AnnData` objects. +See also the section on concatenation. ```{eval-rst} .. autosummary:: @@ -26,44 +27,67 @@ Combining AnnData objects. See also the section on concatenation. ## Reading -Reading anndata’s native file format `.h5ad`. +Reading anndata’s native formats `.h5ad` and `zarr`. ```{eval-rst} .. autosummary:: :toctree: generated/ - read_h5ad + io.read_h5ad + io.read_zarr ``` -Reading other file formats. +Reading individual portions ({attr}`~AnnData.obs`, {attr}`~AnnData.varm` etc.) of the {class}`AnnData` object. ```{eval-rst} .. autosummary:: :toctree: generated/ - read_csv - read_excel - read_hdf - read_loom - read_mtx - read_text - read_umi_tools - read_zarr + io.read_elem + io.sparse_dataset +``` + +Reading file formats that cannot represent all aspects of {class}`AnnData` objects. + +```{tip} +You might have more success by assembling the {class}`AnnData` object yourself from the individual parts. +``` + +```{eval-rst} +.. autosummary:: + :toctree: generated/ + io.read_csv + io.read_excel + io.read_hdf + io.read_loom + io.read_mtx + io.read_text + io.read_umi_tools ``` ## Writing -Writing to anndata’s native file format `.h5ad`. +Writing a complete {class}`AnnData` object to disk in anndata’s native formats `.h5ad` and `zarr`. ```{eval-rst} .. autosummary:: :toctree: generated/ AnnData.write + AnnData.write_zarr +``` + +Writing individual portions ({attr}`~AnnData.obs`, {attr}`~AnnData.varm` etc.) of the {class}`AnnData` object. + +```{eval-rst} +.. autosummary:: + :toctree: generated/ + + io.write_elem ``` -Writing to other formats. +Writing formats that cannot represent all aspects of {class}`AnnData` objects. ```{eval-rst} .. autosummary:: @@ -71,7 +95,6 @@ Writing to other formats. 
AnnData.write_csvs AnnData.write_loom - AnnData.write_zarr ``` (experimental-api)= @@ -79,10 +102,10 @@ Writing to other formats. ## Experimental API ```{warning} -API's in the experimental module are currently in development and subject to change at any time. +APIs in the experimental module are currently in development and subject to change at any time. ``` -Two classes for working with batched access to collections of many `AnnData` objects or `h5ad` files. +Two classes for working with batched access to collections of many {class}`AnnData` objects or `.h5ad` files. In particular, for pytorch-based models. ```{eval-rst} @@ -93,17 +116,6 @@ In particular, for pytorch-based models. experimental.AnnLoader ``` -Interface for accessing on-disk sparse data: - -```{eval-rst} -.. autosummary:: - :toctree: generated/ - - experimental.sparse_dataset - experimental.CSRDataset - experimental.CSCDataset -``` - Out of core concatenation ```{eval-rst} @@ -113,14 +125,12 @@ Out of core concatenation experimental.concat_on_disk ``` -Low level methods for reading and writing elements of an `AnnData` object to a store: +Low level methods for reading and writing elements of an {class}`AnnData` object to a store: ```{eval-rst} .. autosummary:: :toctree: generated/ - experimental.read_elem - experimental.write_elem experimental.read_elem_as_dask ``` @@ -141,8 +151,6 @@ Types used by the former: :toctree: generated/ experimental.IOSpec - experimental.InMemoryElem - experimental.RWAble experimental.Read experimental.Write experimental.ReadCallback @@ -168,3 +176,16 @@ Types used by the former: settings settings.override ``` + +## Custom Types/Classes for Readable/Writeable Elements + +```{eval-rst} +.. autosummary:: + :toctree: generated/ + + abc.CSRDataset + abc.CSCDataset + typing.Index + typing.AxisStorable + typing.RWAble +``` diff --git a/docs/benchmark-read-write.ipynb b/docs/benchmark-read-write.ipynb index 886bfa0f6..365a585ec 100644 --- a/docs/benchmark-read-write.ipynb +++ b/docs/benchmark-read-write.ipynb @@ -159,7 +159,7 @@ ], "source": [ "%%time\n", - "adata = ad.read_loom(\"test.loom\")" + "adata = ad.io.read_loom(\"test.loom\")" ] } ], diff --git a/docs/concatenation.rst b/docs/concatenation.rst index be644dceb..ce6547b66 100644 --- a/docs/concatenation.rst +++ b/docs/concatenation.rst @@ -54,8 +54,8 @@ When the variables present in the objects to be concatenated aren't exactly the This is otherwise called taking the `"inner"` (intersection) or `"outer"` (union) join. For example, given two anndata objects with differing variables: - >>> a = AnnData(sparse.eye(3), var=pd.DataFrame(index=list("abc"))) - >>> b = AnnData(sparse.eye(2), var=pd.DataFrame(index=list("ba"))) + >>> a = AnnData(sparse.eye(3, format="csr"), var=pd.DataFrame(index=list("abc"))) + >>> b = AnnData(sparse.eye(2, format="csr"), var=pd.DataFrame(index=list("ba"))) >>> ad.concat([a, b], join="inner").X.toarray() array([[1., 0.], [0., 1.], @@ -208,11 +208,11 @@ Note that comparisons are made after indices are aligned. That is, if the objects only share a subset of indices on the alternative axis, it's only required that values for those indices match when using a strategy like `"same"`. >>> a = AnnData( - ... sparse.eye(3), + ... sparse.eye(3, format="csr"), ... var=pd.DataFrame({"nums": [1, 2, 3]}, index=list("abc")) ... ) >>> b = AnnData( - ... sparse.eye(2), + ... sparse.eye(2, format="csr"), ... var=pd.DataFrame({"nums": [2, 1]}, index=list("ba")) ... 
) >>> ad.concat([a, b], merge="same").var diff --git a/docs/conf.py b/docs/conf.py index 6a0006a70..f98fe5ba7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,7 +13,8 @@ from sphinx.application import Sphinx HERE = Path(__file__).parent -sys.path[:0] = [str(HERE / "extensions")] +_extension_dir = HERE / "extensions" +sys.path[:0] = [str(_extension_dir)] # -- General configuration ------------------------------------------------ @@ -61,11 +62,8 @@ "sphinx.ext.linkcode", "nbsphinx", "IPython.sphinxext.ipython_console_highlighting", - "patch_sphinx_toolbox_autoprotocol", # internal extension "sphinx_toolbox.more_autodoc.autoprotocol", - # other internal extensions - "patch_myst_cite", - "release_notes", + *(p.stem for p in _extension_dir.glob("*.py")), ] myst_enable_extensions = [ "html_image", # So README.md can be used on github and sphinx docs @@ -112,6 +110,10 @@ ("py:class", "awkward.highlevel.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ("py:obj", "numpy._typing._array_like._ScalarType_co"), + # https://github.com/sphinx-doc/sphinx/issues/10974 + ("py:class", "numpy.int64"), + # https://github.com/tox-dev/sphinx-autodoc-typehints/issues/498 + ("py:class", "types.EllipsisType"), ] @@ -121,17 +123,17 @@ def setup(app: Sphinx): intersphinx_mapping = dict( - h5py=("https://docs.h5py.org/en/latest/", None), - hdf5plugin=("https://hdf5plugin.readthedocs.io/en/latest/", None), - loompy=("https://linnarssonlab.org/loompy/", None), - numpy=("https://numpy.org/doc/stable/", None), - pandas=("https://pandas.pydata.org/pandas-docs/stable/", None), + h5py=("https://docs.h5py.org/en/latest", None), + hdf5plugin=("https://hdf5plugin.readthedocs.io/en/latest", None), + loompy=("https://linnarssonlab.org/loompy", None), + numpy=("https://numpy.org/doc/stable", None), + pandas=("https://pandas.pydata.org/pandas-docs/stable", None), python=("https://docs.python.org/3", None), - scipy=("https://docs.scipy.org/doc/scipy/", None), - sklearn=("https://scikit-learn.org/stable/", None), - zarr=("https://zarr.readthedocs.io/en/stable/", None), - xarray=("https://xarray.pydata.org/en/stable/", None), - dask=("https://docs.dask.org/en/stable/", None), + scipy=("https://docs.scipy.org/doc/scipy", None), + sklearn=("https://scikit-learn.org/stable", None), + zarr=("https://zarr.readthedocs.io/en/stable", None), + xarray=("https://docs.xarray.dev/en/stable", None), + dask=("https://docs.dask.org/en/stable", None), ) qualname_overrides = { "h5py._hl.group.Group": "h5py.Group", @@ -142,13 +144,12 @@ def setup(app: Sphinx): "anndata._types.WriteCallback": "anndata.experimental.WriteCallback", "anndata._types.Read": "anndata.experimental.Read", "anndata._types.Write": "anndata.experimental.Write", - "anndata._types.RWAble": "anndata.experimental.RWAble", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", - RWAble=":data:`~anndata.experimental.RWAble`", + AxisStorable=":data:`~anndata.typing.AxisStorable`", **{ - f"{v}variantInMemoryType": ":data:`~anndata.experimental.InMemoryElem`" + f"{v}variantRWAble": ":data:`~anndata.typing.RWAble`" for v in ["In", "Co", "Contra"] }, ) diff --git a/docs/contributing.md b/docs/contributing.md index c45cc032e..d16020a4f 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -6,6 +6,21 @@ AnnData follows the development practices outlined in the [Scanpy contribution g .. include:: _key_contributors.rst ``` +## Release Notes + +AnnData differs from `scanpy` (for now) in how its releases are done. 
+It uses [towncrier][] to build its changelog. +We have set up some automation around this process. +To run `towncrier`, create a `PR` into the base branch of the release with the compiled changelog, and backport to `main` if needed (i.e., the base branch is something like `0.10.x`), run + +```shell +hatch run towncrier:build X.Y.Z +``` + +You may add the option `--dry-run` at the end to do the local steps without pushing to Github, although the push will be mocked via [`gh pr --dry-run`](https://cli.github.com/manual/gh_pr_create). + +[towncrier]: https://towncrier.readthedocs.io/en/stable/ + ## CI ### GPU CI diff --git a/docs/extensions/no_skip_abc_members.py b/docs/extensions/no_skip_abc_members.py new file mode 100644 index 000000000..66846e095 --- /dev/null +++ b/docs/extensions/no_skip_abc_members.py @@ -0,0 +1,28 @@ +"""Sphinx extension to not skip abstract methods.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing import Literal + + from sphinx.application import Sphinx + from sphinx.ext.autodoc import Options + + +def autodoc_skip_member( + app: Sphinx, + what: Literal["module", "class", "exception", "function", "method", "attribute"], + name: str, + obj: object, + skip: bool, + options: Options, +): + if what == "method" and getattr(obj, "__isabstractmethod__", False): + return False + return None + + +def setup(app: Sphinx): + app.connect("autodoc-skip-member", autodoc_skip_member) diff --git a/docs/extensions/patch_sphinx_toolbox_autoprotocol.py b/docs/extensions/patch_sphinx_toolbox_autoprotocol.py deleted file mode 100644 index bafe24cc4..000000000 --- a/docs/extensions/patch_sphinx_toolbox_autoprotocol.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from sphinx.ext.autodoc import ObjectMember -from sphinx_toolbox.more_autodoc.autoprotocol import ProtocolDocumenter - -if TYPE_CHECKING: - from typing import Self - - from sphinx.application import Sphinx - - -def patch_sphinx_toolbox_autoprotocol(): - """Compat hack: https://github.com/sphinx-toolbox/sphinx-toolbox/issues/168""" - - class ObjectMemberCompat(ObjectMember): - @classmethod - def from_other(cls, other: ObjectMember) -> Self: - return cls( - other.__name__, - other.object, - docstring=other.docstring, - class_=other.class_, - skipped=other.skipped, - ) - - def __iter__(self): - return iter([self.__name__, self.object]) - - filter_orig = ProtocolDocumenter.filter_members - - def filter_members( - self, members: list[ObjectMember], want_all: bool - ) -> list[tuple[str, object, bool]]: - member_tuples = [ObjectMemberCompat.from_other(m) for m in members] - return filter_orig(self, member_tuples, want_all) - - ProtocolDocumenter.filter_members = filter_members - - -def setup(_app: Sphinx) -> None: - patch_sphinx_toolbox_autoprotocol() diff --git a/docs/extensions/release_notes.py b/docs/extensions/release_notes.py deleted file mode 100644 index bb28453a7..000000000 --- a/docs/extensions/release_notes.py +++ /dev/null @@ -1,111 +0,0 @@ -from __future__ import annotations - -import itertools -import re -from pathlib import Path -from typing import TYPE_CHECKING - -from docutils import nodes -from packaging.version import Version -from sphinx.util.docutils import SphinxDirective - -if TYPE_CHECKING: - from collections.abc import Iterable, Sequence - from typing import ClassVar - - from myst_parser.mdit_to_docutils.base import DocutilsRenderer - from sphinx.application import Sphinx - - -FULL_VERSION_RE 
= re.compile(r"^(\d+)\.(\d+)\.(\d+)$") - - -class ReleaseNotes(SphinxDirective): - required_arguments: ClassVar = 1 - - def run(self) -> Sequence[nodes.Node]: - dir_ = Path(self.arguments[0]) - # resolve relative dir - if not dir_.is_absolute(): - src_file = Path(self.get_source_info()[0]) - if not src_file.is_file(): - msg = f"Cannot find relative path to: {src_file}" - raise self.error(msg) - dir_ = src_file.parent / self.arguments[0] - if not dir_.is_dir(): - msg = f"Not a directory: {dir_}" - raise self.error(msg) - - versions = sorted( - ( - (Version(f.stem), f) - for f in dir_.iterdir() - if FULL_VERSION_RE.match(f.stem) - ), - reverse=True, # descending - ) - version_groups = itertools.groupby( - versions, key=lambda vf: (vf[0].major, vf[0].minor) - ) - for (major, minor), versions in version_groups: - self.render_version_group(major, minor, versions) - return [] - - def render_version_group( - self, major: int, minor: int, versions: Iterable[tuple[Version, Path]] - ) -> None: - target = nodes.target( - ids=[f"v{major}-{minor}"], - names=[f"v{major}.{minor}"], - ) - section = nodes.section( - "", - nodes.title("", f"Version {major}.{minor}"), - ids=[], - names=[f"version {major}.{minor}"], - ) - self.state.document.note_implicit_target(section) - self.state.document.note_explicit_target(target) - # append target and section to parent - self.renderer.current_node.append(target) - self.renderer.update_section_level_state(section, 2) - # append children to section - with self.renderer.current_node_context(section): - for _, p in versions: - self.render_include(p) - - def render_include(self, path: Path) -> None: - # hacky solution because of https://github.com/executablebooks/MyST-Parser/issues/967 - from docutils.parsers.rst.directives.misc import Include - from myst_parser.mocking import MockIncludeDirective - - srcfile, lineno = self.get_source_info() - parent_dir = Path(srcfile).parent - - d = MockIncludeDirective( - renderer=self.renderer, - name=type(self).__name__, - klass=Include, # type: ignore # wrong type hint - arguments=[str(path.relative_to(parent_dir))], - options={}, - body=[], - lineno=lineno, - ) - d.run() - - # TODO: replace the above with this once the above mentioned bug is fixed - # from sphinx.util.parsing import nested_parse_to_nodes - # return nested_parse_to_nodes( - # self.state, - # path.read_text(), - # source=str(path), - # offset=self.content_offset, - # ) - - @property - def renderer(self) -> DocutilsRenderer: - return self.state._renderer - - -def setup(app: Sphinx) -> None: - app.add_directive("release-notes", ReleaseNotes) diff --git a/docs/fileformat-prose.md b/docs/fileformat-prose.md index 3fdc68788..831b441e9 100644 --- a/docs/fileformat-prose.md +++ b/docs/fileformat-prose.md @@ -476,7 +476,7 @@ That is, we store an indicator array (or mask) of null values alongside the arra :sync: hdf5 ```python ->>> from anndata.experimental import write_elem +>>> from anndata import write_elem >>> null_store = h5py.File("tmp.h5", mode="w") >>> int_array = pd.array([1, None, 3, 4]) >>> int_array @@ -498,7 +498,7 @@ nullable_integer/values :sync: zarr ```python ->>> from anndata.experimental import write_elem +>>> from anndata import write_elem >>> null_store = zarr.open() >>> int_array = pd.array([1, None, 3, 4]) >>> int_array @@ -635,7 +635,7 @@ function: ```python >>> import awkward as ak ->>> from anndata.experimental import read_elem +>>> from anndata.io import read_elem >>> awkward_group = store["varm/transcript"] >>> ak.from_buffers( ... 
awkward_group.attrs["form"], diff --git a/docs/release-notes/0.10.0.md b/docs/release-notes/0.10.0.md index a4ed8a826..586850969 100644 --- a/docs/release-notes/0.10.0.md +++ b/docs/release-notes/0.10.0.md @@ -13,7 +13,7 @@ * Concatenate on-disk anndata objects with {func}`anndata.experimental.concat_on_disk` {pr}`955` {user}`selmanozleyen` * AnnData can now hold dask arrays with `scipy.sparse.spmatrix` chunks {pr}`1114` {user}`ivirshup` -* Public API for interacting with on disk sparse arrays: {func}`~anndata.experimental.sparse_dataset`, {class}`~anndata.experimental.CSRDataset`, and {class}`~anndata.experimental.CSCDataset` {pr}`765` {user}`ilan-gold` {user}`ivirshup` +* Public API for interacting with on disk sparse arrays: {func}`~anndata.io.sparse_dataset`, {class}`~anndata.abc.CSRDataset`, and {class}`~anndata.abc.CSCDataset` {pr}`765` {user}`ilan-gold` {user}`ivirshup` * Improved performance for simple slices of OOC sparse arrays {pr}`1131` {user}`ivirshup` **Improved errors and warnings** @@ -37,7 +37,7 @@ #### Deprecations -* Deprecate `anndata.read`, which was just an alias for {func}`anndata.read_h5ad` {pr}`1108` {user}`ivirshup`. +* Deprecate `anndata.read`, which was just an alias for {func}`anndata.io.read_h5ad` {pr}`1108` {user}`ivirshup`. * `dtype` argument to `AnnData` constructor is now deprecated {pr}`1153` {user}`ivirshup` #### Bug fixes diff --git a/docs/release-notes/0.10.1.md b/docs/release-notes/0.10.1.md index dae8af856..858d2f5fd 100644 --- a/docs/release-notes/0.10.1.md +++ b/docs/release-notes/0.10.1.md @@ -1,6 +1,6 @@ (v0.10.1)= ### 0.10.1 {small}`2023-10-08` -#### Bugfix +#### Bug fixes * Fix `ad.concat` erroring when concatenating a categorical and object column {pr}`1171` {user}`ivirshup` diff --git a/docs/release-notes/0.10.2.md b/docs/release-notes/0.10.2.md index ea878abcf..e2b8b36fd 100644 --- a/docs/release-notes/0.10.2.md +++ b/docs/release-notes/0.10.2.md @@ -1,10 +1,10 @@ (v0.10.2)= ### 0.10.2 {small}`2023-10-11` -#### Bugfix +#### Bug fixes * Added compatibility layer for packages relying on `anndata._core.sparse_dataset.SparseDataset`. - Note that this API is *deprecated* and new code should use {class}`~anndata.experimental.CSRDataset`, {class}`~anndata.experimental.CSCDataset`, and {func}`~anndata.experimental.sparse_dataset` instead. + Note that this API is *deprecated* and new code should use `anndata.CSRDataset`, `~anndata.CSCDataset`, and `anndata.sparse_dataset` instead. 
{pr}`1185` {user}`ivirshup` * Handle deprecation warning from `pd.Categorical.map` thrown during `anndata.concat` {pr}`1189` {user}`flying-sheep` {user}`ivirshup` * Fixed extra steps being included in IO tracebacks {pr}`1193` {user}`flying-sheep` diff --git a/docs/release-notes/0.10.3.md b/docs/release-notes/0.10.3.md index 4e5918d40..022b61050 100644 --- a/docs/release-notes/0.10.3.md +++ b/docs/release-notes/0.10.3.md @@ -1,7 +1,7 @@ (v0.10.3)= ### 0.10.3 {small}`2023-10-31` -#### Bugfix +#### Bug fixes * Prevent pandas from causing infinite recursion when setting a slice of a categorical column {pr}`1211` {user}`flying-sheep` #### Documentation diff --git a/docs/release-notes/0.10.4.md b/docs/release-notes/0.10.4.md index 46ec222a9..592593163 100644 --- a/docs/release-notes/0.10.4.md +++ b/docs/release-notes/0.10.4.md @@ -1,7 +1,7 @@ (v0.10.4)= ### 0.10.4 {small}`2024-01-04` -#### Bugfix +#### Bug fixes * Only try to use `Categorical.map(na_action=…)` in actually supported Pandas ≥2.1 {pr}`1226` {user}`flying-sheep` * `AnnData.__sizeof__()` support for backed datasets {pr}`1230` {user}`Neah-Ko` * `adata[:, []]` now returns an `AnnData` object empty on the appropriate dimensions instead of erroring {pr}`1243` {user}`ilan-gold` diff --git a/docs/release-notes/0.10.5.md b/docs/release-notes/0.10.5.md index edb7db10b..8987285b7 100644 --- a/docs/release-notes/0.10.5.md +++ b/docs/release-notes/0.10.5.md @@ -1,7 +1,7 @@ (v0.10.5)= ### 0.10.5 {small}`2024-01-25` -#### Bugfix +#### Bug fixes * Fix outer concatenation along variables when only a subset of objects had an entry in layers {pr}`1291` {user}`ivirshup` * Fix comparison of >2d arrays in `uns` during concatenation {pr}`1300` {user}`ivirshup` diff --git a/docs/release-notes/0.10.6.md b/docs/release-notes/0.10.6.md index 4bef8f562..e26fdf49d 100644 --- a/docs/release-notes/0.10.6.md +++ b/docs/release-notes/0.10.6.md @@ -1,7 +1,7 @@ (v0.10.6)= ### 0.10.6 {small}`2024-03-11` -#### Bugfix +#### Bug fixes * Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold` * Writing a dataframe with non-unique column names now throws an error, instead of silently overwriting {pr}`1335` {user}`ivirshup` diff --git a/docs/release-notes/0.10.7.md b/docs/release-notes/0.10.7.md index f3ea34cd0..1832b95a4 100644 --- a/docs/release-notes/0.10.7.md +++ b/docs/release-notes/0.10.7.md @@ -1,7 +1,7 @@ (v0.10.7)= ### 0.10.7 {small}`2024-04-09` -#### Bugfix +#### Bug fixes * Handle upstream `numcodecs` bug where read-only string arrays cannot be encoded {user}`ivirshup` {pr}`1421` * Use in-memory sparse matrix directly to fix compatibility with `scipy` `1.13` {user}`ilan-gold` {pr}`1435` diff --git a/docs/release-notes/0.10.8.md b/docs/release-notes/0.10.8.md index 324c9d571..d69102d51 100644 --- a/docs/release-notes/0.10.8.md +++ b/docs/release-notes/0.10.8.md @@ -1,12 +1,12 @@ (v0.10.8)= ### 0.10.8 {small}`2024-06-20` -#### Bugfix +#### Bug fixes * Write out `64bit` indptr when appropriate for {func}`~anndata.experimental.concat_on_disk` {pr}`1493` {user}`ilan-gold` * Support for Numpy 2 {pr}`1499` {user}`flying-sheep` -* Fix {func}`~anndata.experimental.sparse_dataset` docstring test on account of new {mod}`scipy` version {pr}`1514` {user}`ilan-gold` +* Fix {func}`~anndata.io.sparse_dataset` docstring test on account of new {mod}`scipy` version {pr}`1514` {user}`ilan-gold` #### Documentation -* Improved example for {func}`~anndata.experimental.sparse_dataset` {pr}`1468` {user}`ivirshup` +* Improved example for 
{func}`~anndata.io.sparse_dataset` {pr}`1468` {user}`ivirshup` diff --git a/docs/release-notes/0.10.9.md b/docs/release-notes/0.10.9.md index 012e5e89f..ae0aadb54 100644 --- a/docs/release-notes/0.10.9.md +++ b/docs/release-notes/0.10.9.md @@ -1,7 +1,7 @@ (v0.10.9)= ### 0.10.9 {small}`2024-08-28` -#### Bugfix +#### Bug fixes - Fix writing large number of columns for `h5` files {user}`ilan-gold` {user}`selmanozleyen` ({pr}`1147`) - Add warning for setting `X` on a view with repeated indices {user}`ilan-gold` ({pr}`1501`) @@ -16,7 +16,7 @@ - create new `cupy` installation options for cuda 11 & 12 called `cu11` and `cu12` {user}`Intron7` ({pr}`1596`) -#### Doc +#### Documentation - add `callback` typing for {func}`~anndata.experimental.read_dispatched` and {func}`~anndata.experimental.write_dispatched` {user}`ilan-gold` ({pr}`1557`) diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md new file mode 100644 index 000000000..317175f50 --- /dev/null +++ b/docs/release-notes/0.11.0.md @@ -0,0 +1,49 @@ +(v0.11.0)= +### 0.11.0 {small}`2024-11-07` + +Release candidates: + +- (v0.11.0rc3)= + {guilabel}`rc3` 2024-10-14 +- (v0.11.0rc2)= + {guilabel}`rc2` 2024-09-24 +- (v0.11.0rc1)= + {guilabel}`rc1` 2024-09-04 + +#### Bug fixes + +- Ensure {func}`anndata.concat` of {class}`~anndata.AnnData` object with {class}`scipy.sparse.spmatrix` and {class}`scipy.sparse.sparray` dask arrays uses the correct fill value of 0. {user}`ilan-gold` ({pr}`1719`) +- Ensure that views of AwkwardArrays have their "view" attributes removed on saving an {class}`~anndata.AnnData` object to disk. {user}`grst` ({pr}`1736`) + +#### Breaking changes + +- {guilabel}`rc3` Drop support for `python` 3.9 {user}`ilan-gold` ({pr}`1712`) +- {guilabel}`rc2` A new `anndata.io` module contains all `read_*` and `write_*` functions, and all imports of such functions should go through this module. Old ways of importing these functions i.e., `from anndata import read_csv` or `from anndata._io.specs import read_elem` will still work, but are now considered deprecated and give a warning on import with the exception of {func}`anndata.io.read_zarr` and {func}`anndata.io.read_h5ad`, which will remain at the top-level `anndata` without warning. 
{user}`ilan-gold ({pr}`1682`) +- {guilabel}`rc1` Removed deprecated modules `anndata.core` and `anndata.readwrite` {user}`ivirshup` ({pr}`1197`) +- {guilabel}`rc1` No longer export `sparse_dataset` from `anndata.experimental`, instead exporting {func}`anndata.io.sparse_dataset` {user}`ilan-gold` ({pr}`1642`) +- {guilabel}`rc1` Move `RWAble` and `InMemoryElem` out of `experimental`, renaming `RWAble` to {type}`~anndata.typing.AxisStorable` and `InMemoryElem` to {type}`~anndata.typing.RWAble` {user}`ilan-gold` ({pr}`1643`) + +#### Development Process + +- {guilabel}`rc2` Add extra `dask` dependency for installation i.e., `pip install anndata[dask]` {user}`ilan-gold` ({pr}`1677`) +- {guilabel}`rc2` Remove `shall_` from variable names in `settings` {user}`ilan-gold` ({pr}`1685`) +- {guilabel}`rc1` Create new `cupy` installation options for cuda 11 & 12 called `cu11` and `cu12` {user}`Intron7` ({pr}`1596`) + +#### Documentation + +- {guilabel}`rc1` Correct {attr}`anndata.AnnData.X` type to include {class}`~anndata.abc.CSRDataset` and {class}`~anndata.abc.CSCDataset` as possible types and being deprecation process for non-csr/csc {class}`scipy.sparse.spmatrix` types in {attr}`anndata.AnnData.X` {user}`ilan-gold` ({pr}`1616`) + +#### Features + +- Add support for ellipsis indexing of the {class}`~anndata.AnnData` object {user}`ilan-gold` ({pr}`1729`) +- {guilabel}`rc1` `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {user}`ilan-gold` {user}`isaac-virshup` ({pr}`1028`) +- {guilabel}`rc1` Allow `axis` parameter of e.g. {func}`anndata.concat` to accept `'obs'` and `'var'` {user}`flying-sheep` ({pr}`1244`) +- {guilabel}`rc1` Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {user}`ilan-gold` ({pr}`1270`) +- {guilabel}`rc1` Add {attr}`~anndata.settings.remove_unused_categories` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1340`) +- {guilabel}`rc1` Add {func}`~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {user}`ilan-gold` ({pr}`1469`) +- {guilabel}`rc1` Add ability to convert strings to categoricals on write in {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` via `convert_strings_to_categoricals` parameter {user}` falexwolf` ({pr}`1474`) +- {guilabel}`rc1` Add {attr}`~anndata.settings.check_uniqueness` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` ({pr}`1507`) +- {guilabel}`rc1` Add functionality to write from GPU {class}`dask.array.Array` to disk {user}`ilan-gold` ({pr}`1550`) +- {guilabel}`rc1` Read and write support for nullable string arrays ({class}`pandas.arrays.StringArray`). Use pandas’ {doc}`pandas:user_guide/options` `mode.string_storage` to control which storage mode is used when reading `dtype="string"` columns. 
{user}`flying-sheep` ({pr}`1558`) +- {guilabel}`rc1` Export {func}`~anndata.io.write_elem` and {func}`~anndata.io.read_elem` directly from the main package instead of `experimental` {user}`ilan-gold` ({pr}`1598`) +- {guilabel}`rc1` Allow reading sparse data (via {func}`~anndata.io.read_elem` or {func}`~anndata.io.sparse_dataset`) into either {class}`scipy.sparse.csr_array` or {class}`scipy.sparse.csc_array` via {attr}`anndata.settings.use_sparse_array_on_read` {user}`ilan-gold` ({pr}`1633`) diff --git a/docs/release-notes/0.11.1.md b/docs/release-notes/0.11.1.md new file mode 100644 index 000000000..8725ecf23 --- /dev/null +++ b/docs/release-notes/0.11.1.md @@ -0,0 +1,8 @@ +(v0.11.1)= +### 0.11.1 {small}`2024-11-12` + +### Bug fixes + +- Remove upper pin on `dask` and exclude versions broken with sparse indexing {user}`ilan-gold` ({pr}`1725`) +- Fix chunking with -1 in `chunks` argument of {func}`~anndata.experimental.read_elem_as_dask` {user}`ilan-gold` ({pr}`1743`) +- Fix `cupy<0.13` imports in non-gpu environments {user}`ilan-gold` ({pr}`1754`) diff --git a/docs/release-notes/0.5.0.md b/docs/release-notes/0.5.0.md index 554a7e5f7..d22e1bb24 100644 --- a/docs/release-notes/0.5.0.md +++ b/docs/release-notes/0.5.0.md @@ -5,6 +5,6 @@ - automatically remove unused categories after slicing - read/write [.loom](https://loompy.org) files using loompy 2 - fixed read/write for a few text file formats -- read [UMI tools] files: {func}`~anndata.read_umi_tools` +- read [UMI tools] files: {func}`~anndata.io.read_umi_tools` [umi tools]: https://github.com/CGATOxford/UMI-tools diff --git a/docs/release-notes/0.6.x.md b/docs/release-notes/0.6.x.md index a16984372..340343c7a 100644 --- a/docs/release-notes/0.6.x.md +++ b/docs/release-notes/0.6.x.md @@ -15,9 +15,9 @@ `0.6.16` {smaller}`A Wolf` - maintain dtype upon copy. `0.6.13` {smaller}`A Wolf` -- {attr}`~anndata.AnnData.layers` inspired by [.loom](https://loompy.org) files allows their information lossless reading via {func}`~anndata.read_loom`. +- {attr}`~anndata.AnnData.layers` inspired by [.loom](https://loompy.org) files allows their information lossless reading via {func}`~anndata.io.read_loom`. 
`0.6.7`–`0.6.9` {pr}`46` & {pr}`48` {smaller}`S Rybakov` -- support for reading zarr files: {func}`~anndata.read_zarr` +- support for reading zarr files: {func}`~anndata.io.read_zarr` `0.6.7` {pr}`38` {smaller}`T White` - initialization from pandas DataFrames `0.6.` {smaller}`A Wolf` diff --git a/docs/release-notes/0.7.6.md b/docs/release-notes/0.7.6.md index 5bd2779ab..2dd2e54d1 100644 --- a/docs/release-notes/0.7.6.md +++ b/docs/release-notes/0.7.6.md @@ -1,7 +1,7 @@ (v0.7.6)= ### 0.7.6 {small}`11 April, 2021` -#### New features +#### Features - Added {meth}`anndata.AnnData.to_memory` for returning an in memory object from a backed one {pr}`470` {pr}`542` {smaller}`V Bergen` {smaller}`I Virshup` - {meth}`anndata.AnnData.write_loom` now writes `obs_names` and `var_names` using the `Index`'s `.name` attribute, if set {pr}`538` {smaller}`I Virshup` @@ -18,5 +18,5 @@ #### Deprecations -- Passing positional arguments to {func}`anndata.read_loom` besides the path is now deprecated {pr}`538` {smaller}`I Virshup` -- {func}`anndata.read_loom` arguments `obsm_names` and `varm_names` are now deprecated in favour of `obsm_mapping` and `varm_mapping` {pr}`538` {smaller}`I Virshup` +- Passing positional arguments to {func}`anndata.io.read_loom` besides the path is now deprecated {pr}`538` {smaller}`I Virshup` +- {func}`anndata.io.read_loom` arguments `obsm_names` and `varm_names` are now deprecated in favour of `obsm_mapping` and `varm_mapping` {pr}`538` {smaller}`I Virshup` diff --git a/docs/release-notes/0.8.0.md b/docs/release-notes/0.8.0.md index 0bda4382d..ee5967a9c 100644 --- a/docs/release-notes/0.8.0.md +++ b/docs/release-notes/0.8.0.md @@ -15,14 +15,14 @@ This should make it much easier to support new datatypes, use partial access, an - Each element should be tagged with an `encoding_type` and `encoding_version`. See updated docs on the {doc}`file format ` - Support for nullable integer and boolean data arrays. More data types to come! -- Experimental support for low level access to the IO API via {func}`~anndata.experimental.read_elem` and {func}`~anndata.experimental.write_elem` +- Experimental support for low level access to the IO API via {func}`~anndata.io.read_elem` and {func}`~anndata.io.write_elem` #### Features - Added PyTorch dataloader {class}`~anndata.experimental.AnnLoader` and lazy concatenation object {class}`~anndata.experimental.AnnCollection`. See the [tutorials] {pr}`416` {smaller}`S Rybakov` - Compatibility with `h5ad` files written from Julia {pr}`569` {smaller}`I Kats` - Many logging messages that should have been warnings are now warnings {pr}`650` {smaller}`I Virshup` -- Significantly more efficient {func}`anndata.read_umi_tools` {pr}`661` {smaller}`I Virshup` +- Significantly more efficient {func}`anndata.io.read_umi_tools` {pr}`661` {smaller}`I Virshup` - Fixed deepcopy of a copy of a view retaining sparse matrix view mixin type {pr}`670` {smaller}`M Klein` - In many cases {attr}`~anndata.AnnData.X` can now be `None` {pr}`463` {smaller}`R Cannoodt` {pr}`677` {smaller}`I Virshup`. Remaining work is documented in {issue}`467`. 
- Removed hard `xlrd` dependency {smaller}`I Virshup` diff --git a/docs/release-notes/0.9.0.md b/docs/release-notes/0.9.0.md index d38c7f78c..3481ade4c 100644 --- a/docs/release-notes/0.9.0.md +++ b/docs/release-notes/0.9.0.md @@ -21,7 +21,7 @@ - {doc}`/interoperability`: new page on interoperability with other packages {pr}`831` {user}`ivirshup` -- Expanded docstring more documentation for `backed` argument of {func}`anndata.read_h5ad` {pr}`812` {user}`jeskowagner` +- Expanded docstring more documentation for `backed` argument of {func}`anndata.io.read_h5ad` {pr}`812` {user}`jeskowagner` - Documented how to use alternative compression methods for the `h5ad` file format, see {meth}`AnnData.write_h5ad() ` {pr}`857` {user}`nigeil` diff --git a/docs/release-notes/0.9.1.md b/docs/release-notes/0.9.1.md index f90672d60..383085122 100644 --- a/docs/release-notes/0.9.1.md +++ b/docs/release-notes/0.9.1.md @@ -1,6 +1,6 @@ (v0.9.1)= ### 0.9.1 {small}`2023-04-11` -#### Bugfix +#### Bug fixes * Fixing windows support {pr}`958` {user}`Koncopd` diff --git a/docs/release-notes/0.9.2.md b/docs/release-notes/0.9.2.md index 286f43b3d..de88c29a8 100644 --- a/docs/release-notes/0.9.2.md +++ b/docs/release-notes/0.9.2.md @@ -1,9 +1,9 @@ (v0.9.2)= ### 0.9.2 {small}`2023-07-25` -#### Bugfix +#### Bug fixes * Views of `awkward.Array`s now work with `awkward>=2.3` {pr}`1040` {user}`ivirshup` * Fix ufuncs of views like `adata.X[:10].cov(axis=0)` returning views {pr}`1043` {user}`flying-sheep` * Fix instantiating AnnData where `.X` is a `DataFrame` with an integer valued index {pr}`1002` {user}`flying-sheep` -* Fix {func}`~anndata.read_zarr` when used on `zarr.Group` {pr}`1057` {user}`ivirshup` +* Fix {func}`~anndata.io.read_zarr` when used on `zarr.Group` {pr}`1057` {user}`ivirshup` diff --git a/docs/release-notes/1028.feature.md b/docs/release-notes/1028.feature.md deleted file mode 100644 index 8b2f612f9..000000000 --- a/docs/release-notes/1028.feature.md +++ /dev/null @@ -1 +0,0 @@ -`scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {user}`ilan-gold` {user}`isaac-virshup` diff --git a/docs/release-notes/1197.breaking.md b/docs/release-notes/1197.breaking.md deleted file mode 100644 index 165e712cc..000000000 --- a/docs/release-notes/1197.breaking.md +++ /dev/null @@ -1 +0,0 @@ -Removed deprecated modules `anndata.core` and `anndata.readwrite` {user}`ivirshup` diff --git a/docs/release-notes/1244.feature.md b/docs/release-notes/1244.feature.md deleted file mode 100644 index 9a0fd6c30..000000000 --- a/docs/release-notes/1244.feature.md +++ /dev/null @@ -1 +0,0 @@ -Allow `axis` parameter of e.g. 
:func:`anndata.concat` to accept `'obs'` and `'var'` {user}`flying-sheep` diff --git a/docs/release-notes/1270.feature.md b/docs/release-notes/1270.feature.md deleted file mode 100644 index 89f07264c..000000000 --- a/docs/release-notes/1270.feature.md +++ /dev/null @@ -1 +0,0 @@ -Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {user}`ilan-gold` diff --git a/docs/release-notes/1340.feature.md b/docs/release-notes/1340.feature.md deleted file mode 100644 index 8991d630f..000000000 --- a/docs/release-notes/1340.feature.md +++ /dev/null @@ -1 +0,0 @@ -Add {attr}`~anndata.settings.should_remove_unused_categories` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` diff --git a/docs/release-notes/1469.feature.md b/docs/release-notes/1469.feature.md deleted file mode 100644 index abe84f7f6..000000000 --- a/docs/release-notes/1469.feature.md +++ /dev/null @@ -1 +0,0 @@ -Add :func:`~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {user}`ilan-gold` diff --git a/docs/release-notes/1474.feature.md b/docs/release-notes/1474.feature.md deleted file mode 100644 index 9c85d982f..000000000 --- a/docs/release-notes/1474.feature.md +++ /dev/null @@ -1 +0,0 @@ -Add ability to convert strings to categoricals on write in {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` via `convert_strings_to_categoricals` parameter {user}` falexwolf` diff --git a/docs/release-notes/1507.feature.md b/docs/release-notes/1507.feature.md deleted file mode 100644 index 13c6224ef..000000000 --- a/docs/release-notes/1507.feature.md +++ /dev/null @@ -1 +0,0 @@ -Add {attr}`~anndata.settings.should_check_uniqueness` option to {attr}`anndata.settings` to override current behavior {user}`ilan-gold` diff --git a/docs/release-notes/1550.feature.md b/docs/release-notes/1550.feature.md deleted file mode 100644 index bd1bfd37d..000000000 --- a/docs/release-notes/1550.feature.md +++ /dev/null @@ -1 +0,0 @@ -Add functionality to write from GPU {class}`dask.array.Array` to disk {user}`ilan-gold` diff --git a/docs/release-notes/1596.dev.md b/docs/release-notes/1596.dev.md deleted file mode 100644 index e1b3492ed..000000000 --- a/docs/release-notes/1596.dev.md +++ /dev/null @@ -1 +0,0 @@ -create new `cupy` installation options for cuda 11 & 12 called `cu11` and `cu12` {user}`Intron7` diff --git a/hatch.toml b/hatch.toml index ad888c3bb..738056567 100644 --- a/hatch.toml +++ b/hatch.toml @@ -4,8 +4,26 @@ features = ["dev"] [envs.docs] features = ["doc"] -dependencies = ["setuptools"] # https://bitbucket.org/pybtex-devs/pybtex/issues/169 +extra-dependencies = ["setuptools"] # https://bitbucket.org/pybtex-devs/pybtex/issues/169 +scripts.build = "sphinx-build -M html docs docs/_build -W --keep-going {args}" +scripts.clean = "git clean -fdX -- {args:docs}" -[envs.docs.scripts] -build = "sphinx-build -M html docs docs/_build -W --keep-going {args}" -clean = "git clean -fX -- docs" +[envs.towncrier] +scripts.build = "python3 ci/scripts/towncrier_automation.py {args}" +scripts.clean = "git restore --source=HEAD --staged --worktree -- docs/release-notes" + +[envs.hatch-test] +default-args = [] +extra-dependencies = ["ipykernel"] +features = ["dev", "test"] +overrides.matrix.deps.env-vars = [ + { key = "UV_PRERELEASE", value = "allow", if = ["pre"] }, + { key = "UV_RESOLUTION", value = "lowest-direct", if = ["min"] }, +] +overrides.matrix.deps.python = [ + { if = ["min"], value = "3.10" }, 
+ { if = ["stable", "pre"], value = "3.12" }, +] + +[[envs.hatch-test.matrix]] +deps = ["stable", "pre", "min"] diff --git a/pyproject.toml b/pyproject.toml index 44fa24401..3cc1b31a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = ["hatchling", "hatch-vcs"] [project] name = "anndata" description = "Annotated data." -requires-python = ">=3.9" +requires-python = ">=3.10" license = "BSD-3-Clause" authors = [ { name = "Philipp Angerer" }, @@ -29,7 +29,6 @@ classifiers = [ "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -43,7 +42,7 @@ dependencies = [ "numpy>=1.23", # https://github.com/scverse/anndata/issues/1434 "scipy >1.8", - "h5py>=3.1", + "h5py>=3.6", "exceptiongroup; python_version<'3.11'", "natsort", "packaging>=20.0", @@ -61,10 +60,8 @@ Home-page = "https://github.com/scverse/anndata" [project.optional-dependencies] dev = [ # dev version generation - "setuptools_scm", - # test speedups - "pytest-xdist", - "towncrier>=24.8.0", + "setuptools-scm", + "anndata[dev-doc,dev-test]", ] doc = [ "sphinx>=7.4.6", @@ -72,17 +69,20 @@ doc = [ "sphinx-autodoc-typehints>=2.2.0", "sphinx-issues", "sphinx-copybutton", - "sphinx-toolbox", + "sphinx-toolbox>=3.8.0", "sphinxext.opengraph", "nbsphinx", - "scanpydoc[theme,typehints] >=0.13.6", + "scanpydoc[theme,typehints] >=0.14.1", "zarr", "awkward>=2.0.7", "IPython", # For syntax highlighting in notebooks "myst_parser", "sphinx_design>=0.5.0", "readthedocs-sphinx-search", + # for unreleased changes + "anndata[dev-doc]", ] +dev-doc = ["towncrier>=24.8.0"] # release notes tool test = [ "loompy>=3.0.5", "pytest>=8.2", @@ -95,26 +95,34 @@ test = [ "boltons", "scanpy", "httpx", # For data downloading - "dask[array,distributed]>=2022.09.2,<2024.8.0", + "dask[distributed]", "awkward>=2.3", "pyarrow", "pytest_memray", - "pytest-mock" + "pytest-mock", + "anndata[dask]", ] +dev-test = ["pytest-xdist"] # local test speedups gpu = ["cupy"] cu12 = ["cupy-cuda12x"] cu11 = ["cupy-cuda11x"] +# https://github.com/dask/dask/issues/11290 +dask = ["dask[array]>=2022.09.2,!=2024.8.*,!=2024.9.*"] [tool.hatch.version] source = "vcs" [tool.hatch.build.hooks.vcs] version-file = "src/anndata/_version.py" +raw-options.version_scheme = "release-branch-semver" [tool.hatch.build.targets.wheel] packages = ["src/anndata", "src/testing"] [tool.coverage.run] +data_file = "test-data/coverage" source_pkgs = ["anndata"] omit = ["src/anndata/_version.py", "**/test_*.py"] +[tool.coverage.xml] +output = "test-data/coverage.xml" [tool.coverage.paths] source = ["./src", "**/site-packages"] @@ -165,12 +173,15 @@ select = [ "E", # Error detected by Pycodestyle "F", # Errors detected by Pyflakes "W", # Warning detected by Pycodestyle + "PLW", # Pylint "UP", # pyupgrade "I", # isort "TCH", # manage type checking blocks + "TID", # Banned imports "ICN", # Follow import conventions "PTH", # Pathlib instead of os.path "PT", # Pytest conventions + "PYI", # Typing ] ignore = [ # line too long -> we accept long comment lines; formatter gets rid of long code lines @@ -179,6 +190,10 @@ ignore = [ "E731", # allow I, O, l as variable names -> I is the identity matrix, i, j, k, l is reasonable indexing notation "E741", + # We use relative imports from parent modules + "TID252", + # Shadowing loop variables isn’t a big deal + "PLW2901", ] 
[tool.ruff.lint.per-file-ignores] # E721 comparing types, but we specifically are checking that we aren't getting subtypes (views) @@ -186,6 +201,10 @@ ignore = [ [tool.ruff.lint.isort] known-first-party = ["anndata"] required-imports = ["from __future__ import annotations"] +[tool.ruff.lint.flake8-tidy-imports.banned-api] +"subprocess.call".msg = "Use `subprocess.run([…])` instead" +"subprocess.check_call".msg = "Use `subprocess.run([…], check=True)` instead" +"subprocess.check_output".msg = "Use `subprocess.run([…], check=True, capture_output=True)` instead" [tool.ruff.lint.flake8-type-checking] exempt-modules = [] strict = true @@ -202,16 +221,10 @@ single_file = false package_dir = "src" issue_format = "{{pr}}`{issue}`" title_format = "(v{version})=\n### {version} {{small}}`{project_date}`" -[tool.towncrier.fragment.bugfix] -[tool.towncrier.fragment.doc] -[tool.towncrier.fragment.feature] -[tool.towncrier.fragment.misc] - -[tool.towncrier.fragment.performance] -name = "Performance" - -[tool.towncrier.fragment.breaking] -name = "Breaking" - -[tool.towncrier.fragment.dev] -name = "Development Process" +fragment.bugfix.name = "Bug fixes" +fragment.doc.name = "Documentation" +fragment.feature.name = "Features" +fragment.misc.name = "Miscellaneous improvements" +fragment.performance.name = "Performance" +fragment.breaking.name = "Breaking changes" +fragment.dev.name = "Development Process" diff --git a/src/anndata/__init__.py b/src/anndata/__init__.py index c2006cd72..fec027c87 100644 --- a/src/anndata/__init__.py +++ b/src/anndata/__init__.py @@ -2,6 +2,11 @@ from __future__ import annotations +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing import Any + try: # See https://github.com/maresb/hatch-vcs-footgun-example from setuptools_scm import get_version @@ -24,17 +29,6 @@ from ._core.anndata import AnnData from ._core.merge import concat from ._core.raw import Raw -from ._io import ( - read_csv, - read_excel, - read_h5ad, - read_hdf, - read_loom, - read_mtx, - read_text, - read_umi_tools, - read_zarr, -) from ._settings import settings from ._warnings import ( ExperimentalFeatureWarning, @@ -42,12 +36,14 @@ OldFormatWarning, WriteWarning, ) +from .io import read_h5ad, read_zarr +from .utils import module_get_attr_redirect -# Experimental needs to be imported last -from . import experimental # isort: skip +# Submodules need to be imported last +from . import abc, experimental, typing, io # noqa: E402 isort: skip # We use these in tests by attribute access -from . import _io, logging # noqa: F401 isort: skip +from . 
import logging # noqa: F401, E402 isort: skip def read(*args, **kwargs): @@ -61,12 +57,7 @@ def read(*args, **kwargs): return read_h5ad(*args, **kwargs) -__all__ = [ - "__version__", - "AnnData", - "concat", - "Raw", - "read_h5ad", +_DEPRECATED_IO = ( "read_loom", "read_hdf", "read_excel", @@ -74,11 +65,34 @@ def read(*args, **kwargs): "read_csv", "read_text", "read_mtx", +) +_DEPRECATED = dict((method, f"io.{method}") for method in _DEPRECATED_IO) + + +def __getattr__(attr_name: str) -> Any: + return module_get_attr_redirect(attr_name, deprecated_mapping=_DEPRECATED) + + +__all__ = [ + # Attributes + "__version__", + "settings", + # Submodules + "abc", + "experimental", + "typing", + "io", + # Classes + "AnnData", + "Raw", + # Functions + "concat", "read_zarr", + "read_h5ad", + "read", + # Warnings "OldFormatWarning", "WriteWarning", "ImplicitModificationWarning", "ExperimentalFeatureWarning", - "experimental", - "settings", ] diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index e2f6e4352..9df5ac977 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -5,7 +5,7 @@ from collections.abc import MutableMapping, Sequence from copy import copy from dataclasses import dataclass -from typing import TYPE_CHECKING, Generic, TypeVar, Union +from typing import TYPE_CHECKING, Generic, TypeVar import numpy as np import pandas as pd @@ -33,10 +33,10 @@ from .raw import Raw -OneDIdx = Union[Sequence[int], Sequence[bool], slice] +OneDIdx = Sequence[int] | Sequence[bool] | slice TwoDIdx = tuple[OneDIdx, OneDIdx] # TODO: pd.DataFrame only allowed in AxisArrays? -Value = Union[pd.DataFrame, spmatrix, np.ndarray] +Value = pd.DataFrame | spmatrix | np.ndarray P = TypeVar("P", bound="AlignedMappingBase") """Parent mapping an AlignedView is based on.""" @@ -376,9 +376,14 @@ class PairwiseArraysView(AlignedView[PairwiseArraysBase, OneDIdx], PairwiseArray PairwiseArraysBase._actual_class = PairwiseArrays -AlignedMapping = Union[ - AxisArrays, AxisArraysView, Layers, LayersView, PairwiseArrays, PairwiseArraysView -] +AlignedMapping = ( + AxisArrays + | AxisArraysView + | Layers + | LayersView + | PairwiseArrays + | PairwiseArraysView +) T = TypeVar("T", bound=AlignedMapping) """Pair of types to be aligned.""" @@ -408,9 +413,7 @@ def fget(self) -> Callable[[], None]: def fake(): ... 
- fake.__annotations__ = { - "return": Union[self.cls._actual_class, self.cls._view_class] - } + fake.__annotations__ = {"return": self.cls._actual_class | self.cls._view_class} return fake def __get__(self, obj: None | AnnData, objtype: type | None = None) -> T: diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 49d47665d..8a8eaf949 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -52,9 +52,10 @@ from os import PathLike from typing import Any, Literal + from ..compat import Index1D + from ..typing import ArrayDataStructureType from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView - from .index import Index, Index1D - from .views import ArrayView + from .index import Index # for backwards compat @@ -134,15 +135,15 @@ class AnnData(metaclass=utils.DeprecationMixinMeta): See Also -------- - read_h5ad - read_csv - read_excel - read_hdf - read_loom - read_zarr - read_mtx - read_text - read_umi_tools + io.read_h5ad + io.read_csv + io.read_excel + io.read_hdf + io.read_loom + io.read_zarr + io.read_mtx + io.read_text + io.read_umi_tools Notes ----- @@ -272,12 +273,12 @@ def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index): "that is, you cannot make a view of a view." ) self._is_view = True - if isinstance(oidx, (int, np.integer)): + if isinstance(oidx, int | np.integer): if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs): raise IndexError(f"Observation index `{oidx}` is out of range.") oidx += adata_ref.n_obs * (oidx < 0) oidx = slice(oidx, oidx + 1, 1) - if isinstance(vidx, (int, np.integer)): + if isinstance(vidx, int | np.integer): if not (-adata_ref.n_vars <= vidx < adata_ref.n_vars): raise IndexError(f"Variable index `{vidx}` is out of range.") vidx += adata_ref.n_vars * (vidx < 0) @@ -297,7 +298,7 @@ def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index): var_sub = adata_ref.var.iloc[vidx] # fix categories uns = copy(adata_ref._uns) - if settings.should_remove_unused_categories: + if settings.remove_unused_categories: self._remove_unused_categories(adata_ref.obs, obs_sub, uns) self._remove_unused_categories(adata_ref.var, var_sub, uns) # set attributes @@ -406,7 +407,7 @@ def _init_as_actual( # as in readwrite.read_10x_h5 if X.dtype != np.dtype(dtype): X = X.astype(dtype) - elif isinstance(X, (ZarrArray, DaskArray)): + elif isinstance(X, ZarrArray | DaskArray): X = X.astype(dtype) else: # is np.ndarray or a subclass, convert to true np.ndarray X = np.asarray(X, dtype) @@ -447,7 +448,7 @@ def _init_as_actual( # Backwards compat for connectivities matrices in uns["neighbors"] _move_adj_mtx({"uns": self._uns, "obsp": self._obsp}) self._check_dimensions() - if settings.should_check_uniqueness: + if settings.check_uniqueness: self._check_uniqueness() if self.filename: @@ -541,7 +542,7 @@ def shape(self) -> tuple[int, int]: return self.n_obs, self.n_vars @property - def X(self) -> np.ndarray | sparse.spmatrix | SpArray | ArrayView | None: + def X(self) -> ArrayDataStructureType | None: """Data matrix of shape :attr:`n_obs` × :attr:`n_vars`.""" if self.isbacked: if not self.file.is_open: @@ -696,7 +697,7 @@ def raw(self) -> Raw: The :attr:`raw` attribute is initialized with the current content of an object by setting:: - adata.raw = adata + adata.raw = adata.copy() Its content can be deleted:: @@ -763,16 +764,14 @@ def _prep_dim_index(self, value, attr: str) -> pd.Index: raise ValueError( f"Length of passed value for {attr}_names is {len(value)}, but this AnnData has shape: 
{self.shape}" ) - if isinstance(value, pd.Index) and not isinstance( - value.name, (str, type(None)) - ): + if isinstance(value, pd.Index) and not isinstance(value.name, str | type(None)): raise ValueError( f"AnnData expects .{attr}.index.name to be a string or None, " f"but you passed a name of type {type(value.name).__name__!r}" ) else: value = pd.Index(value) - if not isinstance(value.name, (str, type(None))): + if not isinstance(value.name, str | type(None)): value.name = None if ( len(value) > 0 @@ -1170,9 +1169,7 @@ def _inplace_subset_obs(self, index: Index1D): self._init_as_actual(adata_subset) # TODO: Update, possibly remove - def __setitem__( - self, index: Index, val: int | float | np.ndarray | sparse.spmatrix - ): + def __setitem__(self, index: Index, val: float | np.ndarray | sparse.spmatrix): if self.is_view: raise ValueError("Object is view and cannot be accessed with `[]`.") obs, var = self._normalize_indices(index) @@ -1399,7 +1396,7 @@ def to_memory(self, copy=False) -> AnnData: .. code:: python import anndata - backed = anndata.read_h5ad("file.h5ad", backed="r") + backed = anndata.io.read_h5ad("file.h5ad", backed="r") mem = backed[backed.obs["cluster"] == "a", :].to_memory() """ new = {} @@ -1444,7 +1441,7 @@ def copy(self, filename: PathLike | None = None) -> AnnData: else: return self._mutated_copy() else: - from .._io import read_h5ad, write_h5ad + from ..io import read_h5ad, write_h5ad if filename is None: raise ValueError( @@ -1858,7 +1855,7 @@ def write_h5ad( Sparse arrays in AnnData object to write as dense. Currently only supports `X` and `raw/X`. """ - from .._io import write_h5ad + from ..io import write_h5ad if filename is None and not self.isbacked: raise ValueError("Provide a filename!") @@ -1894,7 +1891,7 @@ def write_csvs(self, dirname: PathLike, skip_data: bool = True, sep: str = ","): sep Separator for the data. """ - from .._io import write_csvs + from ..io import write_csvs write_csvs(dirname, self, skip_data=skip_data, sep=sep) @@ -1907,7 +1904,7 @@ def write_loom(self, filename: PathLike, write_obsm_varm: bool = False): filename The filename. """ - from .._io import write_loom + from ..io import write_loom write_loom(filename, self, write_obsm_varm=write_obsm_varm) @@ -1926,7 +1923,7 @@ def write_zarr( chunks Chunk shape. 
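Earlier in this `anndata.py` hunk, the `.raw` docstring was changed to recommend assigning a copy rather than the object itself. A small sketch of that usage pattern (the shapes are illustrative, not taken from the diff):

```python
import anndata as ad
import numpy as np

adata = ad.AnnData(np.arange(20, dtype=np.float32).reshape(5, 4))
adata.raw = adata.copy()   # snapshot a copy, per the updated docstring

adata = adata[:, 0:2]      # later variable subsetting...
assert adata.shape == (5, 2)
assert adata.raw.shape == (5, 4)  # ...leaves the raw snapshot at full width
```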
""" - from .._io import write_zarr + from ..io import write_zarr write_zarr(store, self, chunks=chunks) @@ -1976,7 +1973,7 @@ def chunk_X( if isinstance(select, int): select = select if select < self.n_obs else self.n_obs choice = np.random.choice(self.n_obs, select, replace) - elif isinstance(select, (np.ndarray, Sequence)): + elif isinstance(select, np.ndarray | Sequence): choice = np.asarray(select) else: raise ValueError("select should be int or array") diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index 6a5e2fc39..f1d72ce0d 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -26,14 +26,8 @@ def _normalize_indices( if isinstance(index, pd.Series): index: Index = index.values if isinstance(index, tuple): - if len(index) > 2: - raise ValueError("AnnData can only be sliced in rows and columns.") - # deal with pd.Series # TODO: The series should probably be aligned first - if isinstance(index[1], pd.Series): - index = index[0], index[1].values - if isinstance(index[0], pd.Series): - index = index[0].values, index[1] + index = tuple(i.values if isinstance(i, pd.Series) else i for i in index) ax0, ax1 = unpack_index(index) ax0 = _normalize_index(ax0, names0) ax1 = _normalize_index(ax1, names1) @@ -70,25 +64,25 @@ def name_idx(i): stop = None if stop is None else stop + 1 step = indexer.step return slice(start, stop, step) - elif isinstance(indexer, (np.integer, int)): + elif isinstance(indexer, np.integer | int): return indexer elif isinstance(indexer, str): return index.get_loc(indexer) # int elif isinstance( - indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix, SpArray) + indexer, Sequence | np.ndarray | pd.Index | spmatrix | np.matrix | SpArray ): if hasattr(indexer, "shape") and ( (indexer.shape == (index.shape[0], 1)) or (indexer.shape == (1, index.shape[0])) ): - if isinstance(indexer, (spmatrix, SpArray)): + if isinstance(indexer, spmatrix | SpArray): indexer = indexer.toarray() indexer = np.ravel(indexer) - if not isinstance(indexer, (np.ndarray, pd.Index)): + if not isinstance(indexer, np.ndarray | pd.Index): indexer = np.array(indexer) if len(indexer) == 0: indexer = indexer.astype(int) - if issubclass(indexer.dtype.type, (np.integer, np.floating)): + if issubclass(indexer.dtype.type, np.integer | np.floating): return indexer # Might not work for range indexes elif issubclass(indexer.dtype.type, np.bool_): if indexer.shape != index.shape: @@ -107,8 +101,7 @@ def name_idx(i): "are not valid obs/ var names or indices." 
) return positions # np.ndarray[int] - else: - raise IndexError(f"Unknown indexer {indexer!r} of type {type(indexer)}") + raise IndexError(f"Unknown indexer {indexer!r} of type {type(indexer)}") def _fix_slice_bounds(s: slice, length: int) -> slice: @@ -132,13 +125,28 @@ def _fix_slice_bounds(s: slice, length: int) -> slice: def unpack_index(index: Index) -> tuple[Index1D, Index1D]: if not isinstance(index, tuple): + if index is Ellipsis: + index = slice(None) return index, slice(None) - elif len(index) == 2: + num_ellipsis = sum(i is Ellipsis for i in index) + if num_ellipsis > 1: + raise IndexError("an index can only have a single ellipsis ('...')") + # If index has Ellipsis, filter it out (and if not, error) + if len(index) > 2: + if not num_ellipsis: + raise IndexError("Received a length 3 index without an ellipsis") + index = tuple(i for i in index if i is not Ellipsis) return index - elif len(index) == 1: - return index[0], slice(None) - else: - raise IndexError("invalid number of indices") + # If index has Ellipsis, replace it with slice + if len(index) == 2: + index = tuple(slice(None) if i is Ellipsis else i for i in index) + return index + if len(index) == 1: + index = index[0] + if index is Ellipsis: + index = slice(None) + return index, slice(None) + raise IndexError("invalid number of indices") @singledispatch diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 85d5b31ca..0dfa5dab2 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -174,7 +174,7 @@ def equal_sparse(a, b) -> bool: xp = array_api_compat.array_namespace(a.data) - if isinstance(b, (CupySparseMatrix, sparse.spmatrix, SpArray)): + if isinstance(b, CupySparseMatrix | sparse.spmatrix | SpArray): if isinstance(a, CupySparseMatrix): # Comparison broken for CSC matrices # https://github.com/cupy/cupy/issues/7757 @@ -206,7 +206,7 @@ def equal_awkward(a, b) -> bool: def as_sparse(x, use_sparse_array=False): - if not isinstance(x, (sparse.spmatrix, SpArray)): + if not isinstance(x, sparse.spmatrix | SpArray): if CAN_USE_SPARSE_ARRAY and use_sparse_array: return sparse.csr_array(x) return sparse.csr_matrix(x) @@ -536,7 +536,7 @@ def apply(self, el, *, axis, fill_value=None): return el if isinstance(el, pd.DataFrame): return self._apply_to_df(el, axis=axis, fill_value=fill_value) - elif isinstance(el, (sparse.spmatrix, SpArray, CupySparseMatrix)): + elif isinstance(el, sparse.spmatrix | SpArray | CupySparseMatrix): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) elif isinstance(el, AwkArray): return self._apply_to_awkward(el, axis=axis, fill_value=fill_value) @@ -723,7 +723,14 @@ def default_fill_value(els): This is largely due to backwards compat, and might not be the ideal solution. 
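The `unpack_index` rewrite in the `index.py` hunk above adds `Ellipsis` handling to AnnData indexing. A short sketch of the behaviour implied by that code (inferred from the hunk, not from released documentation):

```python
import anndata as ad
import numpy as np

adata = ad.AnnData(np.arange(12, dtype=np.float32).reshape(4, 3))

# '...' stands in for "select everything" on the axes it covers.
assert adata[...].shape == adata.shape
assert adata[..., 0:2].shape == adata[:, 0:2].shape == (4, 2)
assert adata[1:3, ...].shape == adata[1:3, :].shape == (2, 3)

# A single index may contain at most one ellipsis; more raise IndexError.
```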
""" - if any(isinstance(el, (sparse.spmatrix, SpArray)) for el in els): + if any( + isinstance(el, sparse.spmatrix | SpArray) + or ( + isinstance(el, DaskArray) + and isinstance(el._meta, sparse.spmatrix | SpArray) + ) + for el in els + ): return 0 else: return np.nan @@ -737,8 +744,8 @@ def gen_reindexer(new_var: pd.Index, cur_var: pd.Index): Usage ----- - >>> a = AnnData(sparse.eye(3), var=pd.DataFrame(index=list("abc"))) - >>> b = AnnData(sparse.eye(2), var=pd.DataFrame(index=list("ba"))) + >>> a = AnnData(sparse.eye(3, format="csr"), var=pd.DataFrame(index=list("abc"))) + >>> b = AnnData(sparse.eye(2, format="csr"), var=pd.DataFrame(index=list("ba"))) >>> reindexer = gen_reindexer(a.var_names, b.var_names) >>> sparse.vstack([a.X, reindexer(b.X)]).toarray() array([[1., 0., 0.], @@ -794,7 +801,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): import cupyx.scipy.sparse as cpsparse if not all( - isinstance(a, (CupySparseMatrix, CupyArray)) or 0 in a.shape for a in arrays + isinstance(a, CupySparseMatrix | CupyArray) or 0 in a.shape for a in arrays ): raise NotImplementedError( "Cannot concatenate a cupy array with other array types." @@ -821,7 +828,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ], axis=axis, ) - elif any(isinstance(a, (sparse.spmatrix, SpArray)) for a in arrays): + elif any(isinstance(a, sparse.spmatrix | SpArray) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] use_sparse_array = any(issubclass(type(a), SpArray) for a in arrays) return sparse_stack( @@ -980,7 +987,7 @@ def concat_pairwise_mapping( els = [ m.get(k, sparse_class((s, s), dtype=bool)) for m, s in zip(mappings, shapes) ] - if all(isinstance(el, (CupySparseMatrix, CupyArray)) for el in els): + if all(isinstance(el, CupySparseMatrix | CupyArray) for el in els): result[k] = _cp_block_diag(els, format="csr") elif all(isinstance(el, DaskArray) for el in els): result[k] = _dask_block_diag(els) diff --git a/src/anndata/_core/raw.py b/src/anndata/_core/raw.py index 7237c06b4..d138440b5 100644 --- a/src/anndata/_core/raw.py +++ b/src/anndata/_core/raw.py @@ -40,7 +40,7 @@ def __init__( # construct manually if adata.isbacked == (X is None): # Move from GPU to CPU since it's large and not always used - if isinstance(X, (CupyArray, CupySparseMatrix)): + if isinstance(X, CupyArray | CupySparseMatrix): self._X = X.get() else: self._X = X @@ -51,7 +51,7 @@ def __init__( self.varm = varm elif X is None: # construct from adata # Move from GPU to CPU since it's large and not always used - if isinstance(adata.X, (CupyArray, CupySparseMatrix)): + if isinstance(adata.X, CupyArray | CupySparseMatrix): self._X = adata.X.get() else: self._X = adata.X.copy() @@ -124,9 +124,9 @@ def __getitem__(self, index): oidx, vidx = self._normalize_indices(index) # To preserve two dimensional shape - if isinstance(vidx, (int, np.integer)): + if isinstance(vidx, int | np.integer): vidx = slice(vidx, vidx + 1, 1) - if isinstance(oidx, (int, np.integer)): + if isinstance(oidx, int | np.integer): oidx = slice(oidx, oidx + 1, 1) if not self._adata.isbacked: diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py index 12a5ef19b..ae6b47c7f 100644 --- a/src/anndata/_core/sparse_dataset.py +++ b/src/anndata/_core/sparse_dataset.py @@ -26,26 +26,21 @@ import scipy.sparse as ss from scipy.sparse import _sparsetools -from anndata._core.index import _fix_slice_bounds -from anndata.compat import H5Group, ZarrArray, ZarrGroup - -from ..compat 
import SpArray, _read_attr - -try: - # Not really important, just for IDEs to be more helpful - from scipy.sparse._compressed import _cs_matrix -except ImportError: - from scipy.sparse import spmatrix as _cs_matrix - - -from .index import _subset, unpack_index +from .. import abc +from .._settings import settings +from ..compat import H5Group, SpArray, ZarrArray, ZarrGroup, _read_attr +from .index import _fix_slice_bounds, _subset, unpack_index if TYPE_CHECKING: from collections.abc import Sequence from typing import Literal + from scipy.sparse._compressed import _cs_matrix + from .._types import GroupStorageType from .index import Index +else: + from scipy.sparse import spmatrix as _cs_matrix class BackedFormat(NamedTuple): @@ -234,6 +229,8 @@ def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix: FORMATS = [ BackedFormat("csr", backed_csr_matrix, ss.csr_matrix), BackedFormat("csc", backed_csc_matrix, ss.csc_matrix), + BackedFormat("csr", backed_csr_matrix, ss.csr_array), + BackedFormat("csc", backed_csc_matrix, ss.csc_array), ] @@ -346,25 +343,19 @@ def _get_group_format(group: GroupStorageType) -> str: def is_sparse_indexing_overridden(format: Literal["csr", "csc"], row, col): major_indexer, minor_indexer = (row, col) if format == "csr" else (col, row) return isinstance(minor_indexer, slice) and ( - (isinstance(major_indexer, (int, np.integer))) + (isinstance(major_indexer, int | np.integer)) or (isinstance(major_indexer, slice)) or (isinstance(major_indexer, np.ndarray) and major_indexer.ndim == 1) ) -class BaseCompressedSparseDataset(ABC): - """Analogous to :class:`h5py.Dataset ` or `zarr.Array`, but for sparse matrices.""" - - format: Literal["csr", "csc"] +class BaseCompressedSparseDataset(abc._AbstractCSDataset, ABC): _group: GroupStorageType def __init__(self, group: GroupStorageType): type(self)._check_group_format(group) self._group = group - shape: tuple[int, int] - """Shape of the matrix.""" - @property def group(self) -> GroupStorageType: """The group underlying the backed matrix.""" @@ -378,6 +369,7 @@ def group(self, val): @property def backend(self) -> Literal["zarr", "hdf5"]: + """Which file type is used on-disk.""" if isinstance(self.group, ZarrGroup): return "zarr" elif isinstance(self.group, H5Group): @@ -387,6 +379,7 @@ def backend(self) -> Literal["zarr", "hdf5"]: @property def dtype(self) -> np.dtype: + """The :class:`numpy.dtype` of the `data` attribute of the sparse matrix.""" return self.group["data"].dtype @classmethod @@ -395,43 +388,26 @@ def _check_group_format(cls, group): assert group_format == cls.format @property - def format_str(self) -> Literal["csr", "csc"]: - """DEPRECATED Use .format instead.""" - warnings.warn( - "The attribute .format_str is deprecated and will be removed in the anndata 0.11.0. " - "Please use .format instead.", - FutureWarning, - ) - return self.format - - @property - def name(self) -> str: + def _name(self) -> str: + """Name of the group.""" return self.group.name @property def shape(self) -> tuple[int, int]: + """Shape of the matrix read off disk.""" shape = _read_attr(self.group.attrs, "shape", None) if shape is None: # TODO warn shape = self.group.attrs.get("h5sparse_shape") return tuple(map(int, shape)) - @property - def value(self) -> ss.csr_matrix | ss.csc_matrix: - """DEPRECATED Use .to_memory() instead.""" - warnings.warn( - "The .value attribute is deprecated and will be removed in the anndata 0.11.0. 
" - "Please use .to_memory() instead.", - FutureWarning, - ) - return self.to_memory() - def __repr__(self) -> str: - return f"{type(self).__name__}: backend {self.backend}, shape {self.shape}, data_dtype {self.dtype}" + name = type(self).__name__.removeprefix("_") + return f"{name}: backend {self.backend}, shape {self.shape}, data_dtype {self.dtype}" def __getitem__( self, index: Index | tuple[()] - ) -> float | ss.csr_matrix | ss.csc_matrix: + ) -> float | ss.csr_matrix | ss.csc_matrix | SpArray: indices = self._normalize_index(index) row, col = indices mtx = self._to_backed() @@ -458,8 +434,15 @@ def __getitem__( # If indexing is array x array it returns a backed_sparse_matrix # Not sure what the performance is on that operation - if isinstance(sub, BackedSparseMatrix): - return get_memory_class(self.format)(sub) + # Also need to check if memory format is not matrix + mtx_fmt = get_memory_class( + self.format, use_sparray_in_io=settings.use_sparse_array_on_read + ) + must_convert_to_array = issubclass(mtx_fmt, SpArray) and not isinstance( + sub, SpArray + ) + if isinstance(sub, BackedSparseMatrix) or must_convert_to_array: + return mtx_fmt(sub) else: return sub @@ -483,7 +466,25 @@ def __setitem__(self, index: Index | tuple[()], value) -> None: mock_matrix[row, col] = value # TODO: split to other classes? - def append(self, sparse_matrix: _cs_matrix | SpArray) -> None: + def append(self, sparse_matrix: ss.csr_matrix | ss.csc_matrix | SpArray) -> None: + """Append an in-memory or on-disk sparse matrix to the current object's store. + + Parameters + ---------- + sparse_matrix + The matrix to append. + + Raises + ------ + NotImplementedError + If the matrix to append is not one of :class:`~scipy.sparse.csr_array`, :class:`~scipy.sparse.csc_array`, :class:`~scipy.sparse.csr_matrix`, or :class:`~scipy.sparse.csc_matrix`. + ValueError + If both the on-disk and to-append matrices are not of the same format i.e., `csr` or `csc`. + OverflowError + If the underlying data store has a 32 bit indptr, and the new matrix is too large to fit in it i.e., would cause a 64 bit `indptr` to be written. + AssertionError + If the on-disk data does not have `csc` or `csr` format. + """ # Prep variables shape = self.shape if isinstance(sparse_matrix, BaseCompressedSparseDataset): @@ -546,7 +547,7 @@ def append(self, sparse_matrix: _cs_matrix | SpArray) -> None: ) # Clear cached property if hasattr(self, "indptr"): - del self.indptr + del self._indptr # indices indices = self.group["indices"] @@ -555,7 +556,7 @@ def append(self, sparse_matrix: _cs_matrix | SpArray) -> None: indices[orig_data_size:] = sparse_matrix.indices @cached_property - def indptr(self) -> np.ndarray: + def _indptr(self) -> np.ndarray: """\ Other than `data` and `indices`, this is only as long as the major axis @@ -569,39 +570,29 @@ def _to_backed(self) -> BackedSparseMatrix: mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"] mtx.indices = self.group["indices"] - mtx.indptr = self.indptr + mtx.indptr = self._indptr return mtx - def to_memory(self) -> ss.csr_matrix | ss.csc_matrix: - format_class = get_memory_class(self.format) + def to_memory(self) -> ss.csr_matrix | ss.csc_matrix | SpArray: + format_class = get_memory_class( + self.format, use_sparray_in_io=settings.use_sparse_array_on_read + ) mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"][...] mtx.indices = self.group["indices"][...] 
- mtx.indptr = self.indptr + mtx.indptr = self._indptr return mtx -_sparse_dataset_doc = """\ - On disk {format} sparse matrix. +class _CSRDataset(BaseCompressedSparseDataset, abc.CSRDataset): + """Internal concrete version of :class:`anndata.abc.CSRDataset`.""" - Parameters - ---------- - group - The backing group store. -""" +class _CSCDataset(BaseCompressedSparseDataset, abc.CSCDataset): + """Internal concrete version of :class:`anndata.abc.CSRDataset`.""" -class CSRDataset(BaseCompressedSparseDataset): - __doc__ = _sparse_dataset_doc.format(format="CSR") - format = "csr" - -class CSCDataset(BaseCompressedSparseDataset): - __doc__ = _sparse_dataset_doc.format(format="CSC") - format = "csc" - - -def sparse_dataset(group: GroupStorageType) -> CSRDataset | CSCDataset: +def sparse_dataset(group: GroupStorageType) -> abc.CSRDataset | abc.CSCDataset: """Generates a backed mode-compatible sparse dataset class. Parameters @@ -620,7 +611,8 @@ def sparse_dataset(group: GroupStorageType) -> CSRDataset | CSCDataset: >>> import scanpy as sc >>> import h5py - >>> from anndata.experimental import sparse_dataset, read_elem + >>> from anndata.io import sparse_dataset + >>> from anndata.io import read_elem >>> sc.datasets.pbmc68k_reduced().raw.to_adata().write_h5ad("pbmc.h5ad") Initialize a sparse dataset from storage @@ -653,38 +645,12 @@ def sparse_dataset(group: GroupStorageType) -> CSRDataset | CSCDataset: """ encoding_type = _get_group_format(group) if encoding_type == "csr": - return CSRDataset(group) + return _CSRDataset(group) elif encoding_type == "csc": - return CSCDataset(group) + return _CSCDataset(group) + raise ValueError(f"Unknown encoding type {encoding_type}") @_subset.register(BaseCompressedSparseDataset) def subset_sparsedataset(d, subset_idx): return d[subset_idx] - - -## Backwards compat - -_sparsedataset_depr_msg = """\ -SparseDataset is deprecated and will be removed in late 2024. It has been replaced by the public classes CSRDataset and CSCDataset. - -For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead. - -For creation, use `anndata.experimental.sparse_dataset(X)` instead. -""" - - -class SparseDataset(ABC): - """DEPRECATED. - - Use CSRDataset, CSCDataset, and sparse_dataset from anndata.experimental instead. 
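With the concrete classes renamed to `_CSRDataset`/`_CSCDataset`, user-facing `isinstance` checks are meant to go through the new `anndata.abc` ABCs, and `anndata.io.sparse_dataset` remains the constructor. A hedged sketch; it assumes `anndata.io` also exports `write_elem`, that the installed scipy provides `csr_array`, and that the `use_sparse_array_on_read` setting registered later in this diff is available. The file path is illustrative:

```python
import tempfile
from pathlib import Path

import h5py
import scipy.sparse as sp

import anndata as ad
from anndata.io import sparse_dataset, write_elem

path = Path(tempfile.mkdtemp()) / "backed.h5"
with h5py.File(path, "w") as f:
    write_elem(f, "X", sp.random(50, 20, density=0.1, format="csr"))

with h5py.File(path, "r") as f:
    X = sparse_dataset(f["X"])
    assert isinstance(X, ad.abc.CSRDataset)  # public ABC, not the private _CSRDataset
    assert X[0:5, :].shape == (5, 20)        # slices are read off disk on demand

    ad.settings.use_sparse_array_on_read = True  # new opt-in; needs a scipy with csr_array
    assert isinstance(X.to_memory(), sp.csr_array)
```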
- """ - - def __new__(cls, group): - warnings.warn(FutureWarning(_sparsedataset_depr_msg), stacklevel=2) - return sparse_dataset(group) - - @classmethod - def __subclasshook__(cls, C): - warnings.warn(FutureWarning(_sparsedataset_depr_msg), stacklevel=3) - return issubclass(C, (CSRDataset, CSCDataset)) diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index 75e7b4ecf..9e036ba44 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -1,74 +1,23 @@ from __future__ import annotations import warnings -from enum import Enum -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, get_args import numpy as np import pandas as pd -from numpy import ma from scipy import sparse from .._warnings import ImplicitModificationWarning -from ..compat import ( - AwkArray, - CupyArray, - CupySparseMatrix, - DaskArray, - H5Array, - SpArray, - ZappyArray, - ZarrArray, -) from ..utils import ( ensure_df_homogeneous, join_english, raise_value_error_if_multiindex_columns, ) -from .sparse_dataset import BaseCompressedSparseDataset if TYPE_CHECKING: - from collections.abc import Generator from typing import Any -class ArrayDataStructureType(Enum): - # Memory - Array = (np.ndarray, "np.ndarray") - Masked = (ma.MaskedArray, "numpy.ma.core.MaskedArray") - Sparse = (sparse.spmatrix, "scipy.sparse.spmatrix") - SparseArray = (SpArray, "scipy.sparse.sparray") - AwkArray = (AwkArray, "awkward.Array") - # Backed - HDF5Dataset = (H5Array, "h5py.Dataset") - ZarrArray = (ZarrArray, "zarr.Array") - ZappyArray = (ZappyArray, "zappy.base.ZappyArray") - BackedSparseMatrix = ( - BaseCompressedSparseDataset, - "anndata.experimental.[CSC,CSR]Dataset", - ) - # Distributed - DaskArray = (DaskArray, "dask.array.Array") - CupyArray = (CupyArray, "cupy.ndarray") - CupySparseMatrix = (CupySparseMatrix, "cupyx.scipy.sparse.spmatrix") - - @property - def cls(self): - return self.value[0] - - @property - def qualname(self): - return self.value[1] - - @classmethod - def classes(cls) -> tuple[type, ...]: - return tuple(v.cls for v in cls) - - @classmethod - def qualnames(cls) -> Generator[str, None, None]: - yield from (v.qualname for v in cls) - - def coerce_array( value: Any, *, @@ -77,16 +26,27 @@ def coerce_array( allow_array_like: bool = False, ): """Coerce arrays stored in layers/X, and aligned arrays ({obs,var}{m,p}).""" + from ..typing import ArrayDataStructureType + # If value is a scalar and we allow that, return it if allow_array_like and np.isscalar(value): return value # If value is one of the allowed types, return it - if isinstance(value, ArrayDataStructureType.classes()): + array_data_structure_types = get_args(ArrayDataStructureType) + if isinstance(value, array_data_structure_types): if isinstance(value, np.matrix): msg = f"{name} should not be a np.matrix, use np.ndarray instead." warnings.warn(msg, ImplicitModificationWarning) value = value.A return value + elif isinstance(value, sparse.spmatrix): + msg = ( + f"AnnData previously had undefined behavior around matrices of type {type(value)}." + "In 0.12, passing in this type will throw an error. Please convert to a supported type." + "Continue using for this minor version at your own risk." 
+ ) + warnings.warn(msg, FutureWarning) + return value if isinstance(value, pd.DataFrame): if allow_df: raise_value_error_if_multiindex_columns(value, name) @@ -100,7 +60,7 @@ def coerce_array( except (ValueError, TypeError) as _e: e = _e # if value isn’t the right type or convertible, raise an error - msg = f"{name} needs to be of one of {join_english(ArrayDataStructureType.qualnames())}, not {type(value)}." + msg = f"{name} needs to be of one of {join_english(map(str, array_data_structure_types))}, not {type(value)}." if e is not None: msg += " (Failed to convert it to an array, see above for details.)" raise ValueError(msg) from e diff --git a/src/anndata/_io/__init__.py b/src/anndata/_io/__init__.py index 9315d3369..8fbd55df3 100644 --- a/src/anndata/_io/__init__.py +++ b/src/anndata/_io/__init__.py @@ -1,40 +1,17 @@ from __future__ import annotations -from .h5ad import read_h5ad, write_h5ad -from .read import ( - read_csv, - read_excel, - read_hdf, - read_loom, - read_mtx, - read_text, - read_umi_tools, - read_zarr, -) -from .write import write_csvs, write_loom +import warnings +__all__: list[str] = [] -def write_zarr(*args, **kw): - from .zarr import write_zarr - return write_zarr(*args, **kw) +def __getattr__(key: str): + from .. import io - -# We use this in test by attribute access -from . import specs # noqa: F401, E402 - -__all__ = [ - "read_csv", - "read_excel", - "read_h5ad", - "read_hdf", - "read_loom", - "read_mtx", - "read_text", - "read_umi_tools", - "read_zarr", - "write_csvs", - "write_h5ad", - "write_loom", - "write_zarr", -] + attr = getattr(io, key) + warnings.warn( + f"Importing {key} from `anndata._io` is deprecated. " + "Please use anndata.io instead.", + FutureWarning, + ) + return attr diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 36429403d..edf4977cc 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -82,14 +82,14 @@ def write_h5ad( f.attrs.setdefault("encoding-version", "0.1.0") if "X" in as_dense and isinstance( - adata.X, (sparse.spmatrix, BaseCompressedSparseDataset) + adata.X, sparse.spmatrix | BaseCompressedSparseDataset ): write_sparse_as_dense(f, "X", adata.X, dataset_kwargs=dataset_kwargs) elif not (adata.isbacked and Path(adata.filename) == Path(filepath)): # If adata.isbacked, X should already be up to date write_elem(f, "X", adata.X, dataset_kwargs=dataset_kwargs) if "raw/X" in as_dense and isinstance( - adata.raw.X, (sparse.spmatrix, BaseCompressedSparseDataset) + adata.raw.X, sparse.spmatrix | BaseCompressedSparseDataset ): write_sparse_as_dense( f, "raw/X", adata.raw.X, dataset_kwargs=dataset_kwargs diff --git a/src/anndata/_io/read.py b/src/anndata/_io/read.py index a50c4b2ef..f22cff351 100644 --- a/src/anndata/_io/read.py +++ b/src/anndata/_io/read.py @@ -21,14 +21,6 @@ if TYPE_CHECKING: from collections.abc import Generator, Iterable, Iterator, Mapping -try: - from .zarr import read_zarr -except ImportError as _e: - e = _e - - def read_zarr(*_, **__): - raise e - def read_csv( filename: PathLike | Iterator[str], @@ -39,7 +31,7 @@ def read_csv( """\ Read `.csv` file. - Same as :func:`~anndata.read_text` but with default delimiter `','`. + Same as :func:`~anndata.io.read_text` but with default delimiter `','`. Parameters ---------- @@ -208,7 +200,7 @@ def read_loom( .. code:: python - pbmc = anndata.read_loom( + pbmc = anndata.io.read_loom( "pbmc.loom", sparse=True, X_name="lognorm", @@ -330,7 +322,7 @@ def read_text( """\ Read `.txt`, `.tab`, `.data` (text) file. 
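The `_io/__init__.py` shim above keeps the old private import path alive while steering users to `anndata.io`. A small sketch of the behaviour that shim implies:

```python
import warnings

import anndata as ad

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    from anndata._io import read_h5ad  # old private path, resolved via the shim's __getattr__

assert read_h5ad is ad.io.read_h5ad  # same object as the public export
assert any(issubclass(w.category, FutureWarning) for w in caught)
```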
- Same as :func:`~anndata.read_csv` but with default delimiter `None`. + Same as :func:`~anndata.io.read_csv` but with default delimiter `None`. Parameters ---------- @@ -345,7 +337,7 @@ def read_text( dtype Numpy data type. """ - if not isinstance(filename, (PathLike, str, bytes)): + if not isinstance(filename, PathLike | str | bytes): return _read_text(filename, delimiter, first_column_names, dtype) filename = Path(filename) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 8a1b31e6b..a34f627e7 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -19,7 +19,7 @@ from collections.abc import Callable, Generator, Mapping, Sequence from typing import Literal, ParamSpec, TypeVar - from ..._core.sparse_dataset import CSCDataset, CSRDataset + from ..._core.sparse_dataset import _CSCDataset, _CSRDataset from ..._types import ArrayStorageType, StorageType from ...compat import DaskArray from .registry import DaskReader @@ -66,7 +66,7 @@ def make_dask_chunk( block_info: BlockInfo | None = None, *, wrap: Callable[[ArrayStorageType], ArrayStorageType] - | Callable[[H5Group | ZarrGroup], CSRDataset | CSCDataset] = lambda g: g, + | Callable[[H5Group | ZarrGroup], _CSRDataset | _CSCDataset] = lambda g: g, ): if block_info is None: msg = "Block info is required" @@ -105,12 +105,16 @@ def read_sparse_as_dask( if chunks is not None: if len(chunks) != 2: raise ValueError("`chunks` must be a tuple of two integers") - if chunks[minor_dim] != shape[minor_dim]: + if chunks[minor_dim] not in {shape[minor_dim], -1, None}: raise ValueError( "Only the major axis can be chunked. " f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}" ) - stride = chunks[major_dim] + stride = ( + chunks[major_dim] + if chunks[major_dim] not in {None, -1} + else shape[major_dim] + ) shape_minor, shape_major = shape if is_csc else shape[::-1] chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major) @@ -120,7 +124,7 @@ def read_sparse_as_dask( ) memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix make_chunk = partial( - make_dask_chunk, path_or_group, elem_name, wrap=ad.experimental.sparse_dataset + make_dask_chunk, path_or_group, elem_name, wrap=ad.io.sparse_dataset ) da_mtx = da.map_blocks( make_chunk, @@ -142,7 +146,11 @@ def read_h5_array( shape = tuple(elem.shape) dtype = elem.dtype chunks: tuple[int, ...] 
= ( - chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) + tuple( + c if c not in {None, -1} else s for c, s in zip(chunks, shape, strict=True) + ) + if chunks is not None + else (_DEFAULT_STRIDE,) * len(shape) ) chunk_layout = tuple( diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index f6916cba1..19cd1f66f 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -1,6 +1,8 @@ from __future__ import annotations +import warnings from collections.abc import Mapping +from copy import copy from functools import partial from itertools import product from types import MappingProxyType @@ -10,6 +12,7 @@ import h5py import numpy as np import pandas as pd +from packaging.version import Version from scipy import sparse import anndata as ad @@ -17,7 +20,7 @@ from anndata._core import views from anndata._core.index import _normalize_indices from anndata._core.merge import intersect_keys -from anndata._core.sparse_dataset import CSCDataset, CSRDataset, sparse_dataset +from anndata._core.sparse_dataset import _CSCDataset, _CSRDataset, sparse_dataset from anndata._io.utils import H5PY_V3, check_key from anndata._warnings import OldFormatWarning from anndata.compat import ( @@ -37,21 +40,20 @@ _require_group_write_dataframe, ) +from ..._settings import settings from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial if TYPE_CHECKING: + from collections.abc import Callable from os import PathLike from typing import Any, Literal from numpy import typing as npt + from numpy.typing import NDArray - from anndata._types import ( - ArrayStorageType, - GroupStorageType, - InMemoryArrayOrScalarType, - RWAble, - ) + from anndata._types import ArrayStorageType, GroupStorageType from anndata.compat import SpArray + from anndata.typing import AxisStorable, InMemoryArrayOrScalarType from .registry import Reader, Writer @@ -332,7 +334,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> dict[str, RWAble]: +def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> dict[str, AxisStorable]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -341,7 +343,7 @@ def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> dict[str, RWAble def write_mapping( f: GroupStorageType, k: str, - v: dict[str, RWAble], + v: dict[str, AxisStorable], *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -361,7 +363,7 @@ def write_mapping( def write_list( f: GroupStorageType, k: str, - elem: list[RWAble], + elem: list[AxisStorable], *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -548,10 +550,12 @@ def write_vlen_string_array_zarr( ): import numcodecs - # Workaround for https://github.com/zarr-developers/numcodecs/issues/514 - # TODO: Warn to upgrade numcodecs if fixed - if not elem.flags.writeable: - elem = elem.copy() + if Version(numcodecs.__version__) < Version("0.13"): + msg = "Old numcodecs version detected. Please update for improved performance and stability." 
+ warnings.warn(msg) + # Workaround for https://github.com/zarr-developers/numcodecs/issues/514 + if hasattr(elem, "flags") and not elem.flags.writeable: + elem = elem.copy() f.create_dataset( k, @@ -684,14 +688,14 @@ def write_sparse_compressed( _REGISTRY.register_write(store_type, cls, spec)(func) -@_REGISTRY.register_write(H5Group, CSRDataset, IOSpec("", "0.1.0")) -@_REGISTRY.register_write(H5Group, CSCDataset, IOSpec("", "0.1.0")) -@_REGISTRY.register_write(ZarrGroup, CSRDataset, IOSpec("", "0.1.0")) -@_REGISTRY.register_write(ZarrGroup, CSCDataset, IOSpec("", "0.1.0")) +@_REGISTRY.register_write(H5Group, _CSRDataset, IOSpec("", "0.1.0")) +@_REGISTRY.register_write(H5Group, _CSCDataset, IOSpec("", "0.1.0")) +@_REGISTRY.register_write(ZarrGroup, _CSRDataset, IOSpec("", "0.1.0")) +@_REGISTRY.register_write(ZarrGroup, _CSCDataset, IOSpec("", "0.1.0")) def write_sparse_dataset( f: GroupStorageType, k: str, - elem: CSCDataset | CSRDataset, + elem: _CSCDataset | _CSRDataset, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -835,6 +839,9 @@ def write_awkward( from anndata.compat import awkward as ak group = f.require_group(k) + if isinstance(v, views.AwkwardArrayView): + # copy to remove the view attributes + v = copy(v) form, length, container = ak.to_buffers(ak.to_packed(v)) group.attrs["length"] = length group.attrs["form"] = form.to_json() @@ -1060,44 +1067,85 @@ def read_partial_categorical(elem, *, items=None, indices=(slice(None),)): @_REGISTRY.register_write( ZarrGroup, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0") ) -def write_nullable_integer( +@_REGISTRY.register_write( + H5Group, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0") +) +@_REGISTRY.register_write( + ZarrGroup, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0") +) +def write_nullable( f: GroupStorageType, k: str, - v: pd.arrays.IntegerArray | pd.arrays.BooleanArray, + v: pd.arrays.IntegerArray | pd.arrays.BooleanArray | pd.arrays.StringArray, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): + if ( + isinstance(v, pd.arrays.StringArray) + and not settings.allow_write_nullable_strings + ): + msg = ( + "`anndata.settings.allow_write_nullable_strings` is False, " + "because writing of `pd.arrays.StringArray` is new " + "and not supported in anndata < 0.11, still use by many people. " + "Opt-in to writing these arrays by toggling the setting to True." 
+ ) + raise RuntimeError(msg) g = f.require_group(k) - if v._mask is not None: - _writer.write_elem(g, "mask", v._mask, dataset_kwargs=dataset_kwargs) - _writer.write_elem(g, "values", v._data, dataset_kwargs=dataset_kwargs) + values = ( + v.to_numpy(na_value="") + if isinstance(v, pd.arrays.StringArray) + else v.to_numpy(na_value=0, dtype=v.dtype.numpy_dtype) + ) + _writer.write_elem(g, "values", values, dataset_kwargs=dataset_kwargs) + _writer.write_elem(g, "mask", v.isna(), dataset_kwargs=dataset_kwargs) -@_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) -@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) -def read_nullable_integer( - elem: GroupStorageType, *, _reader: Reader +def _read_nullable( + elem: GroupStorageType, + *, + _reader: Reader, + # BaseMaskedArray + array_type: Callable[ + [NDArray[np.number], NDArray[np.bool_]], pd.api.extensions.ExtensionArray + ], ) -> pd.api.extensions.ExtensionArray: - if "mask" in elem: - return pd.arrays.IntegerArray( - _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) - ) - else: - return pd.array(_reader.read_elem(elem["values"])) + return array_type( + _reader.read_elem(elem["values"]), + mask=_reader.read_elem(elem["mask"]), + ) -@_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) -@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) -def read_nullable_boolean( - elem: GroupStorageType, *, _reader: Reader +def _string_array( + values: np.ndarray, mask: np.ndarray ) -> pd.api.extensions.ExtensionArray: - if "mask" in elem: - return pd.arrays.BooleanArray( - _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) - ) - else: - return pd.array(_reader.read_elem(elem["values"])) + """Construct a string array from values and mask.""" + arr = pd.array(values, dtype="string") + arr[mask] = pd.NA + return arr + + +_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))( + read_nullable_integer := partial(_read_nullable, array_type=pd.arrays.IntegerArray) +) +_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))( + read_nullable_integer +) + +_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))( + read_nullable_boolean := partial(_read_nullable, array_type=pd.arrays.BooleanArray) +) +_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))( + read_nullable_boolean +) + +_REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.0"))( + read_nullable_string := partial(_read_nullable, array_type=_string_array) +) +_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.0"))( + read_nullable_string +) ########### @@ -1137,17 +1185,19 @@ def write_hdf5_scalar( f.create_dataset(key, data=np.array(value), **dataset_kwargs) -# fmt: off for numeric_scalar_type in [ - bool, np.bool_, - np.uint8, np.uint16, np.uint32, np.uint64, - int, np.int8, np.int16, np.int32, np.int64, - float, *np.floating.__subclasses__(), + *(bool, np.bool_), + *(np.uint8, np.uint16, np.uint32, np.uint64), + *(int, np.int8, np.int16, np.int32, np.int64), + *(float, *np.floating.__subclasses__()), *np.complexfloating.__subclasses__(), ]: - _REGISTRY.register_write(H5Group, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0"))(write_hdf5_scalar) - _REGISTRY.register_write(ZarrGroup, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0"))(write_scalar) -# fmt: on + _REGISTRY.register_write( + H5Group, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0") + )(write_hdf5_scalar) 
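The `write_nullable` path above refuses to serialize `pd.arrays.StringArray` unless the user opts in. A hedged round-trip sketch; the file path is illustrative, and the attribute-style settings access mirrors what the `_settings.py` hunk later in this diff registers:

```python
import tempfile
from pathlib import Path

import anndata as ad
import numpy as np
import pandas as pd

adata = ad.AnnData(
    X=np.zeros((3, 2), dtype=np.float32),
    obs=pd.DataFrame({"label": pd.array(["a", pd.NA, "c"], dtype="string")}),
)

path = Path(tempfile.mkdtemp()) / "nullable.h5ad"
ad.settings.allow_write_nullable_strings = True  # default False; writing raises RuntimeError otherwise
adata.write_h5ad(path)

roundtrip = ad.io.read_h5ad(path)
assert roundtrip.obs["label"].isna().tolist() == [False, True, False]  # mask survives the round trip
```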
+ _REGISTRY.register_write( + ZarrGroup, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0") + )(write_scalar) _REGISTRY.register_write(ZarrGroup, str, IOSpec("string", "0.2.0"))(write_scalar) _REGISTRY.register_write(ZarrGroup, np.str_, IOSpec("string", "0.2.0"))(write_scalar) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 9a46ba9aa..ca13f8e59 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -17,13 +17,13 @@ from anndata._types import ( GroupStorageType, - InMemoryElem, ReadCallback, StorageType, Write, WriteCallback, _WriteInternal, ) + from anndata.typing import RWAble T = TypeVar("T") W = TypeVar("W", bound=_WriteInternal) @@ -270,7 +270,7 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), - ) -> InMemoryElem: + ) -> RWAble: """Read an element from a store. See exported function for more details.""" iospec = get_spec(elem) @@ -323,7 +323,7 @@ def write_elem( self, store: GroupStorageType, k: str, - elem: InMemoryElem, + elem: RWAble, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), modifiers: frozenset[str] = frozenset(), @@ -363,7 +363,7 @@ def write_elem( ) -def read_elem(elem: StorageType) -> InMemoryElem: +def read_elem(elem: StorageType) -> RWAble: """ Read an element from a store. @@ -395,10 +395,68 @@ def read_elem_as_dask( chunks, optional length `n`, the same `n` as the size of the underlying array. Note that the minor axis dimension must match the shape for sparse. + Defaults to `(1000, adata.shape[1])` for CSR sparse, + `(adata.shape[0], 1000)` for CSC sparse, + and the on-disk chunking otherwise for dense. + Can use `-1` or `None` to indicate use of the size of the corresponding dimension. Returns ------- DaskArray + + Examples + -------- + + Setting up our example: + + >>> from scanpy.datasets import pbmc3k + >>> import tempfile + >>> import anndata as ad + >>> import zarr + + >>> tmp_path = tempfile.gettempdir() + >>> zarr_path = tmp_path + "/adata.zarr" + + >>> adata = pbmc3k() + >>> adata.layers["dense"] = adata.X.toarray() + >>> adata.write_zarr(zarr_path) + + Reading a sparse matrix from a zarr store lazily, with custom chunk size and default: + + >>> g = zarr.open(zarr_path) + >>> adata.X = ad.experimental.read_elem_as_dask(g["X"]) + >>> adata.X + dask.array + >>> adata.X = ad.experimental.read_elem_as_dask( + ... g["X"], chunks=(500, adata.shape[1]) + ... ) + >>> adata.X + dask.array + + Reading a dense matrix from a zarr store lazily: + + >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"]) + >>> adata.layers["dense"] + dask.array + + Making a new anndata object from on-disk, with custom chunks: + + >>> adata = ad.AnnData( + ... obs=ad.io.read_elem(g["obs"]), + ... var=ad.io.read_elem(g["var"]), + ... uns=ad.io.read_elem(g["uns"]), + ... obsm=ad.io.read_elem(g["obsm"]), + ... varm=ad.io.read_elem(g["varm"]), + ... ) + >>> adata.X = ad.experimental.read_elem_as_dask( + ... g["X"], chunks=(500, adata.shape[1]) + ... 
) + >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"]) + + We also support using -1 and None as a chunk size to signify the reading the whole axis: + + >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, -1)) + >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, None)) """ return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks) @@ -406,7 +464,7 @@ def read_elem_as_dask( def write_elem( store: GroupStorageType, k: str, - elem: InMemoryElem, + elem: RWAble, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: diff --git a/src/anndata/_io/utils.py b/src/anndata/_io/utils.py index ee7aa23d0..f8bdb01c7 100644 --- a/src/anndata/_io/utils.py +++ b/src/anndata/_io/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import wraps +from itertools import pairwise from typing import TYPE_CHECKING, cast from warnings import warn @@ -8,16 +9,16 @@ from packaging.version import Version from .._core.sparse_dataset import BaseCompressedSparseDataset -from ..compat import add_note, pairwise +from ..compat import add_note if TYPE_CHECKING: from collections.abc import Callable - from typing import Literal, Union + from typing import Literal from .._types import StorageType from ..compat import H5Group, ZarrGroup - Storage = Union[StorageType, BaseCompressedSparseDataset] + Storage = StorageType | BaseCompressedSparseDataset # For allowing h5py v3 # https://github.com/scverse/anndata/issues/442 diff --git a/src/anndata/_io/zarr.py b/src/anndata/_io/zarr.py index 4cab3ea8d..2564738ad 100644 --- a/src/anndata/_io/zarr.py +++ b/src/anndata/_io/zarr.py @@ -103,6 +103,7 @@ def callback(func, elem_name: str, elem, iospec): @report_read_key_on_error def read_dataset(dataset: zarr.Array): + """Legacy method for reading datasets without encoding_type.""" value = dataset[...] if not hasattr(value, "dtype"): return value diff --git a/src/anndata/_settings.py b/src/anndata/_settings.py index 285415a74..f67633c08 100644 --- a/src/anndata/_settings.py +++ b/src/anndata/_settings.py @@ -14,6 +14,7 @@ from types import GenericAlias from typing import TYPE_CHECKING, Generic, NamedTuple, TypeVar, cast +from anndata.compat import CAN_USE_SPARSE_ARRAY from anndata.compat.exceptiongroups import add_note if TYPE_CHECKING: @@ -396,36 +397,54 @@ def __doc__(self): ################################################################################## -categories_option = "should_remove_unused_categories" -categories_default_value = True -categories_description = ( - "Whether or not to remove unused categories with :class:`~pandas.Categorical`." -) - -uniqueness_option = "should_check_uniqueness" -uniqueness_default_value = True -uniqueness_description = "Whether or not to check uniqueness of the `obs` indices on `__init__` of :class:`~anndata.AnnData`." 
- - -def validate_bool(val) -> None: +def validate_bool(val: Any) -> None: if not isinstance(val, bool): msg = f"{val} not valid boolean" raise TypeError(msg) settings.register( - categories_option, - categories_default_value, - categories_description, - validate_bool, + "remove_unused_categories", + default_value=True, + description="Whether or not to remove unused categories with :class:`~pandas.Categorical`.", + validate=validate_bool, get_from_env=check_and_get_bool, ) settings.register( - uniqueness_option, - uniqueness_default_value, - uniqueness_description, - validate_bool, + "check_uniqueness", + default_value=True, + description=( + "Whether or not to check uniqueness of the `obs` indices on `__init__` of :class:`~anndata.AnnData`." + ), + validate=validate_bool, + get_from_env=check_and_get_bool, +) + +settings.register( + "allow_write_nullable_strings", + default_value=False, + description="Whether or not to allow writing of `pd.arrays.StringArray`.", + validate=validate_bool, + get_from_env=check_and_get_bool, +) + + +def validate_sparse_settings(val: Any) -> None: + validate_bool(val) + if not CAN_USE_SPARSE_ARRAY and cast(bool, val): + msg = ( + "scipy.sparse.cs{r,c}array is not available in current scipy version. " + "Falling back to scipy.sparse.cs{r,c}_matrix for reading." + ) + raise ValueError(msg) + + +settings.register( + "use_sparse_array_on_read", + default_value=False, + description="Whether or not to use :class:`scipy.sparse.sparray` as the default class when reading in data", + validate=validate_sparse_settings, get_from_env=check_and_get_bool, ) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 3549152f5..2d9eb9980 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,89 +4,47 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Protocol, TypeVar, Union +from typing import TYPE_CHECKING, Protocol, TypeVar -import numpy as np -import pandas as pd -from numpy.typing import NDArray -from scipy import sparse - -from anndata._core.anndata import AnnData - -from ._core.sparse_dataset import BaseCompressedSparseDataset from .compat import ( - AwkArray, - CupyArray, - CupySparseMatrix, - DaskArray, H5Array, H5Group, - SpArray, - ZappyArray, ZarrArray, ZarrGroup, ) +from .typing import RWAble if TYPE_CHECKING: from collections.abc import Mapping from typing import Any, TypeAlias - from anndata._io.specs.registry import DaskReader - - from ._io.specs.registry import IOSpec, Reader, Writer + from ._io.specs.registry import DaskReader, IOSpec, Reader, Writer + from .compat import DaskArray __all__ = [ "ArrayStorageType", "GroupStorageType", "StorageType", + "_ReadInternal", + "_ReadDaskInternal", + "_WriteInternal", ] -InMemoryArrayOrScalarType: TypeAlias = Union[ - NDArray, - np.ma.MaskedArray, - sparse.spmatrix, - SpArray, - H5Array, - ZarrArray, - ZappyArray, - BaseCompressedSparseDataset, - DaskArray, - CupyArray, - CupySparseMatrix, - AwkArray, - pd.DataFrame, - np.number, - str, -] -RWAble: TypeAlias = Union[ - InMemoryArrayOrScalarType, dict[str, "RWAble"], list["RWAble"] -] # noqa: TCH010 -InMemoryElem: TypeAlias = Union[ - RWAble, - AnnData, - pd.Categorical, - pd.api.extensions.ExtensionArray, -] - -ArrayStorageType: TypeAlias = Union[ZarrArray, H5Array] -GroupStorageType: TypeAlias = Union[ZarrGroup, H5Group] -StorageType: TypeAlias = Union[ArrayStorageType, GroupStorageType] +ArrayStorageType: TypeAlias = ZarrArray | H5Array +GroupStorageType: TypeAlias = ZarrGroup | H5Group +StorageType: TypeAlias = 
ArrayStorageType | GroupStorageType # NOTE: If you change these, be sure to update `autodoc_type_aliases` in docs/conf.py! -ContravariantInMemoryType = TypeVar( - "ContravariantInMemoryType", bound="InMemoryElem", contravariant=True -) -CovariantInMemoryType = TypeVar( - "CovariantInMemoryType", bound="InMemoryElem", covariant=True -) -InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryElem") +ContravariantRWAble = TypeVar("ContravariantRWAble", bound=RWAble, contravariant=True) +CovariantRWAble = TypeVar("CovariantRWAble", bound=RWAble, covariant=True) +InvariantRWAble = TypeVar("InvariantRWAble", bound=RWAble) SCo = TypeVar("SCo", covariant=True, bound=StorageType) SCon = TypeVar("SCon", contravariant=True, bound=StorageType) -class _ReadInternal(Protocol[SCon, CovariantInMemoryType]): - def __call__(self, elem: SCon, *, _reader: Reader) -> CovariantInMemoryType: ... +class _ReadInternal(Protocol[SCon, CovariantRWAble]): + def __call__(self, elem: SCon, *, _reader: Reader) -> CovariantRWAble: ... class _ReadDaskInternal(Protocol[SCon]): @@ -95,8 +53,8 @@ def __call__( ) -> DaskArray: ... -class Read(Protocol[SCon, CovariantInMemoryType]): - def __call__(self, elem: SCon) -> CovariantInMemoryType: +class Read(Protocol[SCon, CovariantRWAble]): + def __call__(self, elem: SCon) -> CovariantRWAble: """Low-level reading function for an element. Parameters @@ -129,24 +87,24 @@ def __call__( ... -class _WriteInternal(Protocol[ContravariantInMemoryType]): +class _WriteInternal(Protocol[ContravariantRWAble]): def __call__( self, f: StorageType, k: str, - v: ContravariantInMemoryType, + v: ContravariantRWAble, *, _writer: Writer, dataset_kwargs: Mapping[str, Any], ) -> None: ... -class Write(Protocol[ContravariantInMemoryType]): +class Write(Protocol[ContravariantRWAble]): def __call__( self, f: StorageType, k: str, - v: ContravariantInMemoryType, + v: ContravariantRWAble, *, dataset_kwargs: Mapping[str, Any], ) -> None: @@ -166,23 +124,23 @@ def __call__( ... -class ReadCallback(Protocol[SCo, InvariantInMemoryType]): +class ReadCallback(Protocol[SCo, InvariantRWAble]): def __call__( self, /, - read_func: Read[SCo, InvariantInMemoryType], + read_func: Read[SCo, InvariantRWAble], elem_name: str, elem: StorageType, *, iospec: IOSpec, - ) -> InvariantInMemoryType: + ) -> InvariantRWAble: """ Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. Params ------ read_func - :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``. + :func:`anndata.io.read_elem` function to call to read the current element given the ``iospec``. elem_name The key to read in from the group. elem @@ -197,14 +155,14 @@ def __call__( ... -class WriteCallback(Protocol[InvariantInMemoryType]): +class WriteCallback(Protocol[InvariantRWAble]): def __call__( self, /, - write_func: Write[InvariantInMemoryType], + write_func: Write[InvariantRWAble], store: StorageType, elem_name: str, - elem: InvariantInMemoryType, + elem: InvariantRWAble, *, iospec: IOSpec, dataset_kwargs: Mapping[str, Any], @@ -215,7 +173,7 @@ def __call__( Params ------ write_func - :func:`anndata.experimental.write_elem` function to call to read the current element given the ``iospec``. + :func:`anndata.io.write_elem` function to call to read the current element given the ``iospec``. store The store to which `elem` should be written. 
elem_name diff --git a/src/anndata/abc.py b/src/anndata/abc.py new file mode 100644 index 000000000..df8c8a6e8 --- /dev/null +++ b/src/anndata/abc.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing import ClassVar, Literal + + import numpy as np + from scipy.sparse import csc_matrix, csr_matrix + + from .compat import Index, SpArray + + +__all__ = ["CSRDataset", "CSCDataset"] + + +class _AbstractCSDataset(ABC): + """Base for the public API for CSRDataset/CSCDataset.""" + + format: ClassVar[Literal["csr", "csc"]] + """The format of the sparse matrix.""" + + shape: tuple[int, int] + """Shape of the matrix.""" + + dtype: np.dtype + """The :class:`numpy.dtype` of the `data` attribute of the sparse matrix.""" + + backend: Literal["zarr", "hdf5"] + """Which file type is used on-disk.""" + + @abstractmethod + def __getitem__(self, index: Index) -> float | csr_matrix | csc_matrix | SpArray: + """Load a slice or an element from the sparse dataset into memory. + + Parameters + ---------- + index + Index to load. + + Returns + ------- + The desired data read off disk. + """ + + @abstractmethod + def to_memory(self) -> csr_matrix | csc_matrix | SpArray: + """Load the sparse dataset into memory. + + Returns + ------- + The in-memory representation of the sparse dataset. + """ + + +_sparse_dataset_doc = """\ +On disk {format} sparse matrix. + +Analogous to :class:`h5py.Dataset` or :class:`zarr.core.Array`, but for sparse matrices. +""" + + +class CSRDataset(_AbstractCSDataset, ABC): + __doc__ = _sparse_dataset_doc.format(format="CSR") + format = "csr" + + +class CSCDataset(_AbstractCSDataset, ABC): + __doc__ = _sparse_dataset_doc.format(format="CSC") + format = "csc" diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index 026f6cb4c..255ffa548 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -7,9 +7,11 @@ from contextlib import AbstractContextManager from dataclasses import dataclass, field from functools import singledispatch, wraps +from importlib.util import find_spec from inspect import Parameter, signature from pathlib import Path -from typing import TYPE_CHECKING, TypeVar, Union +from types import EllipsisType +from typing import TYPE_CHECKING, TypeVar from warnings import warn import h5py @@ -45,8 +47,18 @@ class Empty: pass -Index1D = Union[slice, int, str, np.int64, np.ndarray] -Index = Union[Index1D, tuple[Index1D, Index1D], scipy.sparse.spmatrix, SpArray] +Index1D = slice | int | str | np.int64 | np.ndarray +IndexRest = Index1D | EllipsisType +Index = ( + IndexRest + | tuple[Index1D, IndexRest] + | tuple[IndexRest, Index1D] + | tuple[Index1D, Index1D, EllipsisType] + | tuple[EllipsisType, Index1D, Index1D] + | tuple[Index1D, EllipsisType, Index1D] + | scipy.sparse.spmatrix + | SpArray +) H5Group = h5py.Group H5Array = h5py.Dataset H5File = h5py.File @@ -74,26 +86,14 @@ def __exit__(self, *_exc_info) -> None: os.chdir(self._old_cwd.pop()) -if sys.version_info >= (3, 10): - from itertools import pairwise -else: - - def pairwise(iterable): - from itertools import tee - - a, b = tee(iterable) - next(b, None) - return zip(a, b) - - ############################# # Optional deps ############################# -try: +if find_spec("zarr") or TYPE_CHECKING: from zarr.core import Array as ZarrArray from zarr.hierarchy import Group as ZarrGroup -except ImportError: +else: class ZarrArray: @staticmethod @@ -106,12 +106,10 @@ def 
__repr__(): return "mock zarr.core.Group" -try: - import awkward - - AwkArray = awkward.Array - -except ImportError: +if find_spec("awkward") or TYPE_CHECKING: + import awkward # noqa: F401 + from awkward import Array as AwkArray +else: class AwkArray: @staticmethod @@ -119,9 +117,9 @@ def __repr__(): return "mock awkward.highlevel.Array" -try: +if find_spec("zappy") or TYPE_CHECKING: from zappy.base import ZappyArray -except ImportError: +else: class ZappyArray: @staticmethod @@ -129,9 +127,12 @@ def __repr__(): return "mock zappy.base.ZappyArray" -try: +if TYPE_CHECKING: + # type checkers are confused and can only see …core.Array + from dask.array.core import Array as DaskArray +elif find_spec("dask"): from dask.array import Array as DaskArray -except ImportError: +else: class DaskArray: @staticmethod @@ -139,27 +140,29 @@ def __repr__(): return "mock dask.array.core.Array" -try: +# https://github.com/scverse/anndata/issues/1749 +def is_cupy_importable() -> bool: + try: + import cupy # noqa: F401 + except ImportError: + return False + return True + + +if is_cupy_importable() or TYPE_CHECKING: from cupy import ndarray as CupyArray - from cupyx.scipy.sparse import ( - csc_matrix as CupyCSCMatrix, - ) - from cupyx.scipy.sparse import ( - csr_matrix as CupyCSRMatrix, - ) - from cupyx.scipy.sparse import ( - spmatrix as CupySparseMatrix, - ) + from cupyx.scipy.sparse import csc_matrix as CupyCSCMatrix + from cupyx.scipy.sparse import csr_matrix as CupyCSRMatrix + from cupyx.scipy.sparse import spmatrix as CupySparseMatrix try: import dask.array as da - - da.register_chunk_type(CupyCSRMatrix) - da.register_chunk_type(CupyCSCMatrix) except ImportError: pass - -except ImportError: + else: + da.register_chunk_type(CupyCSRMatrix) + da.register_chunk_type(CupyCSCMatrix) +else: class CupySparseMatrix: @staticmethod @@ -293,7 +296,7 @@ def _to_fixed_length_strings(value: np.ndarray) -> np.ndarray: return value.astype(new_dtype) -Group_T = TypeVar("Group_T", bound=Union[ZarrGroup, h5py.Group]) +Group_T = TypeVar("Group_T", bound=ZarrGroup | h5py.Group) # TODO: This is a workaround for https://github.com/scverse/anndata/issues/874 @@ -324,7 +327,7 @@ def _clean_uns(adata: AnnData): # noqa: F821 continue name = cats_name.replace("_categories", "") # fix categories with a single category - if isinstance(cats, (str, int)): + if isinstance(cats, str | int): cats = [cats] for ann in [adata.obs, adata.var]: if name not in ann: @@ -349,7 +352,7 @@ def _move_adj_mtx(d): for k in ("distances", "connectivities"): if ( (k in n) - and isinstance(n[k], (scipy.sparse.spmatrix, np.ndarray)) + and isinstance(n[k], scipy.sparse.spmatrix | np.ndarray) and len(n[k].shape) == 2 ): warn( diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index bf21ed6e7..90e83a87e 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -1,38 +1,52 @@ from __future__ import annotations -from anndata._core.sparse_dataset import CSCDataset, CSRDataset, sparse_dataset -from anndata._io.specs import IOSpec, read_elem, read_elem_as_dask, write_elem +from types import MappingProxyType +from typing import TYPE_CHECKING -from .._types import InMemoryElem as _InMemoryElem +from .._io.specs import IOSpec, read_elem_as_dask from .._types import Read, ReadCallback, StorageType, Write, WriteCallback -from .._types import RWAble as _RWAble +from ..utils import module_get_attr_redirect from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk 
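# --- Example: the find_spec() optional-dependency pattern used above -------------
# compat now gates optional imports on importlib.util.find_spec (plus TYPE_CHECKING)
# instead of try/except ImportError, so type checkers always see the real classes.
# Condensed sketch of the pattern with a hypothetical optional package "somepkg":
from importlib.util import find_spec
from typing import TYPE_CHECKING

if find_spec("somepkg") or TYPE_CHECKING:
    from somepkg import Thing  # real class when installed / when type checking
else:

    class Thing:  # runtime placeholder, only used for isinstance checks and reprs
        @staticmethod
        def __repr__():
            return "mock somepkg.Thing"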
from .multi_files import AnnCollection from .pytorch import AnnLoader -# Sphinx can’t find data docstrings when objects are re-exported -InMemoryElem = _InMemoryElem -"""An in-memory element that can be read and written, including an :class:`anndata.AnnData` objects.""" -RWAble = _RWAble -"""A serializable object, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`.""" +if TYPE_CHECKING: + from typing import Any + + +# Map old name in `anndata.experimental` to new name in `anndata` +_DEPRECATED = MappingProxyType( + dict( + (kv if isinstance(kv, tuple) else (kv, kv)) + for kv in ( + ("CSRDataset", "abc.CSRDataset"), + ("CSCDataset", "abc.CSCDataset"), + ("sparse_dataset", "io.sparse_dataset"), + ("read_elem", "io.read_elem"), + ("write_elem", "io.write_elem"), + ("RWAble", "typing.AxisStorable"), + ("InMemoryElem", "typing.RWAble"), + ) + ) +) + + +def __getattr__(attr_name: str) -> Any: + return module_get_attr_redirect( + attr_name, deprecated_mapping=_DEPRECATED, old_module_path="experimental" + ) + __all__ = [ "AnnCollection", "AnnLoader", - "read_elem", - "write_elem", "read_elem_as_dask", "read_dispatched", "write_dispatched", "IOSpec", "concat_on_disk", - "sparse_dataset", - "CSRDataset", - "CSCDataset", - "InMemoryElem", "Read", - "RWAble", "Write", "ReadCallback", "WriteCallback", diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index 20b47baeb..53f94c453 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -9,17 +9,17 @@ from anndata._types import ( GroupStorageType, - InMemoryElem, ReadCallback, StorageType, WriteCallback, ) + from anndata.typing import RWAble def read_dispatched( elem: StorageType, callback: ReadCallback, -) -> InMemoryElem: +) -> RWAble: """ Read elem, calling the callback at each sub-element. 
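# --- Example: effect of the module-level __getattr__ redirect above --------------
# Old names stay importable from anndata.experimental but emit a FutureWarning and
# forward to their new homes.  Sketch of what a caller now observes; the identity
# check assumes the redirect returns the very object exported from anndata.io.
import warnings

import anndata as ad
import anndata.experimental as exp

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    read_elem = exp.read_elem            # deprecated access path
assert read_elem is ad.io.read_elem      # forwarded to the new public location
assert issubclass(caught[-1].category, FutureWarning)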
@@ -45,7 +45,7 @@ def read_dispatched( def write_dispatched( store: GroupStorageType, key: str, - elem: InMemoryElem, + elem: RWAble, callback: WriteCallback, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py index 9690420ec..21a678e2c 100644 --- a/src/anndata/experimental/merge.py +++ b/src/anndata/experimental/merge.py @@ -352,7 +352,7 @@ def _write_concat_sequence( ) write_elem(output_group, output_path, df) elif all( - isinstance(a, (pd.DataFrame, BaseCompressedSparseDataset, H5Array, ZarrArray)) + isinstance(a, pd.DataFrame | BaseCompressedSparseDataset | H5Array | ZarrArray) for a in arrays ): _write_concat_arrays( diff --git a/src/anndata/experimental/multi_files/_anncollection.py b/src/anndata/experimental/multi_files/_anncollection.py index 31b27c879..c5f427f6d 100644 --- a/src/anndata/experimental/multi_files/_anncollection.py +++ b/src/anndata/experimental/multi_files/_anncollection.py @@ -3,7 +3,7 @@ import warnings from collections.abc import Callable, Mapping from functools import reduce -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -584,7 +584,7 @@ def attrs_keys(self): DictCallable = dict[str, Callable] -ConvertType = Union[Callable, dict[str, Union[Callable, DictCallable]]] +ConvertType = Callable | dict[str, Callable | DictCallable] class AnnCollection(_ConcatViewMixin, _IterateViewMixin): diff --git a/src/anndata/experimental/pytorch/_annloader.py b/src/anndata/experimental/pytorch/_annloader.py index 8cc883921..cebbe1b5d 100644 --- a/src/anndata/experimental/pytorch/_annloader.py +++ b/src/anndata/experimental/pytorch/_annloader.py @@ -2,6 +2,7 @@ from copy import copy from functools import partial +from importlib.util import find_spec from math import ceil from typing import TYPE_CHECKING @@ -14,10 +15,10 @@ if TYPE_CHECKING: from collections.abc import Sequence -try: +if find_spec("torch") or TYPE_CHECKING: import torch from torch.utils.data import BatchSampler, DataLoader, Sampler -except ImportError: +else: Sampler, BatchSampler, DataLoader = object, object, object diff --git a/src/anndata/io.py b/src/anndata/io.py new file mode 100644 index 000000000..5f9ba323c --- /dev/null +++ b/src/anndata/io.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from importlib.util import find_spec +from typing import TYPE_CHECKING + +from ._core.sparse_dataset import sparse_dataset +from ._io.h5ad import read_h5ad, write_h5ad +from ._io.read import ( + read_csv, + read_excel, + read_hdf, + read_loom, + read_mtx, + read_text, + read_umi_tools, +) +from ._io.specs import read_elem, write_elem +from ._io.write import write_csvs, write_loom + +if find_spec("zarr") or TYPE_CHECKING: + from ._io.zarr import read_zarr, write_zarr +else: # pragma: no cover + + def read_zarr(*args, **kw): + raise ImportError("zarr is not installed") + + def write_zarr(*args, **kw): + raise ImportError("zarr is not installed") + + +__all__ = [ + "read_csv", + "read_excel", + "read_h5ad", + "read_hdf", + "read_loom", + "read_mtx", + "read_text", + "read_umi_tools", + "read_zarr", + "write_csvs", + "write_h5ad", + "write_loom", + "write_zarr", + "write_elem", + "read_elem", + "sparse_dataset", +] diff --git a/src/anndata/logging.py b/src/anndata/logging.py index a2a890c51..1a0f2e11d 100644 --- a/src/anndata/logging.py +++ b/src/anndata/logging.py @@ -31,7 +31,7 @@ def get_memory_usage(): meminfo = process.get_memory_info() mem = meminfo[0] 
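# --- Example: the new public anndata.io facade introduced above ------------------
# src/anndata/io.py collects the supported read/write entry points in one place.
# Quick usage sketch; file names are placeholders.
import h5py
import numpy as np

import anndata as ad

adata = ad.AnnData(X=np.ones((4, 2)))
adata.write_h5ad("example.h5ad")
same = ad.io.read_h5ad("example.h5ad")        # instead of anndata._io.read_h5ad
assert (same.X == adata.X).all()

# Low-level element I/O is exposed here as well:
with h5py.File("elems.h5", "w") as f:
    ad.io.write_elem(f, "layer", adata.X)
with h5py.File("elems.h5", "r") as f:
    arr = ad.io.read_elem(f["layer"])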
/ 2**30 # output in GB mem_diff = mem - global _previous_memory_usage + global _previous_memory_usage # noqa: PLW0603 if _previous_memory_usage is not None: mem_diff = mem - _previous_memory_usage _previous_memory_usage = mem diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 185808b8d..6ed637ed8 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -4,9 +4,11 @@ import random import re import warnings +from collections import Counter from collections.abc import Mapping from contextlib import contextmanager from functools import partial, singledispatch, wraps +from importlib.util import find_spec from string import ascii_letters from typing import TYPE_CHECKING @@ -35,8 +37,20 @@ from anndata.utils import asarray if TYPE_CHECKING: - from collections.abc import Collection - from typing import Literal + from collections.abc import Callable, Collection, Iterable + from typing import Literal, TypeGuard, TypeVar + + DT = TypeVar("DT") + + +try: + from pandas.core.arrays.integer import IntegerDtype +except ImportError: + IntegerDtype = ( + *(pd.Int8Dtype, pd.Int16Dtype, pd.Int32Dtype, pd.Int64Dtype), + *(pd.UInt8Dtype, pd.UInt16Dtype, pd.UInt32Dtype, pd.UInt64Dtype), + ) + # Give this to gen_adata when dask array support is expected. GEN_ADATA_DASK_ARGS = dict( @@ -45,30 +59,43 @@ np.ndarray, pd.DataFrame, DaskArray, + *((sparse.csr_array,) if CAN_USE_SPARSE_ARRAY else ()), ), varm_types=( sparse.csr_matrix, np.ndarray, pd.DataFrame, DaskArray, + *((sparse.csr_array,) if CAN_USE_SPARSE_ARRAY else ()), ), layers_types=( sparse.csr_matrix, np.ndarray, pd.DataFrame, DaskArray, + *((sparse.csr_array,) if CAN_USE_SPARSE_ARRAY else ()), ), ) -if CAN_USE_SPARSE_ARRAY: - GEN_ADATA_DASK_ARGS["obsm_types"] = GEN_ADATA_DASK_ARGS["obsm_types"] + ( - sparse.csr_array, - ) - GEN_ADATA_DASK_ARGS["varm_types"] = GEN_ADATA_DASK_ARGS["varm_types"] + ( - sparse.csr_array, - ) - GEN_ADATA_DASK_ARGS["layers_types"] = GEN_ADATA_DASK_ARGS["layers_types"] + ( - sparse.csr_array, - ) + + +DEFAULT_KEY_TYPES = ( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + *((sparse.csr_array,) if CAN_USE_SPARSE_ARRAY else ()), +) + + +DEFAULT_COL_TYPES = ( + pd.CategoricalDtype(ordered=False), + pd.CategoricalDtype(ordered=True), + np.int64, + np.float64, + np.uint8, + np.bool_, + pd.BooleanDtype, + pd.Int32Dtype, +) def gen_vstr_recarray(m, n, dtype=None): @@ -82,30 +109,82 @@ def gen_vstr_recarray(m, n, dtype=None): ) -def gen_typed_df(n, index=None): - # TODO: Think about allowing index to be passed for n - letters = np.fromiter(iter(ascii_letters), "U1") - if n > len(letters): - letters = letters[: n // 2] # Make sure categories are repeated - return pd.DataFrame( - { - "cat": pd.Categorical(np.random.choice(letters, n)), - "cat_ordered": pd.Categorical(np.random.choice(letters, n), ordered=True), - "int64": np.random.randint(-50, 50, n), - "float64": np.random.random(n), - "uint8": np.random.randint(255, size=n, dtype="uint8"), - "bool": np.random.randint(0, 2, size=n, dtype=bool), - "nullable-bool": pd.arrays.BooleanArray( +def issubdtype( + a: np.dtype | pd.api.extensions.ExtensionDtype | type, + b: type[DT] | tuple[type[DT], ...], +) -> TypeGuard[DT]: + if isinstance(b, tuple): + return any(issubdtype(a, t) for t in b) + if isinstance(a, type) and issubclass(a, pd.api.extensions.ExtensionDtype): + return issubclass(a, b) + if isinstance(a, pd.api.extensions.ExtensionDtype): + return isinstance(a, b) + try: + return np.issubdtype(a, b) + except TypeError: # pragma: no 
cover + pytest.fail(f"issubdtype can’t handle everything yet: {a} {b}") + + +def gen_random_column( + n: int, dtype: np.dtype | pd.api.extensions.ExtensionDtype +) -> tuple[str, np.ndarray | pd.api.extensions.ExtensionArray]: + if issubdtype(dtype, pd.CategoricalDtype): + # TODO: Think about allowing index to be passed for n + letters = np.fromiter(iter(ascii_letters), "U1") + if n > len(letters): + letters = letters[: n // 2] # Make sure categories are repeated + key = "cat" if dtype.ordered else "cat_unordered" + return key, pd.Categorical(np.random.choice(letters, n), dtype=dtype) + if issubdtype(dtype, pd.BooleanDtype): + return ( + "nullable-bool", + pd.arrays.BooleanArray( np.random.randint(0, 2, size=n, dtype=bool), mask=np.random.randint(0, 2, size=n, dtype=bool), ), - "nullable-int": pd.arrays.IntegerArray( + ) + if issubdtype(dtype, IntegerDtype): + return ( + "nullable-int", + pd.arrays.IntegerArray( np.random.randint(0, 1000, size=n, dtype=np.int32), mask=np.random.randint(0, 2, size=n, dtype=bool), ), - }, - index=index, - ) + ) + if issubdtype(dtype, pd.StringDtype): + letters = np.fromiter(iter(ascii_letters), "U1") + array = np.array(np.random.choice(letters, n), dtype=dtype) + array[np.random.randint(0, 2, size=n, dtype=bool)] = pd.NA + return "string", array + # if issubdtype(dtype, pd.DatetimeTZDtype): + # return "datetime", pd.to_datetime(np.random.randint(0, 1000, size=n)) + if issubdtype(dtype, np.bool_): + return "bool", np.random.randint(0, 2, size=n, dtype=dtype) + + if not issubdtype(dtype, np.number): # pragma: no cover + pytest.fail(f"Unexpected dtype: {dtype}") + + n_bits = 8 * (dtype().itemsize if isinstance(dtype, type) else dtype.itemsize) + + if issubdtype(dtype, np.unsignedinteger): + return f"uint{n_bits}", np.random.randint(0, 255, n, dtype=dtype) + if issubdtype(dtype, np.signedinteger): + return f"int{n_bits}", np.random.randint(-50, 50, n, dtype=dtype) + if issubdtype(dtype, np.floating): + return f"float{n_bits}", np.random.random(n).astype(dtype) + + pytest.fail(f"Unexpected numeric dtype: {dtype}") # pragma: no cover + + +def gen_typed_df( + n: int, + index: pd.Index[str] | None = None, + dtypes: Collection[np.dtype | pd.api.extensions.ExtensionDtype] = DEFAULT_COL_TYPES, +): + columns = [gen_random_column(n, dtype) for dtype in dtypes] + col_names = [n for n, _ in columns] + assert len(col_names) == len(set(col_names)), "Duplicate column names generated!" + return pd.DataFrame(dict(columns), index=index) def _gen_awkward_inner(shape, rng, dtype): @@ -182,20 +261,11 @@ def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame: return df -default_key_types = ( - sparse.csr_matrix, - np.ndarray, - pd.DataFrame, -) -if CAN_USE_SPARSE_ARRAY: - default_key_types = default_key_types + (sparse.csr_array,) - - def maybe_add_sparse_array( mapping: Mapping, types: Collection[type], format: Literal["csr", "csc"], - random_state: int, + random_state: np.random.Generator, shape: tuple[int, int], ): if CAN_USE_SPARSE_ARRAY: @@ -209,15 +279,20 @@ def maybe_add_sparse_array( # TODO: Use hypothesis for this? 
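# --- Example: generating test dataframes with the refactored helpers above -------
# gen_random_column/gen_typed_df now take the dtypes to generate explicitly
# (DEFAULT_COL_TYPES is the default set).  Illustrative sketch:
import numpy as np
import pandas as pd

from anndata.tests.helpers import gen_random_column, gen_typed_df

name, col = gen_random_column(10, pd.CategoricalDtype(ordered=True))  # -> ("cat", ...)
df = gen_typed_df(
    20,
    index=pd.Index([f"cell{i}" for i in range(20)]),
    dtypes=(pd.CategoricalDtype(ordered=False), np.int64, pd.BooleanDtype),
)
assert len(df) == 20 and len(df.columns) == 3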
def gen_adata( shape: tuple[int, int], - X_type=sparse.csr_matrix, - X_dtype=np.float32, - # obs_dtypes, - # var_dtypes, - obsm_types: Collection[type] = default_key_types + (AwkArray,), - varm_types: Collection[type] = default_key_types + (AwkArray,), - layers_types: Collection[type] = default_key_types, - random_state=None, - sparse_fmt: str = "csr", + X_type: Callable[[np.ndarray], object] = sparse.csr_matrix, + *, + X_dtype: np.dtype = np.float32, + obs_dtypes: Collection[ + np.dtype | pd.api.extensions.ExtensionDtype + ] = DEFAULT_COL_TYPES, + var_dtypes: Collection[ + np.dtype | pd.api.extensions.ExtensionDtype + ] = DEFAULT_COL_TYPES, + obsm_types: Collection[type] = DEFAULT_KEY_TYPES + (AwkArray,), + varm_types: Collection[type] = DEFAULT_KEY_TYPES + (AwkArray,), + layers_types: Collection[type] = DEFAULT_KEY_TYPES, + random_state: np.random.Generator | None = None, + sparse_fmt: Literal["csr", "csc"] = "csr", ) -> AnnData: """\ Helper function to generate a random AnnData for testing purposes. @@ -253,8 +328,8 @@ def gen_adata( M, N = shape obs_names = pd.Index(f"cell{i}" for i in range(shape[0])) var_names = pd.Index(f"gene{i}" for i in range(shape[1])) - obs = gen_typed_df(M, obs_names) - var = gen_typed_df(N, var_names) + obs = gen_typed_df(M, obs_names, dtypes=obs_dtypes) + var = gen_typed_df(N, var_names, dtypes=var_dtypes) # For #147 obs.rename(columns=dict(cat="obs_cat"), inplace=True) var.rename(columns=dict(cat="var_cat"), inplace=True) @@ -267,7 +342,7 @@ def gen_adata( obsm = dict( array=np.random.random((M, 50)), sparse=sparse.random(M, 100, format=sparse_fmt, random_state=random_state), - df=gen_typed_df(M, obs_names), + df=gen_typed_df(M, obs_names, dtypes=obs_dtypes), awk_2d_ragged=gen_awkward((M, None)), da=da.random.random((M, 50)), ) @@ -282,7 +357,7 @@ def gen_adata( varm = dict( array=np.random.random((N, 50)), sparse=sparse.random(N, 100, format=sparse_fmt, random_state=random_state), - df=gen_typed_df(N, var_names), + df=gen_typed_df(N, var_names, dtypes=var_dtypes), awk_2d_ragged=gen_awkward((N, None)), da=da.random.random((N, 50)), ) @@ -964,46 +1039,49 @@ def shares_memory_sparse(x, y): ), ] -try: - import zarr +if find_spec("zarr") or TYPE_CHECKING: + from zarr import DirectoryStore +else: - class AccessTrackingStore(zarr.DirectoryStore): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._access_count = {} - self._accessed_keys = {} + class DirectoryStore: + def __init__(self, *_args, **_kwargs) -> None: + cls_name = type(self).__name__ + msg = f"zarr must be imported to create a {cls_name} instance." 
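# --- Example: gen_adata with the new obs_dtypes/var_dtypes arguments above -------
# Tests can now control which column types land in .obs/.var, e.g. adding
# pd.StringDtype as test_concat_size_0_axis does later in this diff.  Sketch
# (gen_adata's defaults also need dask and awkward installed):
import pandas as pd

from anndata.tests.helpers import DEFAULT_COL_TYPES, gen_adata

adata = gen_adata(
    (20, 10),
    obs_dtypes=(*DEFAULT_COL_TYPES, pd.StringDtype),
    var_dtypes=DEFAULT_COL_TYPES,
)
assert "string" in adata.obs.columns   # column produced for pd.StringDtype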
+ raise ImportError(msg) - def __getitem__(self, key): - for tracked in self._access_count: - if tracked in key: - self._access_count[tracked] += 1 - self._accessed_keys[tracked] += [key] - return super().__getitem__(key) - def get_access_count(self, key): - return self._access_count[key] +class AccessTrackingStore(DirectoryStore): + _access_count: Counter[str] + _accessed_keys: dict[str, list[str]] - def get_accessed_keys(self, key): - return self._accessed_keys[key] + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._access_count = Counter() + self._accessed_keys = {} - def initialize_key_trackers(self, keys_to_track): - for k in keys_to_track: - self._access_count[k] = 0 - self._accessed_keys[k] = [] + def __getitem__(self, key: str) -> object: + for tracked in self._access_count: + if tracked in key: + self._access_count[tracked] += 1 + self._accessed_keys[tracked] += [key] + return super().__getitem__(key) - def reset_key_trackers(self): - self.initialize_key_trackers(self._access_count.keys()) + def get_access_count(self, key: str) -> int: + return self._access_count[key] -except ImportError: + def get_accessed_keys(self, key: str) -> list[str]: + return self._accessed_keys[key] - class AccessTrackingStore: - def __init__(self, *_args, **_kwargs) -> None: - raise ImportError( - "zarr must be imported to create an `AccessTrackingStore` instance." - ) + def initialize_key_trackers(self, keys_to_track: Iterable[str]) -> None: + for k in keys_to_track: + self._access_count[k] = 0 + self._accessed_keys[k] = [] + + def reset_key_trackers(self) -> None: + self.initialize_key_trackers(self._access_count.keys()) -def get_multiindex_columns_df(shape): +def get_multiindex_columns_df(shape: tuple[int, int]) -> pd.DataFrame: return pd.DataFrame( np.random.rand(shape[0], shape[1]), columns=pd.MultiIndex.from_tuples( diff --git a/src/anndata/typing.py b/src/anndata/typing.py new file mode 100644 index 000000000..d13927bad --- /dev/null +++ b/src/anndata/typing.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np +import pandas as pd +from numpy import ma +from scipy import sparse + +from . 
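# --- Example: counting store reads with the AccessTrackingStore above ------------
# The rewritten helper wraps zarr.DirectoryStore and counts hits on tracked keys,
# as used by test_indptr_cache below.  Standalone sketch; the store path is a
# placeholder.
import zarr
from scipy import sparse

import anndata as ad
from anndata.tests.helpers import AccessTrackingStore

path = "tracked.zarr"
ad.io.write_elem(zarr.open_group(path, "a"), "X", sparse.random(10, 10, format="csr"))

store = AccessTrackingStore(path)
store.initialize_key_trackers(["X/indptr"])
g = zarr.open_group(store, "a")
_ = ad.io.sparse_dataset(g["X"])[0:5]   # triggers reads against the tracked store
print(store.get_access_count("X/indptr"), store.get_accessed_keys("X/indptr"))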
import abc +from ._core.anndata import AnnData +from .compat import ( + AwkArray, + CupyArray, + CupySparseMatrix, + DaskArray, + H5Array, + SpArray, + ZappyArray, + ZarrArray, +) +from .compat import Index as _Index + +if TYPE_CHECKING: + from typing import TypeAlias + + +__all__ = ["Index", "RWAble", "AxisStorable"] + + +Index = _Index +"""1D or 2D index an :class:`~anndata.AnnData` object can be sliced with.""" + + +ArrayDataStructureType: TypeAlias = ( + np.ndarray + | ma.MaskedArray + | sparse.csr_matrix + | sparse.csc_matrix + | SpArray + | AwkArray + | H5Array + | ZarrArray + | ZappyArray + | abc.CSRDataset + | abc.CSCDataset + | DaskArray + | CupyArray + | CupySparseMatrix +) + + +InMemoryArrayOrScalarType: TypeAlias = ( + pd.DataFrame | np.number | str | ArrayDataStructureType +) + + +AxisStorable: TypeAlias = ( + InMemoryArrayOrScalarType | dict[str, "AxisStorable"] | list["AxisStorable"] +) +"""A serializable object, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`.""" + +RWAble: TypeAlias = ( + AxisStorable | AnnData | pd.Categorical | pd.api.extensions.ExtensionArray +) +"""A superset of :type:`anndata.typing.AxisStorable` (i.e., including :class:`anndata.AnnData`) which is everything can be read/written by :func:`anndata.io.read_elem` and :func:`anndata.io.write_elem`.""" diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 3ff844054..60dffa87f 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -10,6 +10,8 @@ import pandas as pd from scipy import sparse +import anndata + from ._core.sparse_dataset import BaseCompressedSparseDataset from .compat import CupyArray, CupySparseMatrix, DaskArray, SpArray from .logging import get_logger @@ -409,3 +411,27 @@ def raise_value_error_if_multiindex_columns(df: pd.DataFrame, attr: str): f"Please use a single-level index for {attr}." ) raise ValueError(msg) + + +def module_get_attr_redirect( + attr_name: str, + deprecated_mapping: Mapping[str, str], + old_module_path: str | None = None, +) -> Any: + full_old_module_path = ( + f"anndata{'.' + old_module_path if old_module_path is not None else ''}" + ) + if new_path := deprecated_mapping.get(attr_name): + msg = ( + f"Importing {attr_name} from `{full_old_module_path}` is deprecated. " + f"Import anndata.{new_path} instead." + ) + warnings.warn(msg, FutureWarning) + # hacky import_object_by_name, but we test all these + mod = anndata + while "." 
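# --- Example: annotating user code with the new anndata.typing aliases above -----
# AxisStorable covers anything storable in uns/obsm/varm/...; RWAble additionally
# includes AnnData and is what read_elem/write_elem handle.  The functions below
# are made-up illustrations, not part of the diff.
from __future__ import annotations

import anndata as ad
from anndata.typing import AxisStorable, RWAble


def stash_in_uns(adata: ad.AnnData, key: str, value: AxisStorable) -> None:
    adata.uns[key] = value


def describe(elem: RWAble) -> str:
    return f"serializable element of type {type(elem).__name__}"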
in new_path: + mod_name, new_path = new_path.split(".", 1) + mod = getattr(mod, mod_name) + return getattr(mod, new_path) + msg = f"module {full_old_module_path} has no attribute {attr_name!r}" + raise AttributeError(msg) diff --git a/src/testing/anndata/_pytest.py b/src/testing/anndata/_pytest.py index d29ac334e..5b0fd60e0 100644 --- a/src/testing/anndata/_pytest.py +++ b/src/testing/anndata/_pytest.py @@ -32,16 +32,24 @@ def pytest_configure(config: pytest.Config) -> None: @pytest.fixture(autouse=True) -def _suppress_env_for_doctests(request: pytest.FixtureRequest) -> None: +def _anndata_test_env(request: pytest.FixtureRequest) -> None: + import anndata + if isinstance(request.node, pytest.DoctestItem): request.getfixturevalue("_doctest_env") + anndata.settings.reset(anndata.settings._registered_options.keys()) + @pytest.fixture def _doctest_env( request: pytest.FixtureRequest, cache: pytest.Cache, tmp_path: Path ) -> Generator[None, None, None]: - from scanpy import settings + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message=r"Importing read_.* from `anndata` is deprecated" + ) + from scanpy import settings from anndata.compat import chdir from anndata.utils import import_name diff --git a/tests/conftest.py b/tests/conftest.py index 65eff92b1..9054812f5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,21 +1,80 @@ from __future__ import annotations from functools import partial +from typing import TYPE_CHECKING +import dask import joblib import pytest -from dask.base import normalize_seq, normalize_token, tokenize +from dask.base import normalize_token, tokenize +from packaging.version import Version + +if Version(dask.__version__) < Version("2024.8.0"): + from dask.base import normalize_seq +else: + from dask.tokenize import normalize_seq from scipy import sparse import anndata as ad from anndata.tests.helpers import subset_func # noqa: F401 +if TYPE_CHECKING: + from types import EllipsisType + @pytest.fixture def backing_h5ad(tmp_path): return tmp_path / "test.h5ad" +@pytest.fixture( + params=[ + pytest.param((..., (slice(None), slice(None))), id="ellipsis"), + pytest.param(((...,), (slice(None), slice(None))), id="ellipsis_tuple"), + pytest.param( + ((..., slice(0, 10)), (slice(None), slice(0, 10))), id="obs-ellipsis" + ), + pytest.param( + ((slice(0, 10), ...), (slice(0, 10), slice(None))), id="var-ellipsis" + ), + pytest.param( + ((slice(0, 10), slice(0, 10), ...), (slice(0, 10), slice(0, 10))), + id="obs-var-ellipsis", + ), + pytest.param( + ((..., slice(0, 10), slice(0, 10)), (slice(0, 10), slice(0, 10))), + id="ellipsis-obs-var", + ), + pytest.param( + ((slice(0, 10), ..., slice(0, 10)), (slice(0, 10), slice(0, 10))), + id="obs-ellipsis-var", + ), + ] +) +def ellipsis_index_with_equivalent( + request, +) -> tuple[tuple[EllipsisType | slice, ...] | EllipsisType, tuple[slice, slice]]: + return request.param + + +@pytest.fixture +def ellipsis_index( + ellipsis_index_with_equivalent: tuple[ + tuple[EllipsisType | slice, ...] | EllipsisType, tuple[slice, slice] + ], +) -> tuple[EllipsisType | slice, ...] | EllipsisType: + return ellipsis_index_with_equivalent[0] + + +@pytest.fixture +def equivalent_ellipsis_index( + ellipsis_index_with_equivalent: tuple[ + tuple[EllipsisType | slice, ...] 
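# --- Example: the ellipsis indices paired with their slice equivalents above -----
# The fixtures mirror the EllipsisType additions to compat.Index; a minimal sketch
# of the intended equivalence, assuming AnnData indexing accepts the same ellipsis
# forms as the backed datasets tested below.
import numpy as np

import anndata as ad

adata = ad.AnnData(X=np.arange(50.0).reshape(10, 5))
assert (adata[..., 0:3].X == adata[:, 0:3].X).all()   # ellipsis fills the obs axis
assert (adata[0:4, ...].X == adata[0:4, :].X).all()   # ellipsis fills the var axis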
| EllipsisType, tuple[slice, slice] + ], +) -> tuple[slice, slice]: + return ellipsis_index_with_equivalent[1] + + ##################### # Dask tokenization # ##################### diff --git a/tests/test_awkward.py b/tests/test_awkward.py index 0e2254afe..4b3f81d8e 100644 --- a/tests/test_awkward.py +++ b/tests/test_awkward.py @@ -15,6 +15,7 @@ ImplicitModificationWarning, read_h5ad, ) +from anndata.compat import AwkArray from anndata.compat import awkward as ak from anndata.tests.helpers import assert_equal, gen_adata, gen_awkward from anndata.utils import axis_len @@ -249,6 +250,22 @@ def test_awkward_io(tmp_path, array): assert_equal(adata.uns["awk"], adata2.uns["awk"], exact=True) +def test_awkward_io_view(tmp_path): + """Check that views are converted to actual arrays on save, i.e. the _view_args and __list__ parameters are removed""" + adata = gen_adata((3, 3), varm_types=(), obsm_types=(AwkArray,), layers_types=()) + + v = adata[1:] + adata_path = tmp_path / "adata.h5ad" + v.write_h5ad(adata_path) + + adata2 = read_h5ad(adata_path) + # parameters are not fully removed, but set to None + assert ak.parameters(adata2.obsm["awk_2d_ragged"]) == { + "__list__": None, + "_view_args": None, + } + + # @pytest.mark.parametrize("join", ["outer", "inner"]) @pytest.mark.parametrize( ("arrays", "join", "expected"), diff --git a/tests/test_backed_dense.py b/tests/test_backed_dense.py index 796cad511..3fc19d88a 100644 --- a/tests/test_backed_dense.py +++ b/tests/test_backed_dense.py @@ -10,7 +10,7 @@ import zarr from anndata import AnnData -from anndata._io.specs import write_elem +from anndata.io import write_elem from anndata.tests.helpers import assert_equal if TYPE_CHECKING: diff --git a/tests/test_backed_hdf5.py b/tests/test_backed_hdf5.py index 6cb449e28..19b4ca44d 100644 --- a/tests/test_backed_hdf5.py +++ b/tests/test_backed_hdf5.py @@ -200,8 +200,8 @@ def test_backed_raw_subset(tmp_path, array_type, subset_func, subset_func2): var_idx = subset_func2(mem_adata.var_names) if ( array_type is asarray - and isinstance(obs_idx, (list, np.ndarray, sparse.spmatrix, SpArray)) - and isinstance(var_idx, (list, np.ndarray, sparse.spmatrix, SpArray)) + and isinstance(obs_idx, list | np.ndarray | sparse.spmatrix | SpArray) + and isinstance(var_idx, list | np.ndarray | sparse.spmatrix | SpArray) ): pytest.xfail( "Fancy indexing does not work with multiple arrays on a h5py.Dataset" diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 36a725bf2..2778c76bb 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -14,12 +14,13 @@ from anndata._core.anndata import AnnData from anndata._core.sparse_dataset import sparse_dataset from anndata.compat import CAN_USE_SPARSE_ARRAY, SpArray -from anndata.experimental import read_dispatched, write_elem +from anndata.experimental import read_dispatched from anndata.tests.helpers import AccessTrackingStore, assert_equal, subset_func if TYPE_CHECKING: from collections.abc import Callable, Generator, Sequence from pathlib import Path + from types import EllipsisType from _pytest.mark import ParameterSet from numpy.typing import ArrayLike, NDArray @@ -127,6 +128,17 @@ def test_backed_indexing( assert_equal(csr_mem[:, var_idx].X, dense_disk[:, var_idx].X) +def test_backed_ellipsis_indexing( + ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], + ellipsis_index: tuple[EllipsisType | slice, ...] 
| EllipsisType, + equivalent_ellipsis_index: tuple[slice, slice], +): + csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata + + assert_equal(csr_mem.X[equivalent_ellipsis_index], csr_disk.X[ellipsis_index]) + assert_equal(csr_mem.X[equivalent_ellipsis_index], csc_disk.X[ellipsis_index]) + + def make_randomized_mask(size: int) -> np.ndarray: randomized_mask = np.zeros(size, dtype=bool) inds = np.random.choice(size, 20, replace=False) @@ -258,7 +270,7 @@ def test_dataset_append_memory( f = zarr.open_group(path, "a") else: f = h5py.File(path, "a") - ad._io.specs.write_elem(f, "mtx", a) + ad.io.write_elem(f, "mtx", a) diskmtx = sparse_dataset(f["mtx"]) diskmtx.append(b) @@ -269,6 +281,44 @@ def test_dataset_append_memory( assert_equal(fromdisk, frommem) +@pytest.mark.parametrize("sparse_format", [sparse.csr_matrix, sparse.csc_matrix]) +@pytest.mark.parametrize( + ("subset_func", "subset_func2"), + product( + [ + ad.tests.helpers.array_subset, + ad.tests.helpers.slice_subset, + ad.tests.helpers.array_int_subset, + ad.tests.helpers.array_bool_subset, + ], + repeat=2, + ), +) +def test_read_array( + tmp_path: Path, + sparse_format: Callable[[ArrayLike], sparse.spmatrix], + diskfmt: Literal["h5ad", "zarr"], + subset_func, + subset_func2, +): + path = tmp_path / f"test.{diskfmt.replace('ad', '')}" + a = sparse_format(sparse.random(100, 100)) + obs_idx = subset_func(np.arange(100)) + var_idx = subset_func2(np.arange(100)) + if diskfmt == "zarr": + f = zarr.open_group(path, "a") + else: + f = h5py.File(path, "a") + ad.io.write_elem(f, "mtx", a) + diskmtx = sparse_dataset(f["mtx"]) + if not CAN_USE_SPARSE_ARRAY: + pytest.skip("scipy.sparse.cs{r,c}array not available") + ad.settings.use_sparse_array_on_read = True + assert issubclass(type(diskmtx[obs_idx, var_idx]), SpArray) + ad.settings.use_sparse_array_on_read = False + assert issubclass(type(diskmtx[obs_idx, var_idx]), sparse.spmatrix) + + @pytest.mark.parametrize( ("sparse_format", "append_method"), [ @@ -290,8 +340,8 @@ def test_dataset_append_disk( f = zarr.open_group(path, "a") else: f = h5py.File(path, "a") - ad._io.specs.write_elem(f, "a", a) - ad._io.specs.write_elem(f, "b", b) + ad.io.write_elem(f, "a", a) + ad.io.write_elem(f, "b", b) a_disk = sparse_dataset(f["a"]) b_disk = sparse_dataset(f["b"]) @@ -311,7 +361,7 @@ def test_indptr_cache( path = tmp_path / "test.zarr" a = sparse_format(sparse.random(10, 10)) f = zarr.open_group(path, "a") - ad._io.specs.write_elem(f, "X", a) + ad.io.write_elem(f, "X", a) store = AccessTrackingStore(path) store.initialize_key_trackers(["X/indptr"]) f = zarr.open_group(store, "a") @@ -396,7 +446,7 @@ def test_data_access( path = tmp_path / "test.zarr" a = sparse_format(np.eye(10, 10)) f = zarr.open_group(path, "a") - ad._io.specs.write_elem(f, "X", a) + ad.io.write_elem(f, "X", a) data = f["X/data"][...] 
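# --- Example: the use_sparse_array_on_read setting exercised above ---------------
# When enabled, backed sparse datasets return scipy sparse arrays (csr_array/
# csc_array) instead of sparse matrices.  Condensed sketch; requires a scipy
# version that provides sparse arrays (CAN_USE_SPARSE_ARRAY).
import h5py
from scipy import sparse

import anndata as ad
from anndata.compat import SpArray

with h5py.File("setting.h5", "a") as f:
    ad.io.write_elem(f, "mtx", sparse.random(100, 100, format="csr"))
    backed = ad.io.sparse_dataset(f["mtx"])

    ad.settings.use_sparse_array_on_read = True
    assert isinstance(backed[0:10, :], SpArray)
    ad.settings.use_sparse_array_on_read = False
    assert isinstance(backed[0:10, :], sparse.spmatrix)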
del f["X/data"] # chunk one at a time to count properly @@ -442,8 +492,8 @@ def test_wrong_shape( else: f = h5py.File(path, "a") - ad._io.specs.write_elem(f, "a", a_mem) - ad._io.specs.write_elem(f, "b", b_mem) + ad.io.write_elem(f, "a", a_mem) + ad.io.write_elem(f, "b", b_mem) a_disk = sparse_dataset(f["a"]) b_disk = sparse_dataset(f["b"]) @@ -460,7 +510,7 @@ def test_reset_group(tmp_path: Path): else: f = h5py.File(path, "a") - ad._io.specs.write_elem(f, "base", base) + ad.io.write_elem(f, "base", base) disk_mtx = sparse_dataset(f["base"]) with pytest.raises(AttributeError): disk_mtx.group = f @@ -475,7 +525,7 @@ def test_wrong_formats(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]): else: f = h5py.File(path, "a") - ad._io.specs.write_elem(f, "base", base) + ad.io.write_elem(f, "base", base) disk_mtx = sparse_dataset(f["base"]) pre_checks = disk_mtx.to_memory() @@ -504,7 +554,7 @@ def test_anndata_sparse_compat(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]) else: f = h5py.File(path, "a") - ad._io.specs.write_elem(f, "/", base) + ad.io.write_elem(f, "/", base) adata = ad.AnnData(sparse_dataset(f["/"])) assert_equal(adata.X, base) @@ -545,11 +595,11 @@ def test_append_overflow_check(group_fn, sparse_class, tmpdir): shape=(1, 2), ) - write_elem(group, "mtx", orig_mtx) + ad.io.write_elem(group, "mtx", orig_mtx) backed = sparse_dataset(group["mtx"]) # Checking for correct caching behaviour - backed.indptr + backed._indptr with pytest.raises( OverflowError, diff --git a/tests/test_base.py b/tests/test_base.py index 277e8c8ab..e1401ed74 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -30,7 +30,7 @@ def test_creation(): AnnData(np.array([[1, 2], [3, 4]])) AnnData(np.array([[1, 2], [3, 4]]), {}, {}) AnnData(ma.array([[1, 2], [3, 4]]), uns=dict(mask=[0, 1, 1, 0])) - AnnData(sp.eye(2)) + AnnData(sp.eye(2, format="csr")) if CAN_USE_SPARSE_ARRAY: AnnData(sp.eye_array(2)) X = np.array([[1, 2, 3], [4, 5, 6]]) @@ -95,7 +95,7 @@ def test_creation_error(src, src_arg, dim_msg, dim, dim_arg, msg: str | None): def test_invalid_X(): with pytest.raises( ValueError, - match=r"X needs to be of one of np\.ndarray.*not \.", + match=r"X needs to be of one of .*not \.", ): AnnData("string is not a valid X") @@ -126,7 +126,7 @@ def test_error_create_from_multiindex_df(attr): def test_create_from_sparse_df(): - s = sp.random(20, 30, density=0.2) + s = sp.random(20, 30, density=0.2, format="csr") obs_names = [f"obs{i}" for i in range(20)] var_names = [f"var{i}" for i in range(30)] df = pd.DataFrame.sparse.from_spmatrix(s, index=obs_names, columns=var_names) @@ -277,7 +277,7 @@ def test_setting_dim_index(dim): mapping_attr = f"{dim}m" orig = gen_adata((5, 5)) - orig.raw = orig + orig.raw = orig.copy() curr = orig.copy() view = orig[:, :] new_idx = pd.Index(list("abcde"), name="letters") @@ -453,7 +453,7 @@ def test_slicing_remove_unused_categories(): def test_slicing_dont_remove_unused_categories(): - with settings.override(should_remove_unused_categories=False): + with settings.override(remove_unused_categories=False): adata = AnnData( np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), dict(k=["a", "a", "b", "b"]) ) @@ -462,7 +462,7 @@ def test_slicing_dont_remove_unused_categories(): def test_no_uniqueness_check_gives_repeat_indices(): - with settings.override(should_check_uniqueness=False): + with settings.override(check_uniqueness=False): obs_names = ["0", "0", "1", "1"] with warnings.catch_warnings(): warnings.simplefilter("error") @@ -590,7 +590,7 @@ def test_convenience(): adata = adata_sparse.copy() 
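# --- Example: the renamed settings used in the tests above -----------------------
# `should_remove_unused_categories` / `should_check_uniqueness` become
# `remove_unused_categories` / `check_uniqueness`.  Sketch of the override form:
import numpy as np
import pandas as pd

import anndata as ad

adata = ad.AnnData(
    X=np.ones((4, 2)),
    obs=pd.DataFrame({"group": pd.Categorical(["a", "a", "b", "b"])}),
)
with ad.settings.override(remove_unused_categories=False):
    subset = adata[adata.obs["group"] == "a"]
    # the unused category "b" is kept while the override is active
    assert list(subset.obs["group"].cat.categories) == ["a", "b"]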
adata.layers["x2"] = adata.X * 2 adata.var["anno2"] = ["p1", "p2", "p3"] - adata.raw = adata + adata.raw = adata.copy() adata.X = adata.X / 2 adata_dense = adata.copy() adata_dense.X = adata_dense.X.toarray() diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py index 03126284b..e034debd2 100644 --- a/tests/test_concatenate.py +++ b/tests/test_concatenate.py @@ -26,6 +26,7 @@ BASE_MATRIX_PARAMS, CUPY_MATRIX_PARAMS, DASK_MATRIX_PARAMS, + DEFAULT_COL_TYPES, GEN_ADATA_DASK_ARGS, as_dense_dask_array, assert_equal, @@ -494,19 +495,19 @@ def get_obs_els(adata): adata1.obsm = { k: v for k, v in adata1.obsm.items() - if not isinstance(v, (pd.DataFrame, AwkArray)) + if not isinstance(v, pd.DataFrame | AwkArray) } adata2 = gen_adata((10, 5)) adata2.obsm = { k: v[:, : v.shape[1] // 2] for k, v in adata2.obsm.items() - if not isinstance(v, (pd.DataFrame, AwkArray)) + if not isinstance(v, pd.DataFrame | AwkArray) } adata3 = gen_adata((7, 3)) adata3.obsm = { k: v[:, : v.shape[1] // 3] for k, v in adata3.obsm.items() - if not isinstance(v, (pd.DataFrame, AwkArray)) + if not isinstance(v, pd.DataFrame | AwkArray) } # remove AwkArrays from adata.var, as outer joins are not yet implemented for them for tmp_ad in [adata1, adata2, adata3]: @@ -696,9 +697,9 @@ def test_concatenate_with_raw(): layers=dict(Xs=X4), ) - adata1.raw = adata1 - adata2.raw = adata2 - adata3.raw = adata3 + adata1.raw = adata1.copy() + adata2.raw = adata2.copy() + adata3.raw = adata3.copy() adata_all = AnnData.concatenate(adata1, adata2, adata3) assert isinstance(adata_all.raw, Raw) @@ -712,7 +713,7 @@ def test_concatenate_with_raw(): assert_equal(adata_all.raw.to_adata().obs, adata_all.obs) assert np.array_equal(np.nan_to_num(adata_all.raw.X), np.nan_to_num(adata_all.X)) - adata3.raw = adata4 + adata3.raw = adata4.copy() adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer") assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == set("abcdz") @@ -1375,8 +1376,9 @@ def test_concat_size_0_axis(axis_name, join_type, merge_strategy, shape): """Regression test for https://github.com/scverse/anndata/issues/526""" axis, axis_name = merge._resolve_axis(axis_name) alt_axis = 1 - axis - a = gen_adata((5, 7)) - b = gen_adata(shape) + col_dtypes = (*DEFAULT_COL_TYPES, pd.StringDtype) + a = gen_adata((5, 7), obs_dtypes=col_dtypes, var_dtypes=col_dtypes) + b = gen_adata(shape, obs_dtypes=col_dtypes, var_dtypes=col_dtypes) expected_size = expected_shape(a, b, axis=axis, join=join_type) @@ -1633,3 +1635,23 @@ def test_concat_on_var_outer_join(array_type): # This shouldn't error # TODO: specify expected result while accounting for null value _ = concat([a, b], join="outer", axis=1) + + +def test_concat_dask_sparse_matches_memory(join_type, merge_strategy): + import dask.array as da + + X = sparse.random(50, 20, density=0.5, format="csr") + X_dask = da.from_array(X, chunks=(5, 20)) + var_names_1 = [f"gene_{i}" for i in range(20)] + var_names_2 = [f"gene_{i}{'_foo' if (i%2) else ''}" for i in range(20, 40)] + + ad1 = AnnData(X=X, var=pd.DataFrame(index=var_names_1)) + ad2 = AnnData(X=X, var=pd.DataFrame(index=var_names_2)) + + ad1_dask = AnnData(X=X_dask, var=pd.DataFrame(index=var_names_1)) + ad2_dask = AnnData(X=X_dask, var=pd.DataFrame(index=var_names_2)) + + res_in_memory = concat([ad1, ad2], join=join_type, merge=merge_strategy) + res_dask = concat([ad1_dask, ad2_dask], join=join_type, merge=merge_strategy) + + assert_equal(res_in_memory, res_dask) diff --git a/tests/test_concatenate_disk.py 
b/tests/test_concatenate_disk.py index c2c9eb95a..a05d9a308 100644 --- a/tests/test_concatenate_disk.py +++ b/tests/test_concatenate_disk.py @@ -10,8 +10,8 @@ from anndata import AnnData, concat from anndata._core.merge import _resolve_axis -from anndata.experimental import read_elem, write_elem from anndata.experimental.merge import as_group, concat_on_disk +from anndata.io import read_elem, write_elem from anndata.tests.helpers import ( assert_equal, gen_adata, diff --git a/tests/test_dask.py b/tests/test_dask.py index 59616171b..21db60e7e 100644 --- a/tests/test_dask.py +++ b/tests/test_dask.py @@ -12,7 +12,6 @@ import anndata as ad from anndata._core.anndata import AnnData from anndata.compat import CupyArray, DaskArray -from anndata.experimental import read_elem, write_elem from anndata.experimental.merge import as_group from anndata.tests.helpers import ( GEN_ADATA_DASK_ARGS, @@ -123,10 +122,10 @@ def test_dask_distributed_write(adata, tmp_path, diskfmt): orig = adata if diskfmt == "h5ad": with pytest.raises(ValueError, match=r"Cannot write dask arrays to hdf5"): - write_elem(g, "", orig) + ad.io.write_elem(g, "", orig) return - write_elem(g, "", orig) - curr = read_elem(g) + ad.io.write_elem(g, "", orig) + curr = ad.io.read_elem(g) with pytest.raises(AssertionError): assert_equal(curr.obsm["a"], curr.obsm["b"]) diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index 7d4bdf6e2..b5cc44c29 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -11,12 +11,10 @@ import h5py import numpy as np import pytest -import zarr from scipy import sparse -import anndata as ad -from anndata import AnnData -from anndata.experimental import CSRDataset, write_elem +import anndata.experimental +from anndata import AnnData, read from anndata.tests.helpers import assert_equal @@ -27,7 +25,7 @@ def adata(): obs=dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), var=dict(var_names=["a", "b", "c"]), ) - adata.raw = adata + adata.raw = adata.copy() adata.layers["x2"] = adata.X * 2 adata.var["anno2"] = ["p1", "p2", "p3"] adata.X = adata.X / 2 @@ -103,8 +101,8 @@ def test_dtype_warning(): def test_deprecated_write_attribute(tmp_path): pth = tmp_path / "file.h5" A = np.random.randn(20, 10) - from anndata._io.specs import read_elem from anndata._io.utils import read_attribute, write_attribute + from anndata.io import read_elem with h5py.File(pth, "w") as f: with pytest.warns(DeprecationWarning, match=r"write_elem"): @@ -124,39 +122,26 @@ def test_deprecated_read(tmp_path): memory.write_h5ad(tmp_path / "file.h5ad") with pytest.warns(FutureWarning, match=r"`anndata.read` is deprecated"): - from_disk = ad.read(tmp_path / "file.h5ad") + from_disk = read(tmp_path / "file.h5ad") assert_equal(memory, from_disk) -def test_deprecated_sparse_dataset_values(): - import zarr - - from anndata.experimental import sparse_dataset, write_elem - - mtx = sparse.random(50, 50, format="csr") - g = zarr.group() - - write_elem(g, "mtx", mtx) - mtx_backed = sparse_dataset(g["mtx"]) - - with pytest.warns(FutureWarning, match=r"Please use .to_memory()"): - mtx_backed.value - - with pytest.warns(FutureWarning, match=r"Please use .format"): - mtx_backed.format_str - - -def test_deprecated_sparse_dataset(): - from anndata._core.sparse_dataset import SparseDataset - - mem_X = sparse.random(50, 50, format="csr") - g = zarr.group() - write_elem(g, "X", mem_X) - with pytest.warns(FutureWarning, match=r"SparseDataset is deprecated"): - X = SparseDataset(g["X"]) - - assert isinstance(X, CSRDataset) - - with 
pytest.warns(FutureWarning, match=r"SparseDataset is deprecated"): - assert isinstance(X, SparseDataset) +@pytest.mark.parametrize( + ("old_name", "new_name", "module"), + ( + (old_name, new_name, module) + for module in [anndata, anndata.experimental] + for (old_name, new_name) in module._DEPRECATED.items() + ), +) +def test_warn_on_import_with_redirect(old_name: str, new_name: str, module): + with pytest.warns(FutureWarning, match=rf"Importing {old_name}.*is deprecated"): + getattr(module, old_name) + + +def test_warn_on_deprecated__io_module(): + with pytest.warns( + FutureWarning, match=r"Importing read_h5ad from `anndata._io` is deprecated" + ): + from anndata._io import read_h5ad # noqa diff --git a/tests/test_get_vector.py b/tests/test_get_vector.py index baf0fd7d6..87af324b0 100644 --- a/tests/test_get_vector.py +++ b/tests/test_get_vector.py @@ -36,7 +36,7 @@ def test_amgibuous_keys(): ), ) - adata.raw = adata + adata.raw = adata.copy() for k in var_keys: # These are mostly to check that the test is working diff --git a/tests/test_gpu.py b/tests/test_gpu.py index c6f49a696..8f3c4c250 100644 --- a/tests/test_gpu.py +++ b/tests/test_gpu.py @@ -24,7 +24,7 @@ def test_adata_raw_gpu(): adata = AnnData( X=cupy_sparse.random(500, 50, density=0.01, format="csr", dtype=cp.float32) ) - adata.raw = adata + adata.raw = adata.copy() assert isinstance(adata.raw.X, sparse.csr_matrix) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index adf5a7dce..4645fedd5 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -18,6 +18,7 @@ BASE_MATRIX_PARAMS, CUPY_MATRIX_PARAMS, DASK_MATRIX_PARAMS, + DEFAULT_COL_TYPES, as_cupy, as_cupy_sparse_dask_array, as_dense_cupy_dask_array, @@ -26,6 +27,8 @@ assert_equal, gen_adata, gen_awkward, + gen_random_column, + issubdtype, report_name, ) from anndata.utils import axis_len @@ -89,6 +92,18 @@ def test_gen_awkward(shape, datashape): assert arr.type == arr_type +@pytest.mark.parametrize("dtype", [*DEFAULT_COL_TYPES, pd.StringDtype]) +def test_gen_random_column(dtype): + _, col = gen_random_column(10, dtype) + assert len(col) == 10 + # CategoricalDtypes are the only one specified as instances currently + if isinstance(dtype, pd.CategoricalDtype): + assert issubdtype(col.dtype, pd.CategoricalDtype) + assert col.dtype.ordered == dtype.ordered + else: + assert issubdtype(col.dtype, dtype) + + # Does this work for every warning? 
def test_report_name(): def raise_error(): diff --git a/tests/test_io_conversion.py b/tests/test_io_conversion.py index 33f50b6d9..217a9cc16 100644 --- a/tests/test_io_conversion.py +++ b/tests/test_io_conversion.py @@ -39,7 +39,7 @@ def test_sparse_to_dense_disk(tmp_path, mtx_format, to_convert): dense_from_mem_pth = tmp_path / "dense_mem.h5ad" dense_from_disk_pth = tmp_path / "dense_disk.h5ad" mem = gen_adata((50, 50), mtx_format) - mem.raw = mem + mem.raw = mem.copy() mem.write_h5ad(mem_pth) disk = ad.read_h5ad(mem_pth, backed="r") @@ -66,7 +66,7 @@ def test_sparse_to_dense_disk(tmp_path, mtx_format, to_convert): def test_sparse_to_dense_inplace(tmp_path, spmtx_format): pth = tmp_path / "adata.h5ad" orig = gen_adata((50, 50), spmtx_format) - orig.raw = orig + orig.raw = orig.copy() orig.write(pth) backed = ad.read_h5ad(pth, backed="r+") backed.write(as_dense=("X", "raw/X")) @@ -97,7 +97,7 @@ def test_sparse_to_dense_errors(tmp_path): def test_dense_to_sparse_memory(tmp_path, spmtx_format, to_convert): dense_path = tmp_path / "dense.h5ad" orig = gen_adata((50, 50), np.array) - orig.raw = orig + orig.raw = orig.copy() orig.write_h5ad(dense_path) assert not isinstance(orig.X, sparse.spmatrix) assert not isinstance(orig.raw.X, sparse.spmatrix) diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py index 4dba9b6aa..c0d60c18d 100644 --- a/tests/test_io_dispatched.py +++ b/tests/test_io_dispatched.py @@ -8,12 +8,7 @@ import anndata as ad from anndata.compat import SpArray -from anndata.experimental import ( - read_dispatched, - read_elem, - write_dispatched, - write_elem, -) +from anndata.experimental import read_dispatched, write_dispatched from anndata.tests.helpers import assert_equal, gen_adata @@ -29,7 +24,7 @@ def read_only_axis_dfs(func, elem_name: str, elem, iospec): adata = gen_adata((1000, 100)) z = zarr.group() - write_elem(z, "/", adata) + ad.io.write_elem(z, "/", adata) expected = ad.AnnData(obs=adata.obs, var=adata.var) actual = read_dispatched(z, read_only_axis_dfs) @@ -48,7 +43,7 @@ def read_as_dask_array(func, elem_name: str, elem, iospec): "awkward-array", }: # Preventing recursing inside of these types - return read_elem(elem) + return ad.io.read_elem(elem) elif iospec.encoding_type == "array": return da.from_zarr(elem) else: @@ -56,7 +51,7 @@ def read_as_dask_array(func, elem_name: str, elem, iospec): adata = gen_adata((1000, 100)) z = zarr.group() - write_elem(z, "/", adata) + ad.io.write_elem(z, "/", adata) dask_adata = read_dispatched(z, read_as_dask_array) @@ -64,7 +59,7 @@ def read_as_dask_array(func, elem_name: str, elem, iospec): assert isinstance(dask_adata.obsm["array"], da.Array) assert isinstance(dask_adata.uns["nested"]["nested_further"]["array"], da.Array) - expected = read_elem(z) + expected = ad.io.read_elem(z) actual = dask_adata.to_memory(copy=False) assert_equal(expected, actual) @@ -73,10 +68,10 @@ def read_as_dask_array(func, elem_name: str, elem, iospec): def test_read_dispatched_null_case(): adata = gen_adata((100, 100)) z = zarr.group() - write_elem(z, "/", adata) + ad.io.write_elem(z, "/", adata) - expected = read_elem(z) - actual = read_dispatched(z, lambda _, __, x, **___: read_elem(x)) + expected = ad.io.read_elem(z) + actual = read_dispatched(z, lambda _, __, x, **___: ad.io.read_elem(x)) assert_equal(expected, actual) @@ -101,7 +96,7 @@ def set_copy(d, **kwargs): # TODO: Should the passed path be absolute? 
path = "/" + store.path + "/" + k if hasattr(elem, "shape") and not isinstance( - elem, (sparse.spmatrix, SpArray, ad.AnnData) + elem, sparse.spmatrix | SpArray | ad.AnnData ): if re.match(r"^/((X)|(layers)).*", path): chunks = (M, N) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index a4d614c6f..d93b9937c 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -20,12 +20,11 @@ _REGISTRY, IOSpec, get_spec, - read_elem, - read_elem_as_dask, - write_elem, ) from anndata._io.specs.registry import IORegistryError -from anndata.compat import ZarrGroup, _read_attr +from anndata.compat import CAN_USE_SPARSE_ARRAY, SpArray, ZarrGroup, _read_attr +from anndata.experimental import read_elem_as_dask +from anndata.io import read_elem, write_elem from anndata.tests.helpers import ( as_cupy, as_cupy_sparse_dask_array, @@ -35,6 +34,7 @@ ) if TYPE_CHECKING: + from pathlib import Path from typing import Literal, TypeVar from anndata.compat import H5Group @@ -138,15 +138,34 @@ def create_sparse_store( id="sp_mat_csc", ), pytest.param(pd.DataFrame({"a": [1, 2, 3]}), "dataframe", id="pd_df"), - pytest.param(pd.Categorical(list("aabccedd")), "categorical", id="pd_cat"), + pytest.param( + pd.Categorical(list("aabccedd") + [pd.NA]), + "categorical", + id="pd_cat_np_str", + ), pytest.param( pd.Categorical(list("aabccedd"), ordered=True), "categorical", - id="pd_cat_ord", + id="pd_cat_np_str_ord", + ), + pytest.param( + pd.array(list("aabccedd") + [pd.NA], dtype="string").astype("category"), + "categorical", + id="pd_cat_pd_str", ), pytest.param( pd.Categorical([1, 2, 1, 3], ordered=True), "categorical", id="pd_cat_num" ), + pytest.param( + pd.array(["hello", "world"], dtype="string"), + "nullable-string-array", + id="pd_arr_str", + ), + pytest.param( + pd.array(["hello", "world", pd.NA], dtype="string"), + "nullable-string-array", + id="pd_arr_str_mask", + ), pytest.param( pd.arrays.IntegerArray( np.ones(5, dtype=int), mask=np.array([True, False, True, False, True]) @@ -187,6 +206,8 @@ def create_sparse_store( ], ) def test_io_spec(store, value, encoding_type): + ad.settings.allow_write_nullable_strings = True + key = f"key_for_{encoding_type}" write_elem(store, key, value, dataset_kwargs={}) @@ -275,6 +296,8 @@ def test_read_lazy_2d_dask(sparse_format, store): (2, (200, 400)), (1, None), (2, None), + (2, (400, -1)), + (2, (400, None)), ], ) def test_read_lazy_subsets_nd_dask(store, n_dims, chunks): @@ -307,28 +330,36 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): @pytest.mark.parametrize( - ("arr_type", "chunks"), + ("arr_type", "chunks", "expected_chunksize"), [ - ("dense", (100, 100)), - ("csc", (SIZE, 10)), - ("csr", (10, SIZE * 2)), - ("csc", None), - ("csr", None), + ("dense", (100, 100), (100, 100)), + ("csc", (SIZE, 10), (SIZE, 10)), + ("csr", (10, SIZE * 2), (10, SIZE * 2)), + ("csc", None, (SIZE, 1000)), + ("csr", None, (1000, SIZE * 2)), + ("csr", (10, -1), (10, SIZE * 2)), + ("csc", (-1, 10), (SIZE, 10)), + ("csr", (10, None), (10, SIZE * 2)), + ("csc", (None, 10), (SIZE, 10)), + ("csc", (None, None), (SIZE, SIZE * 2)), + ("csr", (None, None), (SIZE, SIZE * 2)), + ("csr", (-1, -1), (SIZE, SIZE * 2)), + ("csc", (-1, -1), (SIZE, SIZE * 2)), ], ) -def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks): +def test_read_lazy_2d_chunk_kwargs( + store: H5Group | ZarrGroup, + arr_type: Literal["csr", "csc", "dense"], + chunks: None | tuple[int | None, int | None], + expected_chunksize: tuple[int, int], +): if arr_type == "dense": arr_store = 
create_dense_store(store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) else: arr_store = create_sparse_store(arr_type, store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) - if chunks is not None: - assert X_dask_from_disk.chunksize == chunks - else: - minor_index = int(arr_type == "csr") - # assert that sparse chunks are set correctly by default - assert X_dask_from_disk.chunksize[minor_index] == SIZE * (1 + minor_index) + assert X_dask_from_disk.chunksize == expected_chunksize X_from_disk = read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk) @@ -365,7 +396,7 @@ def test_write_indptr_dtype_override(store, sparse_format): def test_io_spec_raw(store): adata = gen_adata((3, 2)) - adata.raw = adata + adata.raw = adata.copy() write_elem(store, "adata", adata) @@ -422,6 +453,11 @@ def test_write_io_error(store, obj): assert re.search(full_pattern, msg) +def test_write_nullable_string_error(store): + with pytest.raises(RuntimeError, match=r"allow_write_nullable_strings.*is False"): + write_elem(store, "/el", pd.array([""], dtype="string")) + + def test_categorical_order_type(store): # https://github.com/scverse/anndata/issues/853 cat = pd.Categorical([0, 1], ordered=True) @@ -556,3 +592,22 @@ def test_io_pd_cow(store, copy_on_write): write_elem(store, "adata", orig) from_store = read_elem(store["adata"]) assert_equal(orig, from_store) + + +def test_read_sparse_array( + tmp_path: Path, + sparse_format: Literal["csr", "csc"], + diskfmt: Literal["h5ad", "zarr"], +): + path = tmp_path / f"test.{diskfmt.replace('ad', '')}" + a = sparse.random(100, 100, format=sparse_format) + if diskfmt == "zarr": + f = zarr.open_group(path, "a") + else: + f = h5py.File(path, "a") + ad.io.write_elem(f, "mtx", a) + if not CAN_USE_SPARSE_ARRAY: + pytest.skip("scipy.sparse.cs{r,c}array not available") + ad.settings.use_sparse_array_on_read = True + mtx = ad.io.read_elem(f["mtx"]) + assert issubclass(type(mtx), SpArray) diff --git a/tests/test_io_partial.py b/tests/test_io_partial.py index d43aaca1c..76ff05627 100644 --- a/tests/test_io_partial.py +++ b/tests/test_io_partial.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from importlib.util import find_spec from pathlib import Path @@ -10,9 +11,8 @@ from scipy.sparse import csr_matrix from anndata import AnnData -from anndata._io import write_h5ad, write_zarr -from anndata._io.specs import read_elem from anndata._io.specs.registry import read_elem_partial +from anndata.io import read_elem, write_h5ad, write_zarr X = np.array([[1.0, 0.0, 3.0], [4.0, 0.0, 6.0], [0.0, 8.0, 0.0]], dtype="float32") X_check = np.array([[4.0, 0.0], [0.0, 8.0]], dtype="float32") @@ -44,7 +44,11 @@ def test_read_partial_X(tmp_path, typ, accessor): @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") @pytest.mark.parametrize("accessor", ["h5ad", "zarr"]) def test_read_partial_adata(tmp_path, accessor): - import scanpy as sc + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message=r"Importing read_.* from `anndata` is deprecated" + ) + import scanpy as sc adata = sc.datasets.pbmc68k_reduced() diff --git a/tests/test_io_utils.py b/tests/test_io_utils.py index f50249bad..25c66f46d 100644 --- a/tests/test_io_utils.py +++ b/tests/test_io_utils.py @@ -12,7 +12,6 @@ from anndata._io.specs.registry import IORegistryError from anndata._io.utils import report_read_key_on_error from anndata.compat import _clean_uns -from anndata.experimental import read_elem, write_elem if 
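# --- Example: chunk normalisation in read_elem_as_dask shown above ---------------
# -1 / None in `chunks` now mean "the whole axis", matching the expected_chunksize
# column in the parametrisation.  Sketch on a CSR element; the store path is a
# placeholder.
import zarr
from scipy import sparse

import anndata as ad
from anndata.experimental import read_elem_as_dask

g = zarr.open_group("lazy.zarr", "a")
ad.io.write_elem(g, "X", sparse.random(1000, 200, format="csr"))

X = read_elem_as_dask(g["X"], chunks=(100, -1))   # -1 expands to the full minor axis
assert X.chunksize == (100, 200)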
TYPE_CHECKING: from collections.abc import Callable @@ -108,10 +107,10 @@ class Foo: pattern = r"(?s)^((?!Error raised while writing key '/?a').)*$" with pytest.raises(IORegistryError, match=pattern): - write_elem(group, "/", {"a": {"b": Foo()}}) + ad.io.write_elem(group, "/", {"a": {"b": Foo()}}) - write_elem(group, "/", {"a": {"b": [1, 2, 3]}}) + ad.io.write_elem(group, "/", {"a": {"b": [1, 2, 3]}}) group["a/b"].attrs["encoding-type"] = "not a real encoding type" with pytest.raises(IORegistryError, match=pattern): - read_elem(group) + ad.io.read_elem(group) diff --git a/tests/test_io_warnings.py b/tests/test_io_warnings.py index 29ab2d963..0e3848168 100644 --- a/tests/test_io_warnings.py +++ b/tests/test_io_warnings.py @@ -15,7 +15,11 @@ @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") def test_old_format_warning_thrown(): - import scanpy as sc + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message=r"Importing read_.* from `anndata` is deprecated" + ) + import scanpy as sc pth = Path(sc.datasets.__file__).parent / "10x_pbmc68k_reduced.h5ad" # TODO: with Pytest 8, all this can be a diff --git a/tests/test_obsmvarm.py b/tests/test_obsmvarm.py index 91516de2f..d79c7bb5a 100644 --- a/tests/test_obsmvarm.py +++ b/tests/test_obsmvarm.py @@ -85,21 +85,21 @@ def test_setting_dataframe(adata: AnnData): def test_setting_sparse(adata: AnnData): - obsm_sparse = sparse.random(M, 100) + obsm_sparse = sparse.random(M, 100, format="csr") adata.obsm["a"] = obsm_sparse assert not np.any((adata.obsm["a"] != obsm_sparse).data) - varm_sparse = sparse.random(N, 100) + varm_sparse = sparse.random(N, 100, format="csr") adata.varm["a"] = varm_sparse assert not np.any((adata.varm["a"] != varm_sparse).data) h = joblib.hash(adata) - bad_obsm_sparse = sparse.random(M * 2, M) + bad_obsm_sparse = sparse.random(M * 2, M, format="csr") with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsm["b"] = bad_obsm_sparse - bad_varm_sparse = sparse.random(N * 2, N) + bad_varm_sparse = sparse.random(N * 2, N, format="csr") with pytest.raises(ValueError, match=r"incorrect shape"): adata.varm["b"] = bad_varm_sparse diff --git a/tests/test_obspvarp.py b/tests/test_obspvarp.py index 311a8d2bb..42fc47172 100644 --- a/tests/test_obspvarp.py +++ b/tests/test_obspvarp.py @@ -65,21 +65,21 @@ def test_setting_ndarray(adata: AnnData): def test_setting_sparse(adata: AnnData): - obsp_sparse = sparse.random(M, M) + obsp_sparse = sparse.random(M, M, format="csr") adata.obsp["a"] = obsp_sparse assert not np.any((adata.obsp["a"] != obsp_sparse).data) - varp_sparse = sparse.random(N, N) + varp_sparse = sparse.random(N, N, format="csr") adata.varp["a"] = varp_sparse assert not np.any((adata.varp["a"] != varp_sparse).data) h = joblib.hash(adata) - bad_obsp_sparse = sparse.random(M * 2, M) + bad_obsp_sparse = sparse.random(M * 2, M, format="csr") with pytest.raises(ValueError, match=r"incorrect shape"): adata.obsp["b"] = bad_obsp_sparse - bad_varp_sparse = sparse.random(N * 2, N) + bad_varp_sparse = sparse.random(N * 2, N, format="csr") with pytest.raises(ValueError, match=r"incorrect shape"): adata.varp["b"] = bad_varp_sparse diff --git a/tests/test_raw.py b/tests/test_raw.py index a21f21f8a..d0ee86833 100644 --- a/tests/test_raw.py +++ b/tests/test_raw.py @@ -38,7 +38,7 @@ def adata_raw() -> ad.AnnData: adata = ad.AnnData( np.array(data, dtype="int32"), obs=obs_dict, var=var_dict, uns=uns_dict ) - adata.raw = adata + adata.raw = adata.copy() # Make them different shapes adata = 
adata[:, [0, 1]].copy() return adata @@ -131,7 +131,7 @@ def test_raw_as_parent_view(): # https://github.com/scverse/anndata/issues/288 a = ad.AnnData(np.ones((4, 3))) a.varm["PCs"] = np.ones((3, 3)) - a.raw = a + a.raw = a.copy() # create a Raw containing views. This used to trigger #288. b = a.raw[:, "0"] # actualize @@ -165,3 +165,10 @@ def test_to_adata_populates_obs(): from_raw = adata_w_raw.raw.to_adata() assert_equal(adata, from_raw) + + +def test_no_copy(): + adata = gen_adata((20, 10), X_type=np.asarray) + adata.raw = adata # no .copy() herer + np.log1p(adata.X, out=adata.X) + assert adata.X is adata.raw.X diff --git a/tests/test_readwrite.py b/tests/test_readwrite.py index 8ad2870c4..518559995 100644 --- a/tests/test_readwrite.py +++ b/tests/test_readwrite.py @@ -3,6 +3,7 @@ import re import warnings from contextlib import contextmanager +from functools import partial from importlib.util import find_spec from pathlib import Path from string import ascii_letters @@ -25,6 +26,7 @@ if TYPE_CHECKING: from os import PathLike + from typing import Literal HERE = Path(__file__).parent @@ -133,7 +135,7 @@ def test_readwrite_kitchensink(tmp_path, storage, typ, backing_h5ad, dataset_kwa X = typ(X_list) adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) assert not isinstance(adata_src.obs["oanno1"].dtype, pd.CategoricalDtype) - adata_src.raw = adata_src + adata_src.raw = adata_src.copy() if storage == "h5ad": adata_src.write(backing_h5ad, **dataset_kwargs) @@ -161,16 +163,16 @@ def test_readwrite_kitchensink(tmp_path, storage, typ, backing_h5ad, dataset_kwa if isinstance(adata_src.raw.X, SpArray): assert isinstance(adata.raw.X, sparse.spmatrix) else: - assert isinstance(adata_src.raw.X, (type(adata.raw.X), DaskArray)) + assert isinstance(adata_src.raw.X, type(adata.raw.X) | DaskArray) assert isinstance( - adata_src.uns["uns4"]["c"], (type(adata.uns["uns4"]["c"]), DaskArray) + adata_src.uns["uns4"]["c"], type(adata.uns["uns4"]["c"]) | DaskArray ) - assert isinstance(adata_src.varm, (type(adata.varm), DaskArray)) + assert isinstance(adata_src.varm, type(adata.varm) | DaskArray) assert_equal(adata.raw.X, adata_src.raw.X) pd.testing.assert_frame_equal(adata.raw.var, adata_src.raw.var) - assert isinstance(adata.uns["uns4"]["a"], (int, np.integer)) - assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer)) + assert isinstance(adata.uns["uns4"]["a"], int | np.integer) + assert isinstance(adata_src.uns["uns4"]["a"], int | np.integer) assert_equal(adata, adata_src) @@ -242,7 +244,7 @@ def test_readwrite_equivalent_h5ad_zarr(tmp_path, typ): M, N = 100, 101 adata = gen_adata((M, N), X_type=typ) - adata.raw = adata + adata.raw = adata.copy() adata.write_h5ad(h5ad_pth) adata.write_zarr(zarr_pth) @@ -339,7 +341,7 @@ def test_zarr_compression(tmp_path): compressor = Blosc(cname="zstd", clevel=3, shuffle=Blosc.BITSHUFFLE) not_compressed = [] - ad._io.write_zarr(pth, adata, compressor=compressor) + ad.io.write_zarr(pth, adata, compressor=compressor) def check_compressed(key, value): if isinstance(value, zarr.Array) and value.shape != (): @@ -405,7 +407,7 @@ def test_readwrite_loom(typ, obsm_mapping, varm_mapping, tmp_path): ) adata_src.write_loom(tmp_path / "test.loom", write_obsm_varm=True) - adata = ad.read_loom( + adata = ad.io.read_loom( tmp_path / "test.loom", sparse=typ is csr_matrix, obsm_mapping=obsm_mapping, @@ -455,37 +457,37 @@ def test_readloom_deprecations(tmp_path): # obsm_names -> obsm_mapping obsm_mapping = {"df": adata_src.obs.columns} with 
pytest.warns(FutureWarning): - depr_result = ad.read_loom(loom_pth, obsm_names=obsm_mapping) - actual_result = ad.read_loom(loom_pth, obsm_mapping=obsm_mapping) + depr_result = ad.io.read_loom(loom_pth, obsm_names=obsm_mapping) + actual_result = ad.io.read_loom(loom_pth, obsm_mapping=obsm_mapping) assert_equal(actual_result, depr_result) with pytest.raises(ValueError, match=r"ambiguous"), pytest.warns(FutureWarning): - ad.read_loom(loom_pth, obsm_mapping=obsm_mapping, obsm_names=obsm_mapping) + ad.io.read_loom(loom_pth, obsm_mapping=obsm_mapping, obsm_names=obsm_mapping) # varm_names -> varm_mapping varm_mapping = {"df": adata_src.var.columns} with pytest.warns(FutureWarning): - depr_result = ad.read_loom(loom_pth, varm_names=varm_mapping) - actual_result = ad.read_loom(loom_pth, varm_mapping=varm_mapping) + depr_result = ad.io.read_loom(loom_pth, varm_names=varm_mapping) + actual_result = ad.io.read_loom(loom_pth, varm_mapping=varm_mapping) assert_equal(actual_result, depr_result) with pytest.raises(ValueError, match=r"ambiguous"), pytest.warns(FutureWarning): - ad.read_loom(loom_pth, varm_mapping=varm_mapping, varm_names=varm_mapping) + ad.io.read_loom(loom_pth, varm_mapping=varm_mapping, varm_names=varm_mapping) # positional -> keyword with pytest.warns(FutureWarning, match=r"sparse"): - depr_result = ad.read_loom(loom_pth, True) - actual_result = ad.read_loom(loom_pth, sparse=True) + depr_result = ad.io.read_loom(loom_pth, True) + actual_result = ad.io.read_loom(loom_pth, sparse=True) assert type(depr_result.X) == type(actual_result.X) def test_read_csv(): - adata = ad.read_csv(HERE / "data" / "adata.csv") + adata = ad.io.read_csv(HERE / "data" / "adata.csv") assert adata.obs_names.tolist() == ["r1", "r2", "r3"] assert adata.var_names.tolist() == ["c1", "c2"] assert adata.X.tolist() == X_list def test_read_tsv_strpath(): - adata = ad.read_text(str(HERE / "data" / "adata-comments.tsv"), "\t") + adata = ad.io.read_text(str(HERE / "data" / "adata-comments.tsv"), "\t") assert adata.obs_names.tolist() == ["r1", "r2", "r3"] assert adata.var_names.tolist() == ["c1", "c2"] assert adata.X.tolist() == X_list @@ -493,7 +495,7 @@ def test_read_tsv_strpath(): def test_read_tsv_iter(): with (HERE / "data" / "adata-comments.tsv").open() as f: - adata = ad.read_text(f, "\t") + adata = ad.io.read_text(f, "\t") assert adata.obs_names.tolist() == ["r1", "r2", "r3"] assert adata.var_names.tolist() == ["c1", "c2"] assert adata.X.tolist() == X_list @@ -541,14 +543,14 @@ def hash_dir_contents(dir: Path) -> dict[str, bytes]: @pytest.mark.parametrize( ("read", "write", "name"), [ - pytest.param(ad.read_h5ad, ad._io.write_h5ad, "test_empty.h5ad"), + pytest.param(ad.read_h5ad, ad.io.write_h5ad, "test_empty.h5ad"), pytest.param( - ad.read_loom, - ad._io.write_loom, + ad.io.read_loom, + ad.io.write_loom, "test_empty.loom", marks=pytest.mark.xfail(reason="Loom can’t handle 0×0 matrices"), ), - pytest.param(ad.read_zarr, ad._io.write_zarr, "test_empty.zarr"), + pytest.param(ad.read_zarr, ad.io.write_zarr, "test_empty.zarr"), ], ) def test_readwrite_empty(read, write, name, tmp_path): @@ -565,12 +567,12 @@ def test_read_excel(): message=r"datetime.datetime.utcnow\(\) is deprecated", category=DeprecationWarning, ) - adata = ad.read_excel(HERE / "data/excel.xlsx", "Sheet1", dtype=int) + adata = ad.io.read_excel(HERE / "data/excel.xlsx", "Sheet1", dtype=int) assert adata.X.tolist() == X_list def test_read_umi_tools(): - adata = ad.read_umi_tools(HERE / "data/umi_tools.tsv.gz") + adata = ad.io.read_umi_tools(HERE / 
"data/umi_tools.tsv.gz") assert adata.obs_names.name == "cell" assert adata.var_names.name == "gene" assert adata.shape == (2, 13) @@ -658,30 +660,13 @@ def random_cats(n): assert_equal(orig, curr) -def test_write_string_types(tmp_path, diskfmt): - # https://github.com/scverse/anndata/issues/456 - adata_pth = tmp_path / f"adata.{diskfmt}" - - adata = ad.AnnData( - obs=pd.DataFrame( - np.ones((3, 2)), - columns=["a", np.str_("b")], - index=["a", "b", "c"], - ), - ) - - write = getattr(adata, f"write_{diskfmt}") - read = getattr(ad, f"read_{diskfmt}") - - write(adata_pth) - from_disk = read(adata_pth) - - assert_equal(adata, from_disk) - +def test_write_string_type_error(tmp_path, diskfmt): + adata = ad.AnnData(obs=dict(obs_names=list("abc"))) adata.obs[b"c"] = np.zeros(3) + # This should error, and tell you which key is at fault with pytest.raises(TypeError, match=r"writing key 'obs'") as exc_info: - write(adata_pth) + getattr(adata, f"write_{diskfmt}")(tmp_path / f"adata.{diskfmt}") assert "b'c'" in str(exc_info.value) @@ -722,38 +707,68 @@ def test_zarr_chunk_X(tmp_path): # Round-tripping scanpy datasets ################################ -diskfmt2 = diskfmt + +def _do_roundtrip( + adata: ad.AnnData, pth: Path, diskfmt: Literal["h5ad", "zarr"] +) -> ad.AnnData: + getattr(adata, f"write_{diskfmt}")(pth) + return getattr(ad, f"read_{diskfmt}")(pth) + + +@pytest.fixture +def roundtrip(diskfmt): + return partial(_do_roundtrip, diskfmt=diskfmt) + + +def test_write_string_types(tmp_path, diskfmt, roundtrip): + # https://github.com/scverse/anndata/issues/456 + adata_pth = tmp_path / f"adata.{diskfmt}" + + adata = ad.AnnData( + obs=pd.DataFrame( + np.ones((3, 2)), + columns=["a", np.str_("b")], + index=["a", "b", "c"], + ), + ) + + from_disk = roundtrip(adata, adata_pth) + + assert_equal(adata, from_disk) @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") -def test_scanpy_pbmc68k(tmp_path, diskfmt, diskfmt2): - read1 = lambda pth: getattr(ad, f"read_{diskfmt}")(pth) - write1 = lambda adata, pth: getattr(adata, f"write_{diskfmt}")(pth) - read2 = lambda pth: getattr(ad, f"read_{diskfmt2}")(pth) - write2 = lambda adata, pth: getattr(adata, f"write_{diskfmt2}")(pth) +def test_scanpy_pbmc68k(tmp_path, diskfmt, roundtrip, diskfmt2): + roundtrip2 = partial(_do_roundtrip, diskfmt=diskfmt2) filepth1 = tmp_path / f"test1.{diskfmt}" filepth2 = tmp_path / f"test2.{diskfmt2}" - import scanpy as sc + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message=r"Importing read_.* from `anndata` is deprecated" + ) + import scanpy as sc with warnings.catch_warnings(): warnings.simplefilter("ignore", ad.OldFormatWarning) pbmc = sc.datasets.pbmc68k_reduced() - write1(pbmc, filepth1) - from_disk1 = read1(filepth1) # Do we read okay - write2(from_disk1, filepth2) # Can we round trip - from_disk2 = read2(filepth2) + from_disk1 = roundtrip(pbmc, filepth1) # Do we read okay + from_disk2 = roundtrip2(from_disk1, filepth2) # Can we round trip assert_equal(pbmc, from_disk1) # Not expected to be exact due to `nan`s assert_equal(pbmc, from_disk2) @pytest.mark.skipif(not find_spec("scanpy"), reason="Scanpy is not installed") -def test_scanpy_krumsiek11(tmp_path, diskfmt): +def test_scanpy_krumsiek11(tmp_path, diskfmt, roundtrip): filepth = tmp_path / f"test.{diskfmt}" - import scanpy as sc + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message=r"Importing read_.* from `anndata` is deprecated" + ) + import scanpy as sc # TODO: this should be fixed in scanpy 
instead with pytest.warns(UserWarning, match=r"Observation names are not unique"): @@ -761,11 +776,10 @@ def test_scanpy_krumsiek11(tmp_path, diskfmt): del orig.uns["highlights"] # Can’t write int keys # Can’t write "string" dtype: https://github.com/scverse/anndata/issues/679 orig.obs["cell_type"] = orig.obs["cell_type"].astype(str) - getattr(orig, f"write_{diskfmt}")(filepth) with pytest.warns(UserWarning, match=r"Observation names are not unique"): - read = getattr(ad, f"read_{diskfmt}")(filepth) + curr = roundtrip(orig, filepth) - assert_equal(orig, read, exact=True) + assert_equal(orig, curr, exact=True) # Checking if we can read legacy zarr files @@ -777,7 +791,11 @@ def test_scanpy_krumsiek11(tmp_path, diskfmt): reason="File not present.", ) def test_backwards_compat_zarr(): - import scanpy as sc + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message=r"Importing read_.* from `anndata` is deprecated" + ) + import scanpy as sc import zarr pbmc_orig = sc.datasets.pbmc68k_reduced() @@ -796,11 +814,8 @@ def test_backwards_compat_zarr(): assert_equal(pbmc_zarr, pbmc_orig) -# TODO: use diskfmt fixture once zarr backend implemented -def test_adata_in_uns(tmp_path, diskfmt): +def test_adata_in_uns(tmp_path, diskfmt, roundtrip): pth = tmp_path / f"adatas_in_uns.{diskfmt}" - read = lambda pth: getattr(ad, f"read_{diskfmt}")(pth) - write = lambda adata, pth: getattr(adata, f"write_{diskfmt}")(pth) orig = gen_adata((4, 5)) orig.uns["adatas"] = { @@ -811,20 +826,16 @@ def test_adata_in_uns(tmp_path, diskfmt): another_one.raw = gen_adata((2, 7)) orig.uns["adatas"]["b"].uns["another_one"] = another_one - write(orig, pth) - curr = read(pth) + curr = roundtrip(orig, pth) assert_equal(orig, curr) -def test_io_dtype(tmp_path, diskfmt, dtype): +def test_io_dtype(tmp_path, diskfmt, dtype, roundtrip): pth = tmp_path / f"adata_dtype.{diskfmt}" - read = lambda pth: getattr(ad, f"read_{diskfmt}")(pth) - write = lambda adata, pth: getattr(adata, f"write_{diskfmt}")(pth) orig = ad.AnnData(np.ones((5, 8), dtype=dtype)) - write(orig, pth) - curr = read(pth) + curr = roundtrip(orig, pth) assert curr.X.dtype == dtype diff --git a/tests/test_settings.py b/tests/test_settings.py index 871141d92..ba7dba8f9 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -243,3 +243,16 @@ class TestEnum(Enum): ) def test_describe(as_rst: bool, expected: str, settings: SettingsManager): assert settings.describe("test_var_3", as_rst=as_rst) == expected + + +def test_use_sparse_array_on_read(): + import anndata as ad + + if not ad.compat.CAN_USE_SPARSE_ARRAY: + with pytest.raises( + ValueError, + match=r"scipy.sparse.cs{r,c}array is not available in current scipy version", + ): + ad.settings.use_sparse_array_on_read = True + else: + ad.settings.use_sparse_array_on_read = True diff --git a/tests/test_transpose.py b/tests/test_transpose.py index 720733496..e672cf13d 100644 --- a/tests/test_transpose.py +++ b/tests/test_transpose.py @@ -24,7 +24,7 @@ def test_transpose_orig(): def _add_raw(adata, *, var_subset=slice(None)): new = adata[:, var_subset].copy() - new.raw = adata + new.raw = adata.copy() return new diff --git a/tests/test_views.py b/tests/test_views.py index 2d4a0a78d..6e57e08c7 100644 --- a/tests/test_views.py +++ b/tests/test_views.py @@ -3,6 +3,7 @@ from contextlib import ExitStack from copy import deepcopy from operator import mul +from typing import TYPE_CHECKING import joblib import numpy as np @@ -35,6 +36,9 @@ ) from anndata.utils import asarray +if TYPE_CHECKING: + from 
types import EllipsisType + IGNORE_SPARSE_EFFICIENCY_WARNING = pytest.mark.filterwarnings( "ignore:Changing the sparsity structure:scipy.sparse.SparseEfficiencyWarning" ) @@ -525,7 +529,7 @@ def test_layers_view(): # TODO: This can be flaky. Make that stop def test_view_of_view(matrix_type, subset_func, subset_func2): adata = gen_adata((30, 15), X_type=matrix_type) - adata.raw = adata + adata.raw = adata.copy() if subset_func is single_subset: pytest.xfail("Other subset generating functions have trouble with this") var_s1 = subset_func(adata.var_names, min_size=4) @@ -786,6 +790,30 @@ def test_dataframe_view_index_setting(): assert a2.obs.index.values.tolist() == ["a", "b"] +def test_ellipsis_index( + ellipsis_index: tuple[EllipsisType | slice, ...] | EllipsisType, + equivalent_ellipsis_index: tuple[slice, slice], + matrix_type, +): + adata = gen_adata((10, 10), X_type=matrix_type, **GEN_ADATA_DASK_ARGS) + subset_ellipsis = adata[ellipsis_index] + subset = adata[equivalent_ellipsis_index] + assert_equal(subset_ellipsis, subset) + + +@pytest.mark.parametrize( + ("index", "expected_error"), + [ + ((..., 0, ...), r"only have a single ellipsis"), + ((0, 0, 0), r"Received a length 3 index"), + ], + ids=["ellipsis-int-ellipsis", "int-int-int"], +) +def test_index_3d_errors(index: tuple[int | EllipsisType, ...], expected_error: str): + with pytest.raises(IndexError, match=expected_error): + gen_adata((10, 10))[index] + + # @pytest.mark.parametrize("dim", ["obs", "var"]) # @pytest.mark.parametrize( # ("idx", "pat"), diff --git a/tests/test_x.py b/tests/test_x.py index 2b4504158..64b1bb87d 100644 --- a/tests/test_x.py +++ b/tests/test_x.py @@ -182,3 +182,12 @@ def test_set_dense_x_view_from_sparse(): assert_equal(view.X, x1[:30]) assert_equal(orig.X[:30], x1[:30]) # change propagates through assert_equal(orig.X[30:], x[30:]) # change propagates through + + +def test_warn_on_non_csr_csc_matrix(): + X = sparse.eye(100) + with pytest.warns( + FutureWarning, + match=rf"AnnData previously had undefined behavior around matrices of type {type(X)}.*", + ): + ad.AnnData(X=X)
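
Note on the reader calls touched above: the loom/csv/text/excel/umi-tools readers are exercised through the public `ad.io` namespace rather than the top-level `anndata` module (importing `read_*` from `anndata` now emits the deprecation warning that several tests filter when importing scanpy). A minimal usage sketch, with placeholder file paths, assuming an anndata version that exposes `anndata.io`:

    import anndata as ad

    # File names below are placeholders; any loom/csv/tsv file of the expected layout works.
    adata_csv = ad.io.read_csv("adata.csv")
    adata_tsv = ad.io.read_text("adata-comments.tsv", "\t")
    adata_loom = ad.io.read_loom("test.loom", sparse=True)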
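
The widespread `adata.raw = adata` → `adata.raw = adata.copy()` change, together with the new `test_no_copy`, pins down that assigning `.raw` without an explicit copy keeps a reference to the same arrays, so in-place edits to `X` propagate to `raw.X`. A minimal sketch of both patterns, reusing the `gen_adata` helper the suite already imports:

    import numpy as np
    import anndata as ad
    from anndata.tests.helpers import gen_adata

    # Without .copy(): raw aliases the parent's X (what test_no_copy asserts).
    adata = gen_adata((20, 10), X_type=np.asarray)
    adata.raw = adata
    np.log1p(adata.X, out=adata.X)
    assert adata.X is adata.raw.X

    # With .copy() (the pattern these tests migrate to): raw keeps an independent snapshot.
    adata = gen_adata((20, 10), X_type=np.asarray)
    adata.raw = adata.copy()
    np.log1p(adata.X, out=adata.X)
    assert adata.raw.X is not adata.X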
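
test_read_sparse_array and test_use_sparse_array_on_read introduce the `ad.settings.use_sparse_array_on_read` switch. A minimal round-trip sketch, assuming a scipy version that provides `csr_array`/`csc_array` (on older scipy the assignment raises, as the settings test checks), and using a placeholder file name:

    import h5py
    from scipy import sparse
    import anndata as ad

    a = sparse.random(100, 100, format="csr")

    with h5py.File("sparse_example.h5", "a") as f:  # placeholder path
        ad.io.write_elem(f, "mtx", a)
        ad.settings.use_sparse_array_on_read = True
        # With the setting enabled, the element is read back as a sparse-array class.
        mtx = ad.io.read_elem(f["mtx"])

    assert isinstance(mtx, sparse.csr_array)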
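
test_ellipsis_index and test_index_3d_errors cover Ellipsis support in AnnData indexing; the concrete fixture values live elsewhere in the suite. The sketch below only assumes the numpy-style reading suggested by the equivalence check, i.e. that a single `...` expands to the missing full slices:

    import numpy as np
    import anndata as ad

    adata = ad.AnnData(np.ones((10, 10)))

    # A single ellipsis should expand to ":" for the remaining axis.
    assert adata[..., 0].shape == adata[:, 0].shape

    # Multiple ellipses or a 3-part index are rejected, e.g.:
    #   adata[..., 0, ...]  -> IndexError("only have a single ellipsis")
    #   adata[0, 0, 0]      -> IndexError("Received a length 3 index")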