22 changes: 20 additions & 2 deletions doc/whats-new.rst
@@ -5,9 +5,9 @@
What's New
==========

.. _whats-new.2025.08.1:
.. _whats-new.2025.09.0:

v2025.08.1 (unreleased)
v2025.09.0 (unreleased)
-----------------------

New Features
@@ -36,6 +36,20 @@ Breaking changes
if non-default values are provided in this context (:issue:`10640`,
:pull:`10650`). By `Spencer Clark <https://github.com/spencerkclark>`_.

- The default backend ``engine`` used by :py:meth:`Dataset.to_netcdf`
and :py:meth:`DataTree.to_netcdf` is now chosen consistently with
:py:func:`open_dataset` and :py:func:`open_datatree`, using whichever netCDF
libraries are available and valid, and preferring netCDF4 to h5netcdf to scipy
(:issue:`10654`). This will change the default backend in some edge cases
(e.g., from scipy to h5netcdf when writing to a file-like object or bytes). To
override these new defaults, set ``engine`` explicitly, as sketched below.
By `Stephan Hoyer <https://github.com/shoyer>`_.
- The return value of :py:meth:`Dataset.to_netcdf` without ``path`` is now a
``memoryview`` object instead of ``bytes`` (:pull:`10656`). This removes an
unnecessary memory copy and ensures consistency when using either
``engine="scipy"`` or ``engine="h5netcdf"``. If you need a bytes object,
simply wrap the return value of ``to_netcdf()`` with ``bytes()``.
By `Stephan Hoyer <https://github.com/shoyer>`_.
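
For illustration, a minimal sketch of both changes (not part of the diff; the
dataset contents are hypothetical, and at least one of netCDF4, h5netcdf or
scipy is assumed to be installed):

import io

import xarray as xr

ds = xr.Dataset({"x": ("dim", [1, 2, 3])})

buf = ds.to_netcdf()  # returns a memoryview (previously bytes)
raw = bytes(buf)  # wrap with bytes() if a bytes object is needed

# To opt out of the new engine selection, pass engine explicitly:
target = io.BytesIO()
ds.to_netcdf(target, engine="scipy")  # keep the previous default for file-like targets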

Deprecations
~~~~~~~~~~~~
@@ -54,6 +68,10 @@ Bug fixes
redundant computation of Dask arrays with cross-group dependencies
(:issue:`10637`).
By `Stephan Hoyer <https://github.com/shoyer>`_.
- :py:meth:`DataTree.to_netcdf` had ``h5netcdf`` hard-coded as the default engine
(:issue:`10654`).
By `Stephan Hoyer <https://github.com/shoyer>`_.


Documentation
~~~~~~~~~~~~~
120 changes: 40 additions & 80 deletions xarray/backends/api.py
@@ -17,7 +17,6 @@
from typing import (
TYPE_CHECKING,
Any,
Final,
Literal,
TypeVar,
Union,
@@ -98,69 +97,44 @@
DATAARRAY_NAME = "__xarray_dataarray_name__"
DATAARRAY_VARIABLE = "__xarray_dataarray_variable__"

ENGINES = {
"netcdf4": backends.NetCDF4DataStore.open,
"scipy": backends.ScipyDataStore,
"pydap": backends.PydapDataStore.open,
"h5netcdf": backends.H5NetCDFStore.open,
"zarr": backends.ZarrStore.open_group,
}


def _get_default_engine_remote_uri() -> Literal["netcdf4", "pydap"]:
engine: Literal["netcdf4", "pydap"]
try:
import netCDF4 # noqa: F401

engine = "netcdf4"
except ImportError: # pragma: no cover
try:
import pydap # noqa: F401

engine = "pydap"
except ImportError as err:
raise ValueError(
"netCDF4 or pydap is required for accessing remote datasets via OPeNDAP"
) from err
return engine


def _get_default_engine_gz() -> Literal["scipy"]:
try:
import scipy # noqa: F401
def get_default_netcdf_write_engine(
format: T_NetcdfTypes | None,
to_fileobject_or_memoryview: bool,
) -> Literal["netcdf4", "h5netcdf", "scipy"]:
"""Return the default netCDF library to use for writing a netCDF file."""
module_names = {
"netcdf4": "netCDF4",
"scipy": "scipy",
"h5netcdf": "h5netcdf",
}

engine: Final = "scipy"
except ImportError as err: # pragma: no cover
raise ValueError("scipy is required for accessing .gz files") from err
return engine
candidates = list(plugins.NETCDF_BACKENDS_ORDER)

if format is not None:
if format.upper().startswith("NETCDF3"):
candidates.remove("h5netcdf")
elif format.upper().startswith("NETCDF4"):
candidates.remove("scipy")
else:
raise ValueError(f"unexpected {format=}")

def _get_default_engine_netcdf() -> Literal["netcdf4", "h5netcdf", "scipy"]:
candidates: list[tuple[str, str]] = [
("netcdf4", "netCDF4"),
("h5netcdf", "h5netcdf"),
("scipy", "scipy.io.netcdf"),
]
if to_fileobject_or_memoryview:
candidates.remove("netcdf4")

for engine, module_name in candidates:
for engine in candidates:
module_name = module_names[engine]
if importlib.util.find_spec(module_name) is not None:
return cast(Literal["netcdf4", "h5netcdf", "scipy"], engine)

format_str = f" with {format=}" if format is not None else ""
libraries = ", ".join(module_names[c] for c in candidates)
raise ValueError(
"cannot read or write NetCDF files because none of "
"'netCDF4-python', 'h5netcdf', or 'scipy' are installed"
f"cannot write NetCDF files{format_str} because none of the suitable "
f"backend libraries ({libraries}) are installed"
)
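
For illustration, the selection behaves as follows when all three libraries
are importable (hypothetical calls, not part of the diff):

# Assuming netCDF4, h5netcdf and scipy are all installed:
get_default_netcdf_write_engine(format=None, to_fileobject_or_memoryview=False)
# -> "netcdf4" (first entry in plugins.NETCDF_BACKENDS_ORDER)
get_default_netcdf_write_engine(format="NETCDF3_64BIT", to_fileobject_or_memoryview=False)
# -> "netcdf4" (h5netcdf is dropped for netCDF3 formats)
get_default_netcdf_write_engine(format=None, to_fileobject_or_memoryview=True)
# -> "h5netcdf" (netCDF4 cannot write to file-like objects or memory)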


def _get_default_engine(path: str, allow_remote: bool = False) -> T_NetcdfEngine:
if allow_remote and is_remote_uri(path):
return _get_default_engine_remote_uri() # type: ignore[return-value]
elif path.endswith(".gz"):
return _get_default_engine_gz()
else:
return _get_default_engine_netcdf()


def _validate_dataset_names(dataset: Dataset) -> None:
"""DataArray.name and Dataset keys must be a string or None"""

@@ -1958,7 +1932,7 @@ def to_netcdf(
multifile: Literal[False] = False,
invalid_netcdf: bool = False,
auto_complex: bool | None = None,
) -> bytes | memoryview: ...
) -> memoryview: ...


# compute=False returns dask.Delayed
@@ -2051,7 +2025,7 @@ def to_netcdf(
multifile: bool = False,
invalid_netcdf: bool = False,
auto_complex: bool | None = None,
) -> tuple[ArrayWriter, AbstractDataStore] | bytes | memoryview | Delayed | None: ...
) -> tuple[ArrayWriter, AbstractDataStore] | memoryview | Delayed | None: ...


def to_netcdf(
@@ -2067,41 +2041,22 @@ def to_netcdf(
multifile: bool = False,
invalid_netcdf: bool = False,
auto_complex: bool | None = None,
) -> tuple[ArrayWriter, AbstractDataStore] | bytes | memoryview | Delayed | None:
) -> tuple[ArrayWriter, AbstractDataStore] | memoryview | Delayed | None:
"""This function creates an appropriate datastore for writing a dataset to
disk as a netCDF file

See `Dataset.to_netcdf` for full API docs.

The ``multifile`` argument is only for the private use of save_mfdataset.
"""
if isinstance(path_or_file, os.PathLike):
path_or_file = os.fspath(path_or_file)

if encoding is None:
encoding = {}

if isinstance(path_or_file, str):
if engine is None:
engine = _get_default_engine(path_or_file)
path_or_file = _normalize_path(path_or_file)
else:
# writing to bytes/memoryview or a file-like object
if engine is None:
# TODO: only use 'scipy' if format is None or a netCDF3 format
engine = "scipy"
elif engine not in ("scipy", "h5netcdf"):
raise ValueError(
"invalid engine for creating bytes/memoryview or writing to a "
f"file-like object with to_netcdf: {engine!r}. Only "
"engine=None, engine='scipy' and engine='h5netcdf' is "
"supported."
)
if not compute:
raise NotImplementedError(
"to_netcdf() with compute=False is not yet implemented when "
"returning bytes"
)
path_or_file = _normalize_path(path_or_file)

if engine is None:
to_fileobject_or_memoryview = not isinstance(path_or_file, str)
engine = get_default_netcdf_write_engine(format, to_fileobject_or_memoryview)

# validate Dataset keys, DataArray names, and attr keys/values
_validate_dataset_names(dataset)
Expand All @@ -2121,6 +2076,11 @@ def to_netcdf(
)

if path_or_file is None:
if not compute:
raise NotImplementedError(
"to_netcdf() with compute=False is not yet implemented when "
"returning a memoryview"
)
target = BytesIOProxy()
else:
target = path_or_file # type: ignore[assignment]
@@ -2164,7 +2124,7 @@

if path_or_file is None:
assert isinstance(target, BytesIOProxy) # created in this function
return target.getvalue_or_getbuffer()
return target.getbuffer()

if not compute:
return delayed_close_after_writes(writes, store)
14 changes: 4 additions & 10 deletions xarray/backends/common.py
@@ -11,7 +11,6 @@
TYPE_CHECKING,
Any,
ClassVar,
Generic,
Self,
TypeVar,
Union,
@@ -198,18 +197,13 @@ def _normalize_path_list(
return _normalize_path_list(paths)


BytesOrMemory = TypeVar("BytesOrMemory", bytes, memoryview)


@dataclass
class BytesIOProxy(Generic[BytesOrMemory]):
"""Proxy object for a write that returns either bytes or a memoryview."""
class BytesIOProxy:
Review note (author): I'm keeping around BytesIOProxy because we'll need it for #10624
"""Proxy object for a write that a memoryview."""

# TODO: remove this in favor of BytesIO when Dataset.to_netcdf() stops
# returning bytes from the scipy engine
getvalue: Callable[[], BytesOrMemory] | None = None
getvalue: Callable[[], memoryview] | None = None

def getvalue_or_getbuffer(self) -> BytesOrMemory:
def getbuffer(self) -> memoryview:
"""Get the value of this write as bytes or memory."""
if self.getvalue is None:
raise ValueError("must set getvalue before fetching value")
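
An illustrative sketch of how the proxy is wired up, mirroring the scipy
backend change below (not part of the diff):

import io

proxy = BytesIOProxy()
bio = io.BytesIO()
proxy.getvalue = bio.getbuffer  # the backend exposes its buffer via the proxy
bio.write(b"netCDF file contents")  # the backend writes the file
view = proxy.getbuffer()  # the caller receives a memoryview, without a copy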
4 changes: 2 additions & 2 deletions xarray/backends/plugins.py
@@ -18,7 +18,7 @@
from xarray.backends.common import AbstractDataStore
from xarray.core.types import ReadBuffer

STANDARD_BACKENDS_ORDER = ["netcdf4", "h5netcdf", "scipy"]
NETCDF_BACKENDS_ORDER = ["netcdf4", "h5netcdf", "scipy"]


def remove_duplicates(entrypoints: EntryPoints) -> list[EntryPoint]:
@@ -92,7 +92,7 @@ def sort_backends(
backend_entrypoints: dict[str, type[BackendEntrypoint]],
) -> dict[str, type[BackendEntrypoint]]:
ordered_backends_entrypoints = {}
for be_name in STANDARD_BACKENDS_ORDER:
for be_name in NETCDF_BACKENDS_ORDER:
if backend_entrypoints.get(be_name) is not None:
ordered_backends_entrypoints[be_name] = backend_entrypoints.pop(be_name)
ordered_backends_entrypoints.update(
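
A sketch of the resulting ordering, using hypothetical placeholders for the
entrypoint classes (the tail of the function is truncated above):

# Placeholder values standing in for BackendEntrypoint subclasses:
eps = {"zarr": object, "scipy": object, "netcdf4": object}
list(sort_backends(eps))
# -> ["netcdf4", "scipy", "zarr"]  (netCDF backends first, in NETCDF_BACKENDS_ORDER)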
14 changes: 1 addition & 13 deletions xarray/backends/scipy_.py
@@ -30,7 +30,6 @@
Frozen,
FrozenDict,
close_on_error,
emit_user_level_warning,
module_available,
try_read_magic_number_from_file_or_path,
)
@@ -169,20 +168,9 @@ def __init__(
self.lock = ensure_lock(lock)

if isinstance(filename_or_obj, BytesIOProxy):
emit_user_level_warning(
"return value of to_netcdf() without a target for "
"engine='scipy' is currently bytes, but will switch to "
"memoryview in a future version of Xarray. To silence this "
"warning, use the following pattern or switch to "
"to_netcdf(engine='h5netcdf'):\n"
" target = io.BytesIO()\n"
" dataset.to_netcdf(target)\n"
" result = target.getbuffer()",
FutureWarning,
)
source = filename_or_obj
filename_or_obj = io.BytesIO()
source.getvalue = filename_or_obj.getvalue
source.getvalue = filename_or_obj.getbuffer

if isinstance(filename_or_obj, str): # path
manager = CachingFileManager(
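
In effect (a sketch, assuming scipy is installed):

import xarray as xr

ds = xr.Dataset({"x": ("dim", [1.0, 2.0])})
result = ds.to_netcdf(engine="scipy")  # now a memoryview; no FutureWarning
assert isinstance(result, memoryview)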
16 changes: 7 additions & 9 deletions xarray/core/dataarray.py
@@ -4067,7 +4067,7 @@ def to_netcdf(
compute: bool = True,
invalid_netcdf: bool = False,
auto_complex: bool | None = None,
) -> bytes | memoryview: ...
) -> memoryview: ...

# compute=False returns dask.Delayed
@overload
@@ -4131,17 +4131,15 @@ def to_netcdf(
compute: bool = True,
invalid_netcdf: bool = False,
auto_complex: bool | None = None,
) -> bytes | memoryview | Delayed | None:
) -> memoryview | Delayed | None:
"""Write DataArray contents to a netCDF file.

Parameters
----------
path : str, path-like or None, optional
Path to which to save this dataset. File-like objects are only
supported by the scipy engine. If no path is provided, this
function returns the resulting netCDF file as bytes; in this case,
we need to use scipy, which does not support netCDF version 4 (the
default format becomes NETCDF3_64BIT).
path : str, path-like, file-like or None, optional
Path to which to save this DataArray, or a file-like object to write
it to (which must support read and write and be seekable), or None
(default) to return the in-memory netCDF file as a ``memoryview``.
mode : {"w", "a"}, default: "w"
Write ('w') or append ('a') mode. If mode='w', any existing file at
this location will be overwritten. If mode='a', existing variables
@@ -4201,7 +4199,7 @@

Returns
-------
* ``bytes`` or ``memoryview`` if path is None
* ``memoryview`` if path is None
* ``dask.delayed.Delayed`` if compute is False
* None otherwise

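
For example, a round trip through memory (a sketch assuming the h5netcdf or
scipy backend is installed):

import io

import xarray as xr

da = xr.DataArray([1, 2, 3], dims="x", name="foo")
buf = da.to_netcdf()  # memoryview of the in-memory netCDF file
roundtrip = xr.open_dataarray(io.BytesIO(buf))  # read it back from memory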
16 changes: 7 additions & 9 deletions xarray/core/dataset.py
@@ -1953,7 +1953,7 @@ def to_netcdf(
compute: bool = True,
invalid_netcdf: bool = False,
auto_complex: bool | None = None,
) -> bytes | memoryview: ...
) -> memoryview: ...

# compute=False returns dask.Delayed
@overload
@@ -2017,17 +2017,15 @@ def to_netcdf(
compute: bool = True,
invalid_netcdf: bool = False,
auto_complex: bool | None = None,
) -> bytes | memoryview | Delayed | None:
) -> memoryview | Delayed | None:
"""Write dataset contents to a netCDF file.

Parameters
----------
path : str, path-like or file-like, optional
Path to which to save this dataset. File-like objects are only
supported by the scipy engine. If no path is provided, this
function returns the resulting netCDF file as bytes; in this case,
we need to use scipy, which does not support netCDF version 4 (the
default format becomes NETCDF3_64BIT).
path : str, path-like, file-like or None, optional
Path to which to save this dataset, or a file-like object to write
it to (which must support read and write and be seekable), or None
(default) to return the in-memory netCDF file as a ``memoryview``.
mode : {"w", "a"}, default: "w"
Write ('w') or append ('a') mode. If mode='w', any existing file at
this location will be overwritten. If mode='a', existing variables
@@ -2089,7 +2087,7 @@

Returns
-------
* ``bytes`` or ``memoryview`` if path is None
* ``memoryview`` if path is None
* ``dask.delayed.Delayed`` if compute is False
* ``None`` otherwise

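
And similarly for Dataset, writing either to memory or to a seekable
file-like object (a sketch under the same assumptions):

import io

import xarray as xr

ds = xr.Dataset({"x": ("dim", [1, 2, 3])})

buf = ds.to_netcdf()  # path=None: returns a memoryview
fileobj = io.BytesIO()
ds.to_netcdf(fileobj)  # file-like target: must support read, write and seek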