From 50d97e9d35bac783850827fa66ff5eb768e62905 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Tue, 9 Mar 2021 02:04:00 +0100 Subject: [PATCH] Switch backend API to v2 (#4989) * Switch backend API to v2 and drop v1 * Remove APIv1 functions now unused * Clean up imports * Fix code style * fix open_dataset documentation * fix open_dataarray doc * Remove stale test on removed functions * Fix entry point definition * Fix API documentation errors. * Add informative error message when using positional arguments * Add `*args` check to `open_dataarray` as well * Add whats-new entries Co-authored-by: Aureliana Barghini --- .github/workflows/ci-additional.yaml | 8 +- doc/internals.rst | 1 + doc/whats-new.rst | 16 +- xarray/backends/api.py | 579 +++++++++++++-------------- xarray/backends/apiv2.py | 286 ------------- xarray/backends/pynio_.py | 1 + xarray/core/utils.py | 6 - xarray/tests/test_backends_api.py | 5 - xarray/tests/test_utils.py | 9 - 9 files changed, 288 insertions(+), 623 deletions(-) delete mode 100644 xarray/backends/apiv2.py diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 4bf85458211..5d9c94d639c 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -42,7 +42,6 @@ jobs: "py37-min-all-deps", "py37-min-nep18", "py38-all-but-dask", - "py38-backend-api-v2", "py38-flaky", ] steps: @@ -56,12 +55,7 @@ jobs: - name: Set environment variables run: | - if [[ ${{ matrix.env }} == "py38-backend-api-v2" ]] ; - then - echo "CONDA_ENV_FILE=ci/requirements/environment.yml" >> $GITHUB_ENV - echo "XARRAY_BACKEND_API=v2" >> $GITHUB_ENV - - elif [[ ${{ matrix.env }} == "py38-flaky" ]] ; + if [[ ${{ matrix.env }} == "py38-flaky" ]] ; then echo "CONDA_ENV_FILE=ci/requirements/environment.yml" >> $GITHUB_ENV echo "PYTEST_EXTRA_FLAGS=--run-flaky --run-network-tests" >> $GITHUB_ENV diff --git a/doc/internals.rst b/doc/internals.rst index 9998043b35b..f94371731de 100644 --- a/doc/internals.rst +++ b/doc/internals.rst @@ -249,6 +249,7 @@ re-open it directly with Zarr: print(zgroup.tree()) dict(zgroup["Tair"].attrs) +.. _add_a_backend: How to add a new backend ------------------------ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9c16fb74a7b..cbafe4d5999 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -27,15 +27,25 @@ New Features - Support for `dask.graph_manipulation `_ (requires dask >=2021.3) By `Guido Imperiale `_ +- Thanks to the new pluggable backend infrastructure external packages may now + use the ``xarray.backends`` entry point to register additional engines to be used in + :py:func:`open_dataset`, see the documentation in :ref:`add_a_backend` + (:issue:`4309`, :issue:`4803`, :pull:`4989`, :pull:`4810` and many others). + The backend refactor has been sponsored with the "Essential Open Source Software for Science" + grant from the `Chan Zuckerberg Initiative `_ and + developed by `B-Open `_. + By `Aureliana Barghini `_ and `Alessandro Amici `_. Breaking changes ~~~~~~~~~~~~~~~~ - +- :py:func:`open_dataset` and :py:func:`open_dataarray` now accept only the first argument + as positional, all others need to be passed are keyword arguments. This is part of the + refactor to support external backends (:issue:`4309`, :pull:`4989`). + By `Alessandro Amici `_. Deprecations ~~~~~~~~~~~~ - Bug fixes ~~~~~~~~~ - Don't allow passing ``axis`` to :py:meth:`Dataset.reduce` methods (:issue:`3510`, :pull:`4940`). @@ -43,6 +53,8 @@ Bug fixes Documentation ~~~~~~~~~~~~~ +- New section on :ref:`add_a_backend` in the "Internals" chapter aimed to backend developers + (:issue:`4803`, :pull:`4810`). By `Aureliana Barghini `_. Internal Changes diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 4fa34b39925..382fad60acb 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -26,7 +26,8 @@ ) from ..core.dataarray import DataArray from ..core.dataset import Dataset, _get_chunk, _maybe_chunk -from ..core.utils import close_on_error, is_grib_path, is_remote_uri, read_magic_number +from ..core.utils import is_remote_uri +from . import plugins from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler @@ -70,26 +71,6 @@ def _get_default_engine_remote_uri(): return engine -def _get_default_engine_grib(): - msgs = [] - try: - import Nio # noqa: F401 - - msgs += ["set engine='pynio' to access GRIB files with PyNIO"] - except ImportError: # pragma: no cover - pass - try: - import cfgrib # noqa: F401 - - msgs += ["set engine='cfgrib' to access GRIB files with cfgrib"] - except ImportError: # pragma: no cover - pass - if msgs: - raise ValueError(" or\n".join(msgs)) - else: - raise ValueError("PyNIO or cfgrib is required for accessing GRIB files") - - def _get_default_engine_gz(): try: import scipy # noqa: F401 @@ -118,27 +99,9 @@ def _get_default_engine_netcdf(): return engine -def _get_engine_from_magic_number(filename_or_obj): - magic_number = read_magic_number(filename_or_obj) - - if magic_number.startswith(b"CDF"): - engine = "scipy" - elif magic_number.startswith(b"\211HDF\r\n\032\n"): - engine = "h5netcdf" - else: - raise ValueError( - "cannot guess the engine, " - f"{magic_number} is not the signature of any supported file format " - "did you mean to pass a string for a path instead?" - ) - return engine - - def _get_default_engine(path: str, allow_remote: bool = False): if allow_remote and is_remote_uri(path): engine = _get_default_engine_remote_uri() - elif is_grib_path(path): - engine = _get_default_engine_grib() elif path.endswith(".gz"): engine = _get_default_engine_gz() else: @@ -146,27 +109,6 @@ def _get_default_engine(path: str, allow_remote: bool = False): return engine -def _autodetect_engine(filename_or_obj): - if isinstance(filename_or_obj, AbstractDataStore): - engine = "store" - elif isinstance(filename_or_obj, (str, Path)): - engine = _get_default_engine(str(filename_or_obj), allow_remote=True) - else: - engine = _get_engine_from_magic_number(filename_or_obj) - return engine - - -def _get_backend_cls(engine, engines=ENGINES): - """Select open_dataset method based on current engine""" - try: - return engines[engine] - except KeyError: - raise ValueError( - "unrecognized engine for open_dataset: {}\n" - "must be one of: {}".format(engine, list(ENGINES)) - ) - - def _normalize_path(path): if isinstance(path, Path): path = str(path) @@ -236,6 +178,31 @@ def check_attr(name, value): check_attr(k, v) +def _resolve_decoders_kwargs(decode_cf, open_backend_dataset_parameters, **decoders): + for d in list(decoders): + if decode_cf is False and d in open_backend_dataset_parameters: + decoders[d] = False + if decoders[d] is None: + decoders.pop(d) + return decoders + + +def _get_mtime(filename_or_obj): + # if passed an actual file path, augment the token with + # the file modification time + mtime = None + + try: + path = os.fspath(filename_or_obj) + except TypeError: + path = None + + if path and not is_remote_uri(path): + mtime = os.path.getmtime(filename_or_obj) + + return mtime + + def _protect_dataset_variables_inplace(dataset, cache): for name, variable in dataset.variables.items(): if name not in variable.dims: @@ -304,22 +271,90 @@ def load_dataarray(filename_or_obj, **kwargs): return da.load() +def _chunk_ds( + backend_ds, + filename_or_obj, + engine, + chunks, + overwrite_encoded_chunks, + **extra_tokens, +): + from dask.base import tokenize + + mtime = _get_mtime(filename_or_obj) + token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) + name_prefix = "open_dataset-%s" % token + + variables = {} + for name, var in backend_ds.variables.items(): + var_chunks = _get_chunk(var, chunks) + variables[name] = _maybe_chunk( + name, + var, + var_chunks, + overwrite_encoded_chunks=overwrite_encoded_chunks, + name_prefix=name_prefix, + token=token, + ) + ds = backend_ds._replace(variables) + return ds + + +def _dataset_from_backend_dataset( + backend_ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + **extra_tokens, +): + if not (isinstance(chunks, (int, dict)) or chunks is None): + if chunks != "auto": + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks + ) + + _protect_dataset_variables_inplace(backend_ds, cache) + if chunks is None: + ds = backend_ds + else: + ds = _chunk_ds( + backend_ds, + filename_or_obj, + engine, + chunks, + overwrite_encoded_chunks, + **extra_tokens, + ) + + ds.set_close(backend_ds._close) + + # Ensure source filename always stored in dataset object (GH issue #2550) + if "source" not in ds.encoding: + if isinstance(filename_or_obj, str): + ds.encoding["source"] = filename_or_obj + + return ds + + def open_dataset( filename_or_obj, - group=None, - decode_cf=True, - mask_and_scale=None, - decode_times=True, - concat_characters=True, - decode_coords=True, + *args, engine=None, chunks=None, - lock=None, cache=None, + decode_cf=None, + mask_and_scale=None, + decode_times=None, + decode_timedelta=None, + use_cftime=None, + concat_characters=None, + decode_coords=None, drop_variables=None, backend_kwargs=None, - use_cftime=None, - decode_timedelta=None, + **kwargs, ): """Open and decode a dataset from a file or file-like object. @@ -331,9 +366,26 @@ def open_dataset( ends with .gz, in which case the file is gunzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - group : str, optional - Path to the netCDF4 group in the given file to open (only works for - netCDF4 files). + engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ + "pseudonetcdf", "zarr"}, optional + Engine to use when reading files. If not provided, the default engine + is chosen based on available dependencies, with a preference for + "netcdf4". + chunks : int or dict, optional + If chunks is provided, it is used to load the new dataset into dask + arrays. ``chunks=-1`` loads the dataset with dask using a single + chunk for all arrays. `chunks={}`` loads the dataset with dask using + engine preferred chunks if exposed by the backend, otherwise with + a single chunk for all arrays. + ``chunks='auto'`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. See dask chunking for more details. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. @@ -345,15 +397,33 @@ def open_dataset( `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. + pseudonetcdf backend. This keyword may not be supported by all the backends. decode_times : bool, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. + This keyword may not be supported by all the backends. + decode_timedelta : bool, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of decode_time. + This keyword may not be supported by all the backends. + use_cftime: bool, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. This keyword may not be supported by all the backends. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. + This keyword may not be supported by all the backends. decode_coords : bool or {"coordinates", "all"}, optional Controls which variables are set as coordinate variables: @@ -362,54 +432,26 @@ def open_dataset( as coordinate variables. - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and other attributes as coordinate variables. - engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ - "pseudonetcdf", "zarr"}, optional - Engine to use when reading files. If not provided, the default engine - is chosen based on available dependencies, with a preference for - "netcdf4". - chunks : int or dict, optional - If chunks is provided, it is used to load the new dataset into dask - arrays. ``chunks=-1`` loads the dataset with dask using a single - chunk for all arrays. `chunks={}`` loads the dataset with dask using - engine preferred chunks if exposed by the backend, otherwise with - a single chunk for all arrays. - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. See dask chunking for more details. - lock : False or lock-like, optional - Resource lock to use when reading data from disk. Only relevant when - using dask or another form of parallelism. By default, appropriate - locks are chosen to safely read and write files with the currently - active dask scheduler. - cache : bool, optional - If True, cache data loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. drop_variables: str or iterable, optional A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. - backend_kwargs: dict, optional - A dictionary of keyword arguments to pass on to the backend. This - may be useful when backend options would improve performance or - allow user control of dataset processing. - use_cftime: bool, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. - decode_timedelta : bool, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, leave them encoded as numbers. - If None (default), assume the same value of decode_time. + backend_kwargs: dict + Additional keyword arguments passed on to the engine open function, + equivalent to `**kwargs`. + **kwargs: dict + Additional keyword arguments passed on to the engine open function. + For example: + + - 'group': path to the netCDF4 group in the given file to open given as + a str,supported by "netcdf4", "h5netcdf", "zarr". + - 'lock': resource lock to use when reading data from disk. Only + relevant when using dask or another form of parallelism. By default, + appropriate locks are chosen to safely read and write files with the + currently active dask scheduler. Supported by "netcdf4", "h5netcdf", + "pynio", "pseudonetcdf", "cfgrib". + + See engine open function for kwargs accepted by each specific engine. Returns ------- @@ -427,159 +469,72 @@ def open_dataset( -------- open_mfdataset """ - if os.environ.get("XARRAY_BACKEND_API", "v1") == "v2": - kwargs = {k: v for k, v in locals().items() if v is not None} - from . import apiv2 - - return apiv2.open_dataset(**kwargs) - - if mask_and_scale is None: - mask_and_scale = not engine == "pseudonetcdf" - - if not decode_cf: - mask_and_scale = False - decode_times = False - concat_characters = False - decode_coords = False - decode_timedelta = False + if len(args) > 0: + raise TypeError( + "open_dataset() takes only 1 positional argument starting from version 0.18.0, " + "all other options must be passed as keyword arguments" + ) if cache is None: cache = chunks is None - if backend_kwargs is None: - backend_kwargs = {} - - def maybe_decode_store(store, chunks): - ds = conventions.decode_cf( - store, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - concat_characters=concat_characters, - decode_coords=decode_coords, - drop_variables=drop_variables, - use_cftime=use_cftime, - decode_timedelta=decode_timedelta, - ) + if backend_kwargs is not None: + kwargs.update(backend_kwargs) - _protect_dataset_variables_inplace(ds, cache) - - if chunks is not None and engine != "zarr": - from dask.base import tokenize - - # if passed an actual file path, augment the token with - # the file modification time - if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj): - mtime = os.path.getmtime(filename_or_obj) - else: - mtime = None - token = tokenize( - filename_or_obj, - mtime, - group, - decode_cf, - mask_and_scale, - decode_times, - concat_characters, - decode_coords, - engine, - chunks, - drop_variables, - use_cftime, - decode_timedelta, - ) - name_prefix = "open_dataset-%s" % token - ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) - - elif engine == "zarr": - # adapted from Dataset.Chunk() and taken from open_zarr - if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks - ) - - if chunks == "auto": - try: - import dask.array # noqa - except ImportError: - chunks = None - - # auto chunking needs to be here and not in ZarrStore because - # the variable chunks does not survive decode_cf - # return trivial case - if chunks is None: - return ds - - if isinstance(chunks, int): - chunks = dict.fromkeys(ds.dims, chunks) - - variables = { - k: _maybe_chunk( - k, - v, - _get_chunk(v, chunks), - overwrite_encoded_chunks=overwrite_encoded_chunks, - ) - for k, v in ds.variables.items() - } - ds2 = ds._replace(variables) - - else: - ds2 = ds - ds2.set_close(ds._close) - return ds2 - - filename_or_obj = _normalize_path(filename_or_obj) - - if isinstance(filename_or_obj, AbstractDataStore): - store = filename_or_obj - else: - if engine is None: - engine = _autodetect_engine(filename_or_obj) - - extra_kwargs = {} - if group is not None: - extra_kwargs["group"] = group - if lock is not None: - extra_kwargs["lock"] = lock - - if engine == "zarr": - backend_kwargs = backend_kwargs.copy() - overwrite_encoded_chunks = backend_kwargs.pop( - "overwrite_encoded_chunks", None - ) + if engine is None: + engine = plugins.guess_engine(filename_or_obj) - opener = _get_backend_cls(engine) - store = opener(filename_or_obj, **extra_kwargs, **backend_kwargs) + backend = plugins.get_backend(engine) - with close_on_error(store): - ds = maybe_decode_store(store, chunks) + decoders = _resolve_decoders_kwargs( + decode_cf, + open_backend_dataset_parameters=backend.open_dataset_parameters, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + decode_timedelta=decode_timedelta, + concat_characters=concat_characters, + use_cftime=use_cftime, + decode_coords=decode_coords, + ) - # Ensure source filename always stored in dataset object (GH issue #2550) - if "source" not in ds.encoding: - if isinstance(filename_or_obj, str): - ds.encoding["source"] = filename_or_obj + overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) + backend_ds = backend.open_dataset( + filename_or_obj, + drop_variables=drop_variables, + **decoders, + **kwargs, + ) + ds = _dataset_from_backend_dataset( + backend_ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + drop_variables=drop_variables, + **decoders, + **kwargs, + ) return ds def open_dataarray( filename_or_obj, - group=None, - decode_cf=True, - mask_and_scale=None, - decode_times=True, - concat_characters=True, - decode_coords=True, + *args, engine=None, chunks=None, - lock=None, cache=None, + decode_cf=None, + mask_and_scale=None, + decode_times=None, + decode_timedelta=None, + use_cftime=None, + concat_characters=None, + decode_coords=None, drop_variables=None, backend_kwargs=None, - use_cftime=None, - decode_timedelta=None, + **kwargs, ): """Open an DataArray from a file or file-like object containing a single data variable. @@ -590,14 +545,31 @@ def open_dataarray( Parameters ---------- filename_or_obj : str, Path, file-like or DataStore - Strings and Paths are interpreted as a path to a netCDF file or an - OpenDAP URL and opened with python-netCDF4, unless the filename ends - with .gz, in which case the file is gunzipped and opened with + Strings and Path objects are interpreted as a path to a netCDF file + or an OpenDAP URL and opened with python-netCDF4, unless the filename + ends with .gz, in which case the file is gunzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - group : str, optional - Path to the netCDF4 group in the given file to open (only works for - netCDF4 files). + engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ + "pseudonetcdf", "zarr"}, optional + Engine to use when reading files. If not provided, the default engine + is chosen based on available dependencies, with a preference for + "netcdf4". + chunks : int or dict, optional + If chunks is provided, it is used to load the new dataset into dask + arrays. ``chunks=-1`` loads the dataset with dask using a single + chunk for all arrays. `chunks={}`` loads the dataset with dask using + engine preferred chunks if exposed by the backend, otherwise with + a single chunk for all arrays. + ``chunks='auto'`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. See dask chunking for more details. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. @@ -609,15 +581,33 @@ def open_dataarray( `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. + pseudonetcdf backend. This keyword may not be supported by all the backends. decode_times : bool, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. + This keyword may not be supported by all the backends. + decode_timedelta : bool, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of decode_time. + This keyword may not be supported by all the backends. + use_cftime: bool, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. This keyword may not be supported by all the backends. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. + This keyword may not be supported by all the backends. decode_coords : bool or {"coordinates", "all"}, optional Controls which variables are set as coordinate variables: @@ -626,51 +616,26 @@ def open_dataarray( as coordinate variables. - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and other attributes as coordinate variables. - engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib"}, \ - optional - Engine to use when reading files. If not provided, the default engine - is chosen based on available dependencies, with a preference for - "netcdf4". - chunks : int or dict, optional - If chunks is provided, it used to load the new dataset into dask - arrays. - lock : False or lock-like, optional - Resource lock to use when reading data from disk. Only relevant when - using dask or another form of parallelism. By default, appropriate - locks are chosen to safely read and write files with the currently - active dask scheduler. - cache : bool, optional - If True, cache data loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. drop_variables: str or iterable, optional A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. - backend_kwargs: dict, optional - A dictionary of keyword arguments to pass on to the backend. This - may be useful when backend options would improve performance or - allow user control of dataset processing. If using fsspec URLs, - include the key "storage_options" to pass arguments to the - storage layer. - use_cftime: bool, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. - decode_timedelta : bool, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, leave them encoded as numbers. - If None (default), assume the same value of decode_time. + backend_kwargs: dict + Additional keyword arguments passed on to the engine open function, + equivalent to `**kwargs`. + **kwargs: dict + Additional keyword arguments passed on to the engine open function. + For example: + + - 'group': path to the netCDF4 group in the given file to open given as + a str,supported by "netcdf4", "h5netcdf", "zarr". + - 'lock': resource lock to use when reading data from disk. Only + relevant when using dask or another form of parallelism. By default, + appropriate locks are chosen to safely read and write files with the + currently active dask scheduler. Supported by "netcdf4", "h5netcdf", + "pynio", "pseudonetcdf", "cfgrib". + + See engine open function for kwargs accepted by each specific engine. Notes ----- @@ -685,10 +650,14 @@ def open_dataarray( -------- open_dataset """ + if len(args) > 0: + raise TypeError( + "open_dataarray() takes only 1 positional argument starting from version 0.18.0, " + "all other options must be passed as keyword arguments" + ) dataset = open_dataset( filename_or_obj, - group=group, decode_cf=decode_cf, mask_and_scale=mask_and_scale, decode_times=decode_times, @@ -696,12 +665,12 @@ def open_dataarray( decode_coords=decode_coords, engine=engine, chunks=chunks, - lock=lock, cache=cache, drop_variables=drop_variables, backend_kwargs=backend_kwargs, use_cftime=use_cftime, decode_timedelta=decode_timedelta, + **kwargs, ) if len(dataset.data_vars) != 1: @@ -734,7 +703,6 @@ def open_mfdataset( compat="no_conflicts", preprocess=None, engine=None, - lock=None, data_vars="all", coords="different", combine="by_coords", @@ -804,11 +772,6 @@ def open_mfdataset( Engine to use when reading files. If not provided, the default engine is chosen based on available dependencies, with a preference for "netcdf4". - lock : False or lock-like, optional - Resource lock to use when reading data from disk. Only relevant when - using dask or another form of parallelism. By default, appropriate - locks are chosen to safely read and write files with the currently - active dask scheduler. data_vars : {"minimal", "different", "all"} or list of str, optional These data variables will be concatenated together: * "minimal": Only data variables in which the dimension already @@ -923,7 +886,7 @@ def open_mfdataset( combined_ids_paths = _infer_concat_order_from_positions(paths) ids, paths = (list(combined_ids_paths.keys()), list(combined_ids_paths.values())) - open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock, **kwargs) + open_kwargs = dict(engine=engine, chunks=chunks or {}, **kwargs) if parallel: import dask diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py deleted file mode 100644 index de1b3e1bb29..00000000000 --- a/xarray/backends/apiv2.py +++ /dev/null @@ -1,286 +0,0 @@ -import os - -from ..core import indexing -from ..core.dataset import _get_chunk, _maybe_chunk -from ..core.utils import is_remote_uri -from . import plugins - - -def _protect_dataset_variables_inplace(dataset, cache): - for name, variable in dataset.variables.items(): - if name not in variable.dims: - # no need to protect IndexVariable objects - data = indexing.CopyOnWriteArray(variable._data) - if cache: - data = indexing.MemoryCachedArray(data) - variable.data = data - - -def _get_mtime(filename_or_obj): - # if passed an actual file path, augment the token with - # the file modification time - mtime = None - - try: - path = os.fspath(filename_or_obj) - except TypeError: - path = None - - if path and not is_remote_uri(path): - mtime = os.path.getmtime(filename_or_obj) - - return mtime - - -def _chunk_ds( - backend_ds, - filename_or_obj, - engine, - chunks, - overwrite_encoded_chunks, - **extra_tokens, -): - from dask.base import tokenize - - mtime = _get_mtime(filename_or_obj) - token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) - name_prefix = "open_dataset-%s" % token - - variables = {} - for name, var in backend_ds.variables.items(): - var_chunks = _get_chunk(var, chunks) - variables[name] = _maybe_chunk( - name, - var, - var_chunks, - overwrite_encoded_chunks=overwrite_encoded_chunks, - name_prefix=name_prefix, - token=token, - ) - ds = backend_ds._replace(variables) - return ds - - -def _dataset_from_backend_dataset( - backend_ds, - filename_or_obj, - engine, - chunks, - cache, - overwrite_encoded_chunks, - **extra_tokens, -): - if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks - ) - - _protect_dataset_variables_inplace(backend_ds, cache) - if chunks is None: - ds = backend_ds - else: - ds = _chunk_ds( - backend_ds, - filename_or_obj, - engine, - chunks, - overwrite_encoded_chunks, - **extra_tokens, - ) - - ds.set_close(backend_ds._close) - - # Ensure source filename always stored in dataset object (GH issue #2550) - if "source" not in ds.encoding: - if isinstance(filename_or_obj, str): - ds.encoding["source"] = filename_or_obj - - return ds - - -def _resolve_decoders_kwargs(decode_cf, open_backend_dataset_parameters, **decoders): - for d in list(decoders): - if decode_cf is False and d in open_backend_dataset_parameters: - decoders[d] = False - if decoders[d] is None: - decoders.pop(d) - return decoders - - -def open_dataset( - filename_or_obj, - *, - engine=None, - chunks=None, - cache=None, - decode_cf=None, - mask_and_scale=None, - decode_times=None, - decode_timedelta=None, - use_cftime=None, - concat_characters=None, - decode_coords=None, - drop_variables=None, - backend_kwargs=None, - **kwargs, -): - """Open and decode a dataset from a file or file-like object. - - Parameters - ---------- - filename_or_obj : str, Path, file-like or DataStore - Strings and Path objects are interpreted as a path to a netCDF file - or an OpenDAP URL and opened with python-netCDF4, unless the filename - ends with .gz, in which case the file is unzipped and opened with - scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like - objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - engine : str, optional - Engine to use when reading files. If not provided, the default engine - is chosen based on available dependencies, with a preference for - "netcdf4". Options are: {"netcdf4", "scipy", "pydap", "h5netcdf",\ - "pynio", "cfgrib", "pseudonetcdf", "zarr"}. - chunks : int or dict, optional - If chunks is provided, it is used to load the new dataset into dask - arrays. ``chunks=-1`` loads the dataset with dask using a single - chunk for all arrays. `chunks={}`` loads the dataset with dask using - engine preferred chunks if exposed by the backend, otherwise with - a single chunk for all arrays. - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. See dask chunking for more details. - cache : bool, optional - If True, cache data is loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. - decode_cf : bool, optional - Setting ``decode_cf=False`` will disable ``mask_and_scale``, - ``decode_times``, ``decode_timedelta``, ``concat_characters``, - ``decode_coords``. - mask_and_scale : bool, optional - If True, array values equal to `_FillValue` are replaced with NA and other - values are scaled according to the formula `original_values * scale_factor + - add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are - taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values, a warning will be - issued and all array values matching one of the multiple values will - be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. This keyword may not be supported by all the backends. - decode_times : bool, optional - If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. - This keyword may not be supported by all the backends. - decode_timedelta : bool, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, they remain encoded as numbers. - If None (default), assume the same value of decode_time. - This keyword may not be supported by all the backends. - use_cftime: bool, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. This keyword may not be supported by all the backends. - concat_characters : bool, optional - If True, concatenate along the last dimension of character arrays to - form string arrays. Dimensions will only be concatenated over (and - removed) if they have no corresponding variable and if they are only - used as the last dimension of character arrays. - This keyword may not be supported by all the backends. - decode_coords : bool or {"coordinates", "all"}, optional - Controls which variables are set as coordinate variables: - - - "coordinates" or True: Set variables referred to in the - ``'coordinates'`` attribute of the datasets or individual variables - as coordinate variables. - - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and - other attributes as coordinate variables. - drop_variables: str or iterable, optional - A variable or list of variables to exclude from the dataset parsing. - This may be useful to drop variables with problems or - inconsistent values. - backend_kwargs: - Additional keyword arguments passed on to the engine open function. - **kwargs: dict - Additional keyword arguments passed on to the engine open function. - For example: - - - 'group': path to the netCDF4 group in the given file to open given as - a str,supported by "netcdf4", "h5netcdf", "zarr". - - - 'lock': resource lock to use when reading data from disk. Only - relevant when using dask or another form of parallelism. By default, - appropriate locks are chosen to safely read and write files with the - currently active dask scheduler. Supported by "netcdf4", "h5netcdf", - "pynio", "pseudonetcdf", "cfgrib". - - See engine open function for kwargs accepted by each specific engine. - - - Returns - ------- - dataset : Dataset - The newly created dataset. - - Notes - ----- - ``open_dataset`` opens the file with read-only access. When you modify - values of a Dataset, even one linked to files on disk, only the in-memory - copy you are manipulating in xarray is modified: the original file on disk - is never touched. - - See Also - -------- - open_mfdataset - """ - - if cache is None: - cache = chunks is None - - if backend_kwargs is not None: - kwargs.update(backend_kwargs) - - if engine is None: - engine = plugins.guess_engine(filename_or_obj) - - backend = plugins.get_backend(engine) - - decoders = _resolve_decoders_kwargs( - decode_cf, - open_backend_dataset_parameters=backend.open_dataset_parameters, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - decode_timedelta=decode_timedelta, - concat_characters=concat_characters, - use_cftime=use_cftime, - decode_coords=decode_coords, - ) - - overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) - backend_ds = backend.open_dataset( - filename_or_obj, - drop_variables=drop_variables, - **decoders, - **kwargs, - ) - ds = _dataset_from_backend_dataset( - backend_ds, - filename_or_obj, - engine, - chunks, - cache, - overwrite_encoded_chunks, - drop_variables=drop_variables, - **decoders, - **kwargs, - ) - - return ds diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index cb9b9f4a8f0..ea9841da21e 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -99,6 +99,7 @@ def close(self): class PynioBackendEntrypoint(BackendEntrypoint): def open_dataset( + self, filename_or_obj, mask_and_scale=True, decode_times=True, diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 9648458ec6d..f9c8523306c 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -4,7 +4,6 @@ import functools import io import itertools -import os.path import re import warnings from enum import Enum @@ -671,11 +670,6 @@ def read_magic_number(filename_or_obj, count=8): return magic_number -def is_grib_path(path: str) -> bool: - _, ext = os.path.splitext(path) - return ext in [".grib", ".grb", ".grib2", ".grb2"] - - def is_uniform_spaced(arr, **kwargs) -> bool: """Return True if values of an array are uniformly spaced and sorted. diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py index d19f5aab585..340495d4564 100644 --- a/xarray/tests/test_backends_api.py +++ b/xarray/tests/test_backends_api.py @@ -1,5 +1,3 @@ -import pytest - from xarray.backends.api import _get_default_engine from . import requires_netCDF4, requires_scipy @@ -14,8 +12,5 @@ def test__get_default_engine(): engine_gz = _get_default_engine("/example.gz") assert engine_gz == "scipy" - with pytest.raises(ValueError): - _get_default_engine("/example.grib") - engine_default = _get_default_engine("/example") assert engine_default == "netcdf4" diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index 193c45f01cd..9d278a6cfb6 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -233,15 +233,6 @@ def test_is_remote_uri(): assert not utils.is_remote_uri("example.nc") -def test_is_grib_path(): - assert not utils.is_grib_path("example.nc") - assert not utils.is_grib_path("example.grib ") - assert utils.is_grib_path("example.grib") - assert utils.is_grib_path("example.grib2") - assert utils.is_grib_path("example.grb") - assert utils.is_grib_path("example.grb2") - - class Test_is_uniform_and_sorted: def test_sorted_uniform(self): assert utils.is_uniform_spaced(np.arange(5))