From 6e4d66734f63fb60b13ba25d2a7da33fbfab2b4b Mon Sep 17 00:00:00 2001 From: Aureliana Barghini <35919497+aurghs@users.noreply.github.com> Date: Fri, 12 Feb 2021 18:48:24 +0100 Subject: [PATCH 01/46] Sort backends (#4886) * style * order the plugins and tests * style * fix post merge * fix plugin_test * capitalize global variable --- xarray/backends/plugins.py | 54 ++++++++++++++++++++++-------------- xarray/tests/test_plugins.py | 34 +++++++++++++++++++++-- 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/xarray/backends/plugins.py b/xarray/backends/plugins.py index b8cd2bf6378..88c29306d18 100644 --- a/xarray/backends/plugins.py +++ b/xarray/backends/plugins.py @@ -8,19 +8,19 @@ from .common import BACKEND_ENTRYPOINTS +STANDARD_BACKENDS_ORDER = ["netcdf4", "h5netcdf", "scipy"] -def remove_duplicates(backend_entrypoints): + +def remove_duplicates(pkg_entrypoints): # sort and group entrypoints by name - backend_entrypoints = sorted(backend_entrypoints, key=lambda ep: ep.name) - backend_entrypoints_grouped = itertools.groupby( - backend_entrypoints, key=lambda ep: ep.name - ) + pkg_entrypoints = sorted(pkg_entrypoints, key=lambda ep: ep.name) + pkg_entrypoints_grouped = itertools.groupby(pkg_entrypoints, key=lambda ep: ep.name) # check if there are multiple entrypoints for the same name - unique_backend_entrypoints = [] - for name, matches in backend_entrypoints_grouped: + unique_pkg_entrypoints = [] + for name, matches in pkg_entrypoints_grouped: matches = list(matches) - unique_backend_entrypoints.append(matches[0]) + unique_pkg_entrypoints.append(matches[0]) matches_len = len(matches) if matches_len > 1: selected_module_name = matches[0].module_name @@ -30,7 +30,7 @@ def remove_duplicates(backend_entrypoints): f"\n {all_module_names}.\n It will be used: {selected_module_name}.", RuntimeWarning, ) - return unique_backend_entrypoints + return unique_pkg_entrypoints def detect_parameters(open_dataset): @@ -51,13 +51,13 @@ def detect_parameters(open_dataset): return tuple(parameters_list) -def create_engines_dict(backend_entrypoints): - engines = {} - for backend_ep in backend_entrypoints: - name = backend_ep.name - backend = backend_ep.load() - engines[name] = backend - return engines +def backends_dict_from_pkg(pkg_entrypoints): + backend_entrypoints = {} + for pkg_ep in pkg_entrypoints: + name = pkg_ep.name + backend = pkg_ep.load() + backend_entrypoints[name] = backend + return backend_entrypoints def set_missing_parameters(backend_entrypoints): @@ -67,11 +67,23 @@ def set_missing_parameters(backend_entrypoints): backend.open_dataset_parameters = detect_parameters(open_dataset) -def build_engines(entrypoints): +def sort_backends(backend_entrypoints): + ordered_backends_entrypoints = {} + for be_name in STANDARD_BACKENDS_ORDER: + if backend_entrypoints.get(be_name, None) is not None: + ordered_backends_entrypoints[be_name] = backend_entrypoints.pop(be_name) + ordered_backends_entrypoints.update( + {name: backend_entrypoints[name] for name in sorted(backend_entrypoints)} + ) + return ordered_backends_entrypoints + + +def build_engines(pkg_entrypoints): backend_entrypoints = BACKEND_ENTRYPOINTS.copy() - pkg_entrypoints = remove_duplicates(entrypoints) - external_backend_entrypoints = create_engines_dict(pkg_entrypoints) + pkg_entrypoints = remove_duplicates(pkg_entrypoints) + external_backend_entrypoints = backends_dict_from_pkg(pkg_entrypoints) backend_entrypoints.update(external_backend_entrypoints) + backend_entrypoints = sort_backends(backend_entrypoints) 
set_missing_parameters(backend_entrypoints) engines = {} for name, backend in backend_entrypoints.items(): @@ -81,8 +93,8 @@ def build_engines(entrypoints): @functools.lru_cache(maxsize=1) def list_engines(): - entrypoints = pkg_resources.iter_entry_points("xarray.backends") - return build_engines(entrypoints) + pkg_entrypoints = pkg_resources.iter_entry_points("xarray.backends") + return build_engines(pkg_entrypoints) def guess_engine(store_spec): diff --git a/xarray/tests/test_plugins.py b/xarray/tests/test_plugins.py index 64a1c563dba..0cda2901cee 100644 --- a/xarray/tests/test_plugins.py +++ b/xarray/tests/test_plugins.py @@ -58,13 +58,13 @@ def test_remove_duplicates_warnings(dummy_duplicated_entrypoints): @mock.patch("pkg_resources.EntryPoint.load", mock.MagicMock(return_value=None)) -def test_create_engines_dict(): +def test_backends_dict_from_pkg(): specs = [ "engine1 = xarray.tests.test_plugins:backend_1", "engine2 = xarray.tests.test_plugins:backend_2", ] entrypoints = [pkg_resources.EntryPoint.parse(spec) for spec in specs] - engines = plugins.create_engines_dict(entrypoints) + engines = plugins.backends_dict_from_pkg(entrypoints) assert len(engines) == 2 assert engines.keys() == set(("engine1", "engine2")) @@ -111,8 +111,38 @@ def test_build_engines(): "cfgrib = xarray.tests.test_plugins:backend_1" ) backend_entrypoints = plugins.build_engines([dummy_pkg_entrypoint]) + assert isinstance(backend_entrypoints["cfgrib"], DummyBackendEntrypoint1) assert backend_entrypoints["cfgrib"].open_dataset_parameters == ( "filename_or_obj", "decoder", ) + + +@mock.patch( + "pkg_resources.EntryPoint.load", + mock.MagicMock(return_value=DummyBackendEntrypoint1), +) +def test_build_engines_sorted(): + dummy_pkg_entrypoints = [ + pkg_resources.EntryPoint.parse( + "dummy2 = xarray.tests.test_plugins:backend_1", + ), + pkg_resources.EntryPoint.parse( + "dummy1 = xarray.tests.test_plugins:backend_1", + ), + ] + backend_entrypoints = plugins.build_engines(dummy_pkg_entrypoints) + backend_entrypoints = list(backend_entrypoints) + + indices = [] + for be in plugins.STANDARD_BACKENDS_ORDER: + try: + index = backend_entrypoints.index(be) + backend_entrypoints.pop(index) + indices.append(index) + except ValueError: + pass + + assert set(indices) < {0, -1} + assert list(backend_entrypoints) == sorted(backend_entrypoints) From 971bad7c21551f297d3e1e24e3779488ea1b9565 Mon Sep 17 00:00:00 2001 From: keewis Date: Sat, 13 Feb 2021 00:24:17 +0100 Subject: [PATCH 02/46] hide the decorator from the test traceback (#4900) --- xarray/testing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/testing.py b/xarray/testing.py index e8b5f04ef85..2129b1e1aa4 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -27,6 +27,8 @@ def ensure_warnings(func): # -> make sure that does not happen in the assert_* functions @functools.wraps(func) def wrapper(*args, **kwargs): + __tracebackhide__ = True + with warnings.catch_warnings(): warnings.simplefilter("always") From 6191dde8bdc5e709276b33e59b0cbd6184f6d465 Mon Sep 17 00:00:00 2001 From: RichardScottOZ <72196131+RichardScottOZ@users.noreply.github.com> Date: Sun, 14 Feb 2021 00:19:44 +1030 Subject: [PATCH 03/46] Update area_weighted_temperature.ipynb (#4903) grid cell consistency --- doc/examples/area_weighted_temperature.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/examples/area_weighted_temperature.ipynb b/doc/examples/area_weighted_temperature.ipynb index de705966583..7299b50b1b3 100644 --- 
a/doc/examples/area_weighted_temperature.ipynb +++ b/doc/examples/area_weighted_temperature.ipynb @@ -20,7 +20,7 @@ "Author: [Mathias Hauser](https://github.com/mathause/)\n", "\n", "\n", - "We use the `air_temperature` example dataset to calculate the area-weighted temperature over its domain. This dataset has a regular latitude/ longitude grid, thus the gridcell area decreases towards the pole. For this grid we can use the cosine of the latitude as proxy for the grid cell area.\n" + "We use the `air_temperature` example dataset to calculate the area-weighted temperature over its domain. This dataset has a regular latitude/ longitude grid, thus the grid cell area decreases towards the pole. For this grid we can use the cosine of the latitude as proxy for the grid cell area.\n" ] }, { From f3f0a14b2e5e3de3d5f1e7515ee8f9b751ca8fd2 Mon Sep 17 00:00:00 2001 From: keewis Date: Sun, 14 Feb 2021 00:02:56 +0100 Subject: [PATCH 04/46] replace the ci-trigger action with a external one (#4905) * use a external action instead * remove the custom action * update the action name [skip-ci] * also update the action name in ci-additional [skip-ci] * update the last mention of ci-trigger [skip-ci] --- .github/actions/detect-ci-trigger/action.yaml | 29 ------------ .github/actions/detect-ci-trigger/script.sh | 47 ------------------- .github/workflows/ci-additional.yaml | 2 +- .github/workflows/ci.yaml | 2 +- .github/workflows/upstream-dev-ci.yaml | 2 +- 5 files changed, 3 insertions(+), 79 deletions(-) delete mode 100644 .github/actions/detect-ci-trigger/action.yaml delete mode 100644 .github/actions/detect-ci-trigger/script.sh diff --git a/.github/actions/detect-ci-trigger/action.yaml b/.github/actions/detect-ci-trigger/action.yaml deleted file mode 100644 index c255d0c57cc..00000000000 --- a/.github/actions/detect-ci-trigger/action.yaml +++ /dev/null @@ -1,29 +0,0 @@ -name: Detect CI Trigger -description: | - Detect a keyword used to control the CI in the subject line of a commit message. -inputs: - keyword: - description: | - The keyword to detect. - required: true -outputs: - trigger-found: - description: | - true if the keyword has been found in the subject line of the commit message - value: ${{ steps.detect-trigger.outputs.CI_TRIGGERED }} -runs: - using: "composite" - steps: - - name: detect trigger - id: detect-trigger - run: | - bash $GITHUB_ACTION_PATH/script.sh ${{ github.event_name }} ${{ inputs.keyword }} - shell: bash - - name: show detection result - run: | - echo "::group::final summary" - echo "commit message: ${{ steps.detect-trigger.outputs.COMMIT_MESSAGE }}" - echo "trigger keyword: ${{ inputs.keyword }}" - echo "trigger found: ${{ steps.detect-trigger.outputs.CI_TRIGGERED }}" - echo "::endgroup::" - shell: bash diff --git a/.github/actions/detect-ci-trigger/script.sh b/.github/actions/detect-ci-trigger/script.sh deleted file mode 100644 index c98175a5a08..00000000000 --- a/.github/actions/detect-ci-trigger/script.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash -event_name="$1" -keyword="$2" - -echo "::group::fetch a sufficient number of commits" -echo "skipped" -# git log -n 5 2>&1 -# if [[ "$event_name" == "pull_request" ]]; then -# ref=$(git log -1 --format='%H') -# git -c protocol.version=2 fetch --deepen=2 --no-tags --prune --progress -q origin $ref 2>&1 -# git log FETCH_HEAD -# git checkout FETCH_HEAD -# else -# echo "nothing to do." 
-# fi -# git log -n 5 2>&1 -echo "::endgroup::" - -echo "::group::extracting the commit message" -echo "event name: $event_name" -if [[ "$event_name" == "pull_request" ]]; then - ref="HEAD^2" -else - ref="HEAD" -fi - -commit_message="$(git log -n 1 --pretty=format:%s "$ref")" - -if [[ $(echo $commit_message | wc -l) -le 1 ]]; then - echo "commit message: '$commit_message'" -else - echo -e "commit message:\n--- start ---\n$commit_message\n--- end ---" -fi -echo "::endgroup::" - -echo "::group::scanning for the keyword" -echo "searching for: '$keyword'" -if echo "$commit_message" | grep -qF "$keyword"; then - result="true" -else - result="false" -fi -echo "keyword detected: $result" -echo "::endgroup::" - -echo "::set-output name=COMMIT_MESSAGE::$commit_message" -echo "::set-output name=CI_TRIGGERED::$result" diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 3579e18dbff..92c7226f81d 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -19,7 +19,7 @@ jobs: - uses: actions/checkout@v2 with: fetch-depth: 2 - - uses: ./.github/actions/detect-ci-trigger + - uses: xarray-contrib/ci-trigger@v1 id: detect-trigger with: keyword: "[skip-ci]" diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7d7326eb5c2..e8fd881e707 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -19,7 +19,7 @@ jobs: - uses: actions/checkout@v2 with: fetch-depth: 2 - - uses: ./.github/actions/detect-ci-trigger + - uses: xarray-contrib/ci-trigger@v1 id: detect-trigger with: keyword: "[skip-ci]" diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index dda762878c5..bba7c04a9c2 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -21,7 +21,7 @@ jobs: - uses: actions/checkout@v2 with: fetch-depth: 2 - - uses: ./.github/actions/detect-ci-trigger + - uses: xarray-contrib/ci-trigger@v1 id: detect-trigger with: keyword: "[test-upstream]" From aa5d9188a23e0a9c224bcb83a98b1ef4d0470675 Mon Sep 17 00:00:00 2001 From: keewis Date: Mon, 15 Feb 2021 01:20:43 +0100 Subject: [PATCH 05/46] pre-commit autoupdate CI (#4906) --- .../workflows/ci-pre-commit-autoupdate.yaml | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 .github/workflows/ci-pre-commit-autoupdate.yaml diff --git a/.github/workflows/ci-pre-commit-autoupdate.yaml b/.github/workflows/ci-pre-commit-autoupdate.yaml new file mode 100644 index 00000000000..784fd05bcb4 --- /dev/null +++ b/.github/workflows/ci-pre-commit-autoupdate.yaml @@ -0,0 +1,41 @@ +name: "pre-commit autoupdate CI" + +on: + schedule: + - cron: "0 0 * * 0" # every Sunday at 00:00 UTC + workflow_dispatch: + + +jobs: + autoupdate: + name: 'pre-commit autoupdate' + runs-on: ubuntu-latest + if: github.repository == 'pydata/xarray' + steps: + - name: checkout + uses: actions/checkout@v2 + - name: Cache pip and pre-commit + uses: actions/cache@v2 + with: + path: | + ~/.cache/pre-commit + ~/.cache/pip + key: ${{ runner.os }}-pre-commit-autoupdate + - name: setup python + uses: actions/setup-python@v2 + - name: upgrade pip + run: python -m pip install --upgrade pip + - name: install pre-commit + run: python -m pip install --upgrade pre-commit + - name: version info + run: python -m pip list + - name: autoupdate + uses: technote-space/create-pr-action@837dbe469b39f08d416889369a52e2a993625c84 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + EXECUTE_COMMANDS: | + python -m pre_commit 
autoupdate + COMMIT_MESSAGE: 'pre-commit: autoupdate hook versions' + PR_TITLE: 'pre-commit: autoupdate hook versions' + PR_BRANCH_PREFIX: 'pre-commit/' + PR_BRANCH_NAME: 'autoupdate-${PR_ID}' From 735a3590ea4df70e1e5be729162df2f8774b3879 Mon Sep 17 00:00:00 2001 From: RichardScottOZ <72196131+RichardScottOZ@users.noreply.github.com> Date: Mon, 15 Feb 2021 16:09:01 +1030 Subject: [PATCH 06/46] Fix typos in example notebooks (#4908) --- doc/examples/ERA5-GRIB-example.ipynb | 2 +- doc/examples/ROMS_ocean_model.ipynb | 2 +- doc/examples/monthly-means.ipynb | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/examples/ERA5-GRIB-example.ipynb b/doc/examples/ERA5-GRIB-example.ipynb index b82a07a64e6..1c6be5f6634 100644 --- a/doc/examples/ERA5-GRIB-example.ipynb +++ b/doc/examples/ERA5-GRIB-example.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "GRIB format is commonly used to disemminate atmospheric model data. With Xarray and the cfgrib engine, GRIB data can easily be analyzed and visualized." + "GRIB format is commonly used to disseminate atmospheric model data. With Xarray and the cfgrib engine, GRIB data can easily be analyzed and visualized." ] }, { diff --git a/doc/examples/ROMS_ocean_model.ipynb b/doc/examples/ROMS_ocean_model.ipynb index 74536bbe28f..b699c4d5ba9 100644 --- a/doc/examples/ROMS_ocean_model.ipynb +++ b/doc/examples/ROMS_ocean_model.ipynb @@ -120,7 +120,7 @@ "source": [ "### A naive vertical slice\n", "\n", - "Create a slice using the s-coordinate as the vertical dimension is typically not very informative." + "Creating a slice using the s-coordinate as the vertical dimension is typically not very informative." ] }, { diff --git a/doc/examples/monthly-means.ipynb b/doc/examples/monthly-means.ipynb index bc88f4a9fc9..3490fc9a4fe 100644 --- a/doc/examples/monthly-means.ipynb +++ b/doc/examples/monthly-means.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Calculating Seasonal Averages from Timeseries of Monthly Means \n", + "Calculating Seasonal Averages from Time Series of Monthly Means \n", "=====\n", "\n", "Author: [Joe Hamman](https://github.com/jhamman/)\n", @@ -60,10 +60,10 @@ "source": [ "#### Now for the heavy lifting:\n", "We first have to come up with the weights,\n", - "- calculate the month lengths for each monthly data record\n", + "- calculate the month length for each monthly data record\n", "- calculate weights using `groupby('time.season')`\n", "\n", - "Finally, we just need to multiply our weights by the `Dataset` and sum allong the time dimension. Creating a `DataArray` for the month length is as easy as using the `days_in_month` accessor on the time coordinate. The calendar type, in this case `'noleap'`, is automatically considered in this operation." + "Finally, we just need to multiply our weights by the `Dataset` and sum along the time dimension. Creating a `DataArray` for the month length is as easy as using the `days_in_month` accessor on the time coordinate. The calendar type, in this case `'noleap'`, is automatically considered in this operation." 
] }, { From 8bf415a15cc17995df5afa9af27409d903d48dcc Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 16 Feb 2021 16:18:05 -0500 Subject: [PATCH 07/46] Allow fsspec URLs in open_(mf)dataset (#4823) Co-authored-by: keewis Co-authored-by: Ray Bell --- ci/requirements/environment.yml | 1 + ci/requirements/py38-all-but-dask.yml | 2 ++ doc/io.rst | 35 +++++++++++++++--- doc/whats-new.rst | 5 +++ setup.cfg | 2 ++ xarray/backends/api.py | 27 ++++++++++++-- xarray/backends/zarr.py | 16 ++++++++- xarray/core/utils.py | 7 +++- xarray/tests/__init__.py | 1 + xarray/tests/test_backends.py | 52 ++++++++++++++++++++++++++- 10 files changed, 138 insertions(+), 10 deletions(-) diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 0f59d9570c8..1bbe349ab21 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -3,6 +3,7 @@ channels: - conda-forge - nodefaults dependencies: + - aiobotocore - boto3 - bottleneck - cartopy diff --git a/ci/requirements/py38-all-but-dask.yml b/ci/requirements/py38-all-but-dask.yml index 51ec48cc6b1..07dc6344a25 100644 --- a/ci/requirements/py38-all-but-dask.yml +++ b/ci/requirements/py38-all-but-dask.yml @@ -4,6 +4,8 @@ channels: - nodefaults dependencies: - python=3.8 + - black + - aiobotocore - boto3 - bottleneck - cartopy diff --git a/doc/io.rst b/doc/io.rst index 2e46879929b..b97f1f5a699 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -890,17 +890,44 @@ Cloud Storage Buckets It is possible to read and write xarray datasets directly from / to cloud storage buckets using zarr. This example uses the `gcsfs`_ package to provide -a ``MutableMapping`` interface to `Google Cloud Storage`_, which we can then -pass to xarray:: +an interface to `Google Cloud Storage`_. + +From v0.16.2: general `fsspec`_ URLs are parsed and the store set up for you +automatically when reading, such that you can open a dataset in a single +call. You should include any arguments to the storage backend as the +key ``storage_options``, part of ``backend_kwargs``. + +.. code:: python + + ds_gcs = xr.open_dataset( + "gcs:///path.zarr", + backend_kwargs={ + "storage_options": {"project": "", "token": None} + }, + engine="zarr", + ) + + +This also works with ``open_mfdataset``, allowing you to pass a list of paths or +a URL to be interpreted as a glob string. + +For older versions, and for writing, you must explicitly set up a ``MutableMapping`` +instance and pass this, as follows: + +.. code:: python import gcsfs - fs = gcsfs.GCSFileSystem(project='', token=None) - gcsmap = gcsfs.mapping.GCSMap('', gcs=fs, check=True, create=False) + + fs = gcsfs.GCSFileSystem(project="", token=None) + gcsmap = gcsfs.mapping.GCSMap("", gcs=fs, check=True, create=False) # write to the bucket ds.to_zarr(store=gcsmap) # read it back ds_gcs = xr.open_zarr(gcsmap) +(or use the utility function ``fsspec.get_mapper()``). + +.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ .. _Zarr: http://zarr.readthedocs.io/ .. _Amazon S3: https://aws.amazon.com/s3/ .. _Google Cloud Storage: https://cloud.google.com/storage/ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4b06003b630..ef4abb15129 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -76,6 +76,11 @@ New Features in the form of kwargs as well as a dict, like most similar methods. By `Maximilian Roos `_. 
+- :py:func:`open_dataset` and :py:func:`open_mfdataset` now accept ``fsspec`` URLs + (including globs for the latter) for ``engine="zarr"``, and so allow reading from + many remote and other file systems (:pull:`4461`) + By `Martin Durant `_ + Bug fixes ~~~~~~~~~ - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` do not trigger computations anymore if :py:meth:`Dataset.weighted` or :py:meth:`DataArray.weighted` are applied (:issue:`4625`, :pull:`4668`). By `Julius Busecke `_. diff --git a/setup.cfg b/setup.cfg index 72d28d3ca6f..231865d7788 100644 --- a/setup.cfg +++ b/setup.cfg @@ -185,6 +185,8 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-distributed.*] ignore_missing_imports = True +[mypy-fsspec.*] +ignore_missing_imports = True [mypy-h5netcdf.*] ignore_missing_imports = True [mypy-h5py.*] diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 81314588784..0791d1cdaf1 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -643,7 +643,9 @@ def open_dataarray( backend_kwargs: dict, optional A dictionary of keyword arguments to pass on to the backend. This may be useful when backend options would improve performance or - allow user control of dataset processing. + allow user control of dataset processing. If using fsspec URLs, + include the key "storage_options" to pass arguments to the + storage layer. use_cftime: bool, optional Only relevant if encoded dates come from a standard calendar (e.g. "gregorian", "proleptic_gregorian", "standard", or not @@ -869,14 +871,33 @@ def open_mfdataset( .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance """ if isinstance(paths, str): - if is_remote_uri(paths): + if is_remote_uri(paths) and engine == "zarr": + try: + from fsspec.core import get_fs_token_paths + except ImportError as e: + raise ImportError( + "The use of remote URLs for opening zarr requires the package fsspec" + ) from e + + fs, _, _ = get_fs_token_paths( + paths, + mode="rb", + storage_options=kwargs.get("backend_kwargs", {}).get( + "storage_options", {} + ), + expand=False, + ) + paths = fs.glob(fs._strip_protocol(paths)) # finds directories + paths = [fs.get_mapper(path) for path in paths] + elif is_remote_uri(paths): raise ValueError( "cannot do wild-card matching for paths that are remote URLs: " "{!r}. 
Instead, supply paths as an explicit list of strings.".format( paths ) ) - paths = sorted(glob(_normalize_path(paths))) + else: + paths = sorted(glob(_normalize_path(paths))) else: paths = [str(p) if isinstance(p, Path) else p for p in paths] diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 04fdeac6450..074572169ce 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,5 +1,6 @@ import os import pathlib +from distutils.version import LooseVersion import numpy as np @@ -295,6 +296,7 @@ def open_group( consolidated=False, consolidate_on_close=False, chunk_store=None, + storage_options=None, append_dim=None, write_region=None, ): @@ -303,7 +305,15 @@ def open_group( if isinstance(store, pathlib.Path): store = os.fspath(store) - open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group) + open_kwargs = dict( + mode=mode, + synchronizer=synchronizer, + path=group, + ) + if LooseVersion(zarr.__version__) >= "2.5.0": + open_kwargs["storage_options"] = storage_options + elif storage_options: + raise ValueError("Storage options only compatible with zarr>=2.5.0") if chunk_store: open_kwargs["chunk_store"] = chunk_store @@ -537,6 +547,7 @@ def open_zarr( consolidated=False, overwrite_encoded_chunks=False, chunk_store=None, + storage_options=None, decode_timedelta=None, use_cftime=None, **kwargs, @@ -649,6 +660,7 @@ def open_zarr( "consolidated": consolidated, "overwrite_encoded_chunks": overwrite_encoded_chunks, "chunk_store": chunk_store, + "storage_options": storage_options, } ds = open_dataset( @@ -687,6 +699,7 @@ def open_dataset( consolidated=False, consolidate_on_close=False, chunk_store=None, + storage_options=None, ): store = ZarrStore.open_group( filename_or_obj, @@ -696,6 +709,7 @@ def open_dataset( consolidated=consolidated, consolidate_on_close=consolidate_on_close, chunk_store=chunk_store, + storage_options=storage_options, ) store_entrypoint = StoreBackendEntrypoint() diff --git a/xarray/core/utils.py b/xarray/core/utils.py index ced688f32dd..9648458ec6d 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -645,7 +645,12 @@ def close_on_error(f): def is_remote_uri(path: str) -> bool: - return bool(re.search(r"^https?\://", path)) + """Finds URLs of the form protocol:// or protocol:: + + This also matches for http[s]://, which were the only remote URLs + supported in <=v0.16.2. 
+ """ + return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path)) def read_magic_number(filename_or_obj, count=8): diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index a7761aefa3d..4b47e1d2c7e 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -74,6 +74,7 @@ def LooseVersion(vstring): has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis") has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") +has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_cfgrib, requires_cfgrib = _importorskip("cfgrib") has_numbagg, requires_numbagg = _importorskip("numbagg") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3750c0715ae..75e0edb4fb2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -54,6 +54,7 @@ requires_cfgrib, requires_cftime, requires_dask, + requires_fsspec, requires_h5netcdf, requires_netCDF4, requires_pseudonetcdf, @@ -3040,10 +3041,17 @@ def test_open_mfdataset(self): with raises_regex(IOError, "no files to open"): open_mfdataset("foo-bar-baz-*.nc") - with raises_regex(ValueError, "wild-card"): open_mfdataset("http://some/remote/uri") + @requires_fsspec + def test_open_mfdataset_no_files(self): + pytest.importorskip("aiobotocore") + + # glob is attempted as of #4823, but finds no files + with raises_regex(OSError, "no files"): + open_mfdataset("http://some/remote/uri", engine="zarr") + def test_open_mfdataset_2d(self): original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))}) with create_tmp_file() as tmp1: @@ -4799,6 +4807,48 @@ def test_extract_zarr_variable_encoding(): ) +@requires_zarr +@requires_fsspec +def test_open_fsspec(): + import fsspec + import zarr + + if not hasattr(zarr.storage, "FSStore") or not hasattr( + zarr.storage.FSStore, "getitems" + ): + pytest.skip("zarr too old") + + ds = open_dataset(os.path.join(os.path.dirname(__file__), "data", "example_1.nc")) + + m = fsspec.filesystem("memory") + mm = m.get_mapper("out1.zarr") + ds.to_zarr(mm) # old interface + ds0 = ds.copy() + ds0["time"] = ds.time + pd.to_timedelta("1 day") + mm = m.get_mapper("out2.zarr") + ds0.to_zarr(mm) # old interface + + # single dataset + url = "memory://out2.zarr" + ds2 = open_dataset(url, engine="zarr") + assert ds0 == ds2 + + # single dataset with caching + url = "simplecache::memory://out2.zarr" + ds2 = open_dataset(url, engine="zarr") + assert ds0 == ds2 + + # multi dataset + url = "memory://out*.zarr" + ds2 = open_mfdataset(url, engine="zarr") + assert xr.concat([ds, ds0], dim="time") == ds2 + + # multi dataset with caching + url = "simplecache::memory://out*.zarr" + ds2 = open_mfdataset(url, engine="zarr") + assert xr.concat([ds, ds0], dim="time") == ds2 + + @requires_h5netcdf def test_load_single_value_h5netcdf(tmp_path): """Test that numeric single-element vector attributes are handled fine. 
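The xarray/core/utils.py hunk above widens is_remote_uri() from matching only http[s]:// to any protocol:// or protocol:: prefix, which is what lets chained fsspec URLs such as simplecache::memory://out2.zarr reach the zarr backend. A minimal, self-contained sketch of the new matching behaviour (the looks_remote helper name is an illustrative stand-in; only the regular expression is taken from the patch):

    import re

    def looks_remote(path: str) -> bool:
        # Same pattern as the updated xarray.core.utils.is_remote_uri in this patch.
        return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path))

    for p in [
        "http://example.com/data.nc",      # remote before and after this change
        "gcs://bucket/store.zarr",         # now treated as remote
        "simplecache::memory://out.zarr",  # chained fsspec URL, now treated as remote
        "/local/path/file.nc",             # still local
    ]:
        print(p, looks_remote(p))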
From 2ab0666c1fcc493b1e0ebc7db14500c427f8804e Mon Sep 17 00:00:00 2001 From: Eric Keenan Date: Tue, 16 Feb 2021 16:37:29 -0700 Subject: [PATCH 08/46] Adding vectorized indexing docs (#4711) Co-authored-by: Mathias Hauser Co-authored-by: Mathias Hauser --- doc/indexing.rst | 16 ++++++++++++++ xarray/core/dataarray.py | 48 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/doc/indexing.rst b/doc/indexing.rst index 78766b8fd81..14af176c428 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -395,6 +395,22 @@ These methods may also be applied to ``Dataset`` objects ds = da.to_dataset(name="bar") ds.isel(x=xr.DataArray([0, 1, 2], dims=["points"])) +Vectorized indexing may be used to extract information from the nearest +grid cells of interest, for example, the nearest climate model grid cells +to a collection specified weather station latitudes and longitudes. + +.. ipython:: python + + ds = xr.tutorial.open_dataset("air_temperature") + + # Define target latitude and longitude (where weather stations might be) + target_lon = xr.DataArray([200, 201, 202, 205], dims="points") + target_lat = xr.DataArray([31, 41, 42, 42], dims="points") + + # Retrieve data at the grid cells nearest to the target latitudes and longitudes + da = ds["air"].sel(lon=target_lon, lat=target_lat, method="nearest") + da + .. tip:: If you are lazily loading your data from disk, not every form of vectorized diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 7ca5ff50eba..34354da61e2 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1094,6 +1094,26 @@ def isel( -------- Dataset.isel DataArray.sel + + Examples + -------- + >>> da = xr.DataArray(np.arange(25).reshape(5, 5), dims=("x", "y")) + >>> da + + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24]]) + Dimensions without coordinates: x, y + + >>> tgt_x = xr.DataArray(np.arange(0, 5), dims="points") + >>> tgt_y = xr.DataArray(np.arange(0, 5), dims="points") + >>> da = da.isel(x=tgt_x, y=tgt_y) + >>> da + + array([ 0, 6, 12, 18, 24]) + Dimensions without coordinates: points """ indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "isel") @@ -1202,6 +1222,34 @@ def sel( Dataset.sel DataArray.isel + Examples + -------- + >>> da = xr.DataArray( + ... np.arange(25).reshape(5, 5), + ... coords={"x": np.arange(5), "y": np.arange(5)}, + ... dims=("x", "y"), + ... 
) + >>> da + + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24]]) + Coordinates: + * x (x) int64 0 1 2 3 4 + * y (y) int64 0 1 2 3 4 + + >>> tgt_x = xr.DataArray(np.linspace(0, 4, num=5), dims="points") + >>> tgt_y = xr.DataArray(np.linspace(0, 4, num=5), dims="points") + >>> da = da.sel(x=tgt_x, y=tgt_y, method="nearest") + >>> da + + array([ 0, 6, 12, 18, 24]) + Coordinates: + x (points) int64 0 1 2 3 4 + y (points) int64 0 1 2 3 4 + Dimensions without coordinates: points """ ds = self._to_temp_dataset().sel( indexers=indexers, From cdf7761b80bf509660bbf0fe3392021aa2b5ea20 Mon Sep 17 00:00:00 2001 From: Jody Klymak Date: Wed, 17 Feb 2021 00:34:01 -0800 Subject: [PATCH 09/46] Update matplotlib's canonical (#4919) Please see: https://discourse.matplotlib.org/t/canonical-documentation-have-moved/21863 --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 14b28b4e471..def4bb0b229 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -415,7 +415,7 @@ "numpy": ("https://numpy.org/doc/stable", None), "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), "numba": ("https://numba.pydata.org/numba-doc/latest", None), - "matplotlib": ("https://matplotlib.org", None), + "matplotlib": ("https://matplotlib.org/stable/", None), "dask": ("https://docs.dask.org/en/latest", None), "cftime": ("https://unidata.github.io/cftime", None), "rasterio": ("https://rasterio.readthedocs.io/en/latest", None), From a8ed7edc4983d3e5441f7b223d2a66ca2eb2f137 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 17 Feb 2021 09:41:20 +0100 Subject: [PATCH 10/46] FIX: h5py>=3 string decoding (#4893) * FIX: set `decode_strings=True` for h5netcdf backend, convert object string to byte string if necessary, unpin h5py * Update strings.py * Update h5netcdf_.py * fix style * FIX:change decode_strings -> decode_vlen_strings, add whats-new.rst entry * FIX: change missed decode_strings -> decode_vlen_strings * FIX: set `decode_vlen_strings=True` in `open` classmethod, call remaining tests with `decode_vlen_strings=True` * FIX: cover tests for h5py=2 --- ci/requirements/environment-windows.yml | 2 +- ci/requirements/environment.yml | 2 +- ci/requirements/py38-all-but-dask.yml | 2 +- doc/whats-new.rst | 2 ++ xarray/backends/h5netcdf_.py | 7 +++++++ xarray/tests/test_backends.py | 18 +++++++++++++++--- 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/ci/requirements/environment-windows.yml b/ci/requirements/environment-windows.yml index 6de2bc8dc64..fc32d35837b 100644 --- a/ci/requirements/environment-windows.yml +++ b/ci/requirements/environment-windows.yml @@ -11,7 +11,7 @@ dependencies: - dask - distributed - h5netcdf - - h5py=2 + - h5py - hdf5 - hypothesis - iris diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 1bbe349ab21..36147c64c03 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -13,7 +13,7 @@ dependencies: - dask - distributed - h5netcdf - - h5py=2 + - h5py - hdf5 - hypothesis - iris diff --git a/ci/requirements/py38-all-but-dask.yml b/ci/requirements/py38-all-but-dask.yml index 07dc6344a25..3f82990f3b5 100644 --- a/ci/requirements/py38-all-but-dask.yml +++ b/ci/requirements/py38-all-but-dask.yml @@ -14,7 +14,7 @@ dependencies: - cftime - coveralls - h5netcdf - - h5py=2 + - h5py - hdf5 - hypothesis - lxml # Optional dep of pydap diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 
ef4abb15129..1bca3aec68e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -118,6 +118,8 @@ Bug fixes By `Leif Denby `_. - Fix time encoding bug associated with using cftime versions greater than 1.4.0 with xarray (:issue:`4870`, :pull:`4871`). By `Spencer Clark `_. +- Fix decoding of vlen strings using h5py versions greater than 3.0.0 with h5netcdf backend (:issue:`4570`, :pull:`4893`). + By `Kai Mühlbauer `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index aa892c4f89c..5766b34d9bd 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -131,6 +131,7 @@ def open( autoclose=False, invalid_netcdf=None, phony_dims=None, + decode_vlen_strings=True, ): if isinstance(filename, bytes): @@ -157,6 +158,10 @@ def open( "h5netcdf backend keyword argument 'phony_dims' needs " "h5netcdf >= 0.8.0." ) + if LooseVersion(h5netcdf.__version__) >= LooseVersion( + "0.10.0" + ) and LooseVersion(h5netcdf.core.h5py.__version__) >= LooseVersion("3.0.0"): + kwargs["decode_vlen_strings"] = decode_vlen_strings if lock is None: if mode == "r": @@ -358,6 +363,7 @@ def open_dataset( lock=None, invalid_netcdf=None, phony_dims=None, + decode_vlen_strings=True, ): store = H5NetCDFStore.open( @@ -367,6 +373,7 @@ def open_dataset( lock=lock, invalid_netcdf=invalid_netcdf, phony_dims=phony_dims, + decode_vlen_strings=decode_vlen_strings, ) store_entrypoint = StoreBackendEntrypoint() diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 75e0edb4fb2..aefb91478cb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2579,13 +2579,19 @@ def test_open_dataset_group(self): v = group.createVariable("x", "int") v[...] = 42 - h5 = h5netcdf.File(tmp_file, mode="r") + kwargs = {} + if LooseVersion(h5netcdf.__version__) >= LooseVersion( + "0.10.0" + ) and LooseVersion(h5netcdf.core.h5py.__version__) >= LooseVersion("3.0.0"): + kwargs = dict(decode_vlen_strings=True) + + h5 = h5netcdf.File(tmp_file, mode="r", **kwargs) store = backends.H5NetCDFStore(h5["g"]) with open_dataset(store) as ds: expected = Dataset({"x": ((), 42)}) assert_identical(expected, ds) - h5 = h5netcdf.File(tmp_file, mode="r") + h5 = h5netcdf.File(tmp_file, mode="r", **kwargs) store = backends.H5NetCDFStore(h5, group="g") with open_dataset(store) as ds: expected = Dataset({"x": ((), 42)}) @@ -2600,7 +2606,13 @@ def test_deepcopy(self): v = nc.createVariable("y", np.int32, ("x",)) v[:] = np.arange(10) - h5 = h5netcdf.File(tmp_file, mode="r") + kwargs = {} + if LooseVersion(h5netcdf.__version__) >= LooseVersion( + "0.10.0" + ) and LooseVersion(h5netcdf.core.h5py.__version__) >= LooseVersion("3.0.0"): + kwargs = dict(decode_vlen_strings=True) + + h5 = h5netcdf.File(tmp_file, mode="r", **kwargs) store = backends.H5NetCDFStore(h5) with open_dataset(store) as ds: copied = ds.copy(deep=True) From 12b4480ff2bde696142ca850275cdcc85ca0fbc9 Mon Sep 17 00:00:00 2001 From: DWesl <22566757+DWesl@users.noreply.github.com> Date: Wed, 17 Feb 2021 11:35:56 -0500 Subject: [PATCH 11/46] Read grid mapping and bounds as coords (#2844) Co-authored-by: Deepak Cherian Co-authored-by: dcherian --- doc/conf.py | 2 +- doc/weather-climate.rst | 30 +++++++++ doc/whats-new.rst | 8 ++- xarray/backends/api.py | 22 +++++-- xarray/backends/apiv2.py | 12 ++-- xarray/conventions.py | 79 ++++++++++++++++++++++-- xarray/tests/test_backends.py | 113 ++++++++++++++++++++++++++++++++++ 7 files changed, 249 insertions(+), 17 deletions(-) diff --git a/doc/conf.py 
b/doc/conf.py index def4bb0b229..db7229cfa61 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -34,7 +34,7 @@ subprocess.run(["conda", "list"]) else: print("pip environment:") - subprocess.run(["pip", "list"]) + subprocess.run([sys.executable, "-m", "pip", "list"]) print(f"xarray: {xarray.__version__}, {xarray.__file__}") diff --git a/doc/weather-climate.rst b/doc/weather-climate.rst index db612d74859..068edba1e64 100644 --- a/doc/weather-climate.rst +++ b/doc/weather-climate.rst @@ -12,6 +12,36 @@ Weather and climate data .. _Climate and Forecast (CF) conventions: http://cfconventions.org +.. _cf_variables: + +Related Variables +----------------- + +Several CF variable attributes contain lists of other variables +associated with the variable with the attribute. A few of these are +now parsed by XArray, with the attribute value popped to encoding on +read and the variables in that value interpreted as non-dimension +coordinates: + +- ``coordinates`` +- ``bounds`` +- ``grid_mapping`` +- ``climatology`` +- ``geometry`` +- ``node_coordinates`` +- ``node_count`` +- ``part_node_count`` +- ``interior_ring`` +- ``cell_measures`` +- ``formula_terms`` + +This decoding is controlled by the ``decode_coords`` kwarg to +:py:func:`open_dataset` and :py:func:`open_mfdataset`. + +The CF attribute ``ancillary_variables`` was not included in the list +due to the variables listed there being associated primarily with the +variable with the attribute, rather than with the dimensions. + .. _metpy_accessor: CF-compliant coordinate variables diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1bca3aec68e..277c32b1016 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -39,6 +39,13 @@ Breaking changes always be set such that ``int64`` values can be used. In the past, no units finer than "seconds" were chosen, which would sometimes mean that ``float64`` values were required, which would lead to inaccurate I/O round-trips. +- Variables referred to in attributes like ``bounds`` and ``grid_mapping`` + are can be set as coordinate variables. These attributes + are moved to :py:attr:`DataArray.encoding` from + :py:attr:`DataArray.attrs`. This behaviour is controlled by the + ``decode_coords`` kwarg to :py:func:`open_dataset` and + :py:func:`open_mfdataset`. The full list of decoded attributes is in + :ref:`weather-climate` (:pull:`2844`, :issue:`3689`) - remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull:`4725`). By `Aureliana Barghini `_. @@ -347,7 +354,6 @@ New Features - Expose ``use_cftime`` option in :py:func:`~xarray.open_zarr` (:issue:`2886`, :pull:`3229`) By `Samnan Rahee `_ and `Anderson Banihirwe `_. - Bug fixes ~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 0791d1cdaf1..4fa34b39925 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -354,9 +354,14 @@ def open_dataset( form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. - decode_coords : bool, optional - If True, decode the 'coordinates' attribute to identify coordinates in - the resulting dataset. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. 
+ - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ "pseudonetcdf", "zarr"}, optional Engine to use when reading files. If not provided, the default engine @@ -613,9 +618,14 @@ def open_dataarray( form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. - decode_coords : bool, optional - If True, decode the 'coordinates' attribute to identify coordinates in - the resulting dataset. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib"}, \ optional Engine to use when reading files. If not provided, the default engine diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index d31fc9ea773..de1b3e1bb29 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -195,10 +195,14 @@ def open_dataset( removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. This keyword may not be supported by all the backends. - decode_coords : bool, optional - If True, decode the 'coordinates' attribute to identify coordinates in - the resulting dataset. This keyword may not be supported by all the - backends. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. drop_variables: str or iterable, optional A variable or list of variables to exclude from the dataset parsing. 
This may be useful to drop variables with problems or diff --git a/xarray/conventions.py b/xarray/conventions.py index 93e765e5622..bb0cc5cd338 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -11,6 +11,23 @@ from .core.pycompat import is_duck_dask_array from .core.variable import IndexVariable, Variable, as_variable +CF_RELATED_DATA = ( + "bounds", + "grid_mapping", + "climatology", + "geometry", + "node_coordinates", + "node_count", + "part_node_count", + "interior_ring", + "cell_measures", + "formula_terms", +) +CF_RELATED_DATA_NEEDS_PARSING = ( + "cell_measures", + "formula_terms", +) + class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin): """Decode arrays on the fly from non-native to native endianness @@ -256,6 +273,9 @@ def encode_cf_variable(var, needs_copy=True, name=None): var = maybe_default_fill_value(var) var = maybe_encode_bools(var) var = ensure_dtype_not_object(var, name=name) + + for attr_name in CF_RELATED_DATA: + pop_to(var.encoding, var.attrs, attr_name) return var @@ -499,7 +519,7 @@ def stackable(dim): use_cftime=use_cftime, decode_timedelta=decode_timedelta, ) - if decode_coords: + if decode_coords in [True, "coordinates", "all"]: var_attrs = new_vars[k].attrs if "coordinates" in var_attrs: coord_str = var_attrs["coordinates"] @@ -509,6 +529,38 @@ def stackable(dim): del var_attrs["coordinates"] coord_names.update(var_coord_names) + if decode_coords == "all": + for attr_name in CF_RELATED_DATA: + if attr_name in var_attrs: + attr_val = var_attrs[attr_name] + if attr_name not in CF_RELATED_DATA_NEEDS_PARSING: + var_names = attr_val.split() + else: + roles_and_names = [ + role_or_name + for part in attr_val.split(":") + for role_or_name in part.split() + ] + if len(roles_and_names) % 2 == 1: + warnings.warn( + f"Attribute {attr_name:s} malformed", stacklevel=5 + ) + var_names = roles_and_names[1::2] + if all(var_name in variables for var_name in var_names): + new_vars[k].encoding[attr_name] = attr_val + coord_names.update(var_names) + else: + referenced_vars_not_in_variables = [ + proj_name + for proj_name in var_names + if proj_name not in variables + ] + warnings.warn( + f"Variable(s) referenced in {attr_name:s} not in variables: {referenced_vars_not_in_variables!s}", + stacklevel=5, + ) + del var_attrs[attr_name] + if decode_coords and "coordinates" in attributes: attributes = dict(attributes) coord_names.update(attributes.pop("coordinates").split()) @@ -542,9 +594,14 @@ def decode_cf( decode_times : bool, optional Decode cf times (e.g., integers since "hours since 2000-01-01") to np.datetime64. - decode_coords : bool, optional - Use the 'coordinates' attribute on variable (or the dataset itself) to - identify coordinates. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. drop_variables : str or iterable, optional A variable or list of variables to exclude from being parsed from the dataset. 
This may be useful to drop variables with problems or @@ -664,6 +721,7 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): global_coordinates = non_dim_coord_names.copy() variable_coordinates = defaultdict(set) + not_technically_coordinates = set() for coord_name in non_dim_coord_names: target_dims = variables[coord_name].dims for k, v in variables.items(): @@ -674,6 +732,13 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): ): variable_coordinates[k].add(coord_name) + if any( + attr_name in v.encoding and coord_name in v.encoding.get(attr_name) + for attr_name in CF_RELATED_DATA + ): + not_technically_coordinates.add(coord_name) + global_coordinates.discard(coord_name) + variables = {k: v.copy(deep=False) for k, v in variables.items()} # keep track of variable names written to file under the "coordinates" attributes @@ -691,7 +756,11 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): # we get support for attrs["coordinates"] for free. coords_str = pop_to(encoding, attrs, "coordinates") if not coords_str and variable_coordinates[name]: - attrs["coordinates"] = " ".join(map(str, variable_coordinates[name])) + attrs["coordinates"] = " ".join( + str(coord_name) + for coord_name in variable_coordinates[name] + if coord_name not in not_technically_coordinates + ) if "coordinates" in attrs: written_coords.update(attrs["coordinates"].split()) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index aefb91478cb..d15736e608d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -56,6 +56,7 @@ requires_dask, requires_fsspec, requires_h5netcdf, + requires_iris, requires_netCDF4, requires_pseudonetcdf, requires_pydap, @@ -858,6 +859,118 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn): assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) + @staticmethod + def _create_cf_dataset(): + original = Dataset( + dict( + variable=( + ("ln_p", "latitude", "longitude"), + np.arange(8, dtype="f4").reshape(2, 2, 2), + {"ancillary_variables": "std_devs det_lim"}, + ), + std_devs=( + ("ln_p", "latitude", "longitude"), + np.arange(0.1, 0.9, 0.1).reshape(2, 2, 2), + {"standard_name": "standard_error"}, + ), + det_lim=( + (), + 0.1, + {"standard_name": "detection_minimum"}, + ), + ), + dict( + latitude=("latitude", [0, 1], {"units": "degrees_north"}), + longitude=("longitude", [0, 1], {"units": "degrees_east"}), + latlon=((), -1, {"grid_mapping_name": "latitude_longitude"}), + latitude_bnds=(("latitude", "bnds2"), [[0, 1], [1, 2]]), + longitude_bnds=(("longitude", "bnds2"), [[0, 1], [1, 2]]), + areas=( + ("latitude", "longitude"), + [[1, 1], [1, 1]], + {"units": "degree^2"}, + ), + ln_p=( + "ln_p", + [1.0, 0.5], + { + "standard_name": "atmosphere_ln_pressure_coordinate", + "computed_standard_name": "air_pressure", + }, + ), + P0=((), 1013.25, {"units": "hPa"}), + ), + ) + original["variable"].encoding.update( + {"cell_measures": "area: areas", "grid_mapping": "latlon"}, + ) + original.coords["latitude"].encoding.update( + dict(grid_mapping="latlon", bounds="latitude_bnds") + ) + original.coords["longitude"].encoding.update( + dict(grid_mapping="latlon", bounds="longitude_bnds") + ) + original.coords["ln_p"].encoding.update({"formula_terms": "p0: P0 lev : ln_p"}) + return original + + def test_grid_mapping_and_bounds_are_not_coordinates_in_file(self): + original = self._create_cf_dataset() + with create_tmp_file() as tmp_file: + 
original.to_netcdf(tmp_file) + with open_dataset(tmp_file, decode_coords=False) as ds: + assert ds.coords["latitude"].attrs["bounds"] == "latitude_bnds" + assert ds.coords["longitude"].attrs["bounds"] == "longitude_bnds" + assert "latlon" not in ds["variable"].attrs["coordinates"] + assert "coordinates" not in ds.attrs + + def test_coordinate_variables_after_dataset_roundtrip(self): + original = self._create_cf_dataset() + with self.roundtrip(original, open_kwargs={"decode_coords": "all"}) as actual: + assert_identical(actual, original) + + with self.roundtrip(original) as actual: + expected = original.reset_coords( + ["latitude_bnds", "longitude_bnds", "areas", "P0", "latlon"] + ) + # equal checks that coords and data_vars are equal which + # should be enough + # identical would require resetting a number of attributes + # skip that. + assert_equal(actual, expected) + + def test_grid_mapping_and_bounds_are_coordinates_after_dataarray_roundtrip(self): + original = self._create_cf_dataset() + # The DataArray roundtrip should have the same warnings as the + # Dataset, but we already tested for those, so just go for the + # new warnings. It would appear that there is no way to tell + # pytest "This warning and also this warning should both be + # present". + # xarray/tests/test_conventions.py::TestCFEncodedDataStore + # needs the to_dataset. The other backends should be fine + # without it. + with pytest.warns( + UserWarning, + match=( + r"Variable\(s\) referenced in bounds not in variables: " + r"\['l(at|ong)itude_bnds'\]" + ), + ): + with self.roundtrip( + original["variable"].to_dataset(), open_kwargs={"decode_coords": "all"} + ) as actual: + assert_identical(actual, original["variable"].to_dataset()) + + @requires_iris + def test_coordinate_variables_after_iris_roundtrip(self): + original = self._create_cf_dataset() + iris_cube = original["variable"].to_iris() + actual = DataArray.from_iris(iris_cube) + # Bounds will be missing (xfail) + del original.coords["latitude_bnds"], original.coords["longitude_bnds"] + # Ancillary vars will be missing + # Those are data_vars, and will be dropped when grabbing the variable + assert_identical(actual, original["variable"]) + def test_coordinates_encoding(self): def equals_latlon(obj): return obj == "lat lon" or obj == "lon lat" From 7c4e2ac83f7b4306296ff9b7b51aaf016e5ad614 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Wed, 17 Feb 2021 19:12:34 +0100 Subject: [PATCH 12/46] Revert defaults of beckends' open_datasets to prepare the switch to APIv2 #4309 (#4899) --- xarray/backends/cfgrib_.py | 6 +++--- xarray/backends/h5netcdf_.py | 6 +++--- xarray/backends/netCDF4_.py | 6 +++--- xarray/backends/pseudonetcdf_.py | 6 +++--- xarray/backends/pydap_.py | 6 +++--- xarray/backends/pynio_.py | 6 +++--- xarray/backends/scipy_.py | 6 +++--- xarray/backends/zarr.py | 6 +++--- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 65c5bc2a02b..d582af82c6e 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -99,9 +99,9 @@ def open_dataset( filename_or_obj, *, mask_and_scale=True, - decode_times=None, - concat_characters=None, - decode_coords=None, + decode_times=True, + concat_characters=True, + decode_coords=True, drop_variables=None, use_cftime=None, decode_timedelta=None, diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 5766b34d9bd..ca531af81f6 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -352,9 
+352,9 @@ def open_dataset( filename_or_obj, *, mask_and_scale=True, - decode_times=None, - concat_characters=None, - decode_coords=None, + decode_times=True, + concat_characters=True, + decode_coords=True, drop_variables=None, use_cftime=None, decode_timedelta=None, diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index e3d87aaf83f..78ad1a4c20f 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -526,9 +526,9 @@ def open_dataset( self, filename_or_obj, mask_and_scale=True, - decode_times=None, - concat_characters=None, - decode_coords=None, + decode_times=True, + concat_characters=True, + decode_coords=True, drop_variables=None, use_cftime=None, decode_timedelta=None, diff --git a/xarray/backends/pseudonetcdf_.py b/xarray/backends/pseudonetcdf_.py index 80485fce459..a9d7f0bbed4 100644 --- a/xarray/backends/pseudonetcdf_.py +++ b/xarray/backends/pseudonetcdf_.py @@ -121,9 +121,9 @@ def open_dataset( self, filename_or_obj, mask_and_scale=False, - decode_times=None, - concat_characters=None, - decode_coords=None, + decode_times=True, + concat_characters=True, + decode_coords=True, drop_variables=None, use_cftime=None, decode_timedelta=None, diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 7f8622ca66e..09bff9acc1d 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -115,9 +115,9 @@ def open_dataset( self, filename_or_obj, mask_and_scale=True, - decode_times=None, - concat_characters=None, - decode_coords=None, + decode_times=True, + concat_characters=True, + decode_coords=True, drop_variables=None, use_cftime=None, decode_timedelta=None, diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 41c99efd076..8ace5697d09 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -101,9 +101,9 @@ class PynioBackendEntrypoint(BackendEntrypoint): def open_dataset( filename_or_obj, mask_and_scale=True, - decode_times=None, - concat_characters=None, - decode_coords=None, + decode_times=True, + concat_characters=True, + decode_coords=True, drop_variables=None, use_cftime=None, decode_timedelta=None, diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index ddc157ed8e4..b98515c7b5b 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -249,9 +249,9 @@ def open_dataset( self, filename_or_obj, mask_and_scale=True, - decode_times=None, - concat_characters=None, - decode_coords=None, + decode_times=True, + concat_characters=True, + decode_coords=True, drop_variables=None, use_cftime=None, decode_timedelta=None, diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 074572169ce..d740b207e37 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -687,9 +687,9 @@ def open_dataset( self, filename_or_obj, mask_and_scale=True, - decode_times=None, - concat_characters=None, - decode_coords=None, + decode_times=True, + concat_characters=True, + decode_coords=True, drop_variables=None, use_cftime=None, decode_timedelta=None, From 9858a27f3c8a98285180dc2058e9f66e20372d54 Mon Sep 17 00:00:00 2001 From: keewis Date: Wed, 17 Feb 2021 23:40:27 +0100 Subject: [PATCH 13/46] document restrictions to the inline repr (#4912) * move the suppressed import block to the top * clarify that shape and dtype must not be included in the inline repr * show dask and sparse arrays as examples for inline reprs [skip-ci] * add sparse to the doc environment [skip-ci] * don't convert to sparse twice [skip-ci] * correctly name the variables and manually 
create the dask array [skip-ci] --- ci/requirements/doc.yml | 1 + doc/internals.rst | 33 +++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml index e092272654b..cdb763e9748 100644 --- a/ci/requirements/doc.yml +++ b/ci/requirements/doc.yml @@ -23,6 +23,7 @@ dependencies: - rasterio>=1.1 - seaborn - setuptools + - sparse - sphinx=3.3 - sphinx_rtd_theme>=0.4 - sphinx-autosummary-accessors diff --git a/doc/internals.rst b/doc/internals.rst index 60d32128c60..f3d67de9077 100644 --- a/doc/internals.rst +++ b/doc/internals.rst @@ -10,6 +10,17 @@ stack, NumPy and pandas. It is written in pure Python (no C or Cython extensions), which makes it easy to develop and extend. Instead, we push compiled code to :ref:`optional dependencies`. +.. ipython:: python + :suppress: + + import dask.array as da + import numpy as np + import pandas as pd + import sparse + import xarray as xr + + np.random.seed(123456) + Variable objects ---------------- @@ -74,18 +85,24 @@ argument: ... - -Extending xarray ----------------- +To avoid duplicated information, this method must omit information about the shape and +:term:`dtype`. For example, the string representation of a ``dask`` array or a +``sparse`` matrix would be: .. ipython:: python - :suppress: - import numpy as np - import pandas as pd - import xarray as xr + a = da.linspace(0, 1, 20, chunks=2) + a - np.random.seed(123456) + b = np.eye(10) + b[[5, 7, 3, 0], [6, 8, 2, 9]] = 2 + b = sparse.COO.from_numpy(b) + b + + xr.Dataset({"a": ("x", a), "b": (("y", "z"), b)}) + +Extending xarray +---------------- xarray is designed as a general purpose library, and hence tries to avoid including overly domain specific functionality. But inevitably, the need for more From 0a309e07523c53d0ca6fb0ff5a240c638de48f1a Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 17 Feb 2021 17:28:10 -0700 Subject: [PATCH 14/46] [skip-ci] Fix some asv benchmarks (#4920) Mostly adjust to deprecated behaviour --- asv_bench/benchmarks/combine.py | 8 ++++---- asv_bench/benchmarks/dataset_io.py | 6 ------ asv_bench/benchmarks/repr.py | 2 +- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py index aa9662d44f9..308ca2afda4 100644 --- a/asv_bench/benchmarks/combine.py +++ b/asv_bench/benchmarks/combine.py @@ -26,13 +26,13 @@ def setup(self): {"B": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))} ) - def time_combine_manual(self): + def time_combine_nested(self): datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]] - xr.combine_manual(datasets, concat_dim=[None, "t"]) + xr.combine_nested(datasets, concat_dim=[None, "T"]) - def time_auto_combine(self): + def time_combine_by_coords(self): """Also has to load and arrange t coordinate""" datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1] - xr.combine_auto(datasets) + xr.combine_by_coords(datasets) diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py index d1ffbc34706..e99911d752c 100644 --- a/asv_bench/benchmarks/dataset_io.py +++ b/asv_bench/benchmarks/dataset_io.py @@ -59,7 +59,6 @@ def make_ds(self): coords={"lon": lons, "lat": lats, "time": times}, dims=("time", "lon", "lat"), name="foo", - encoding=None, attrs={"units": "foo units", "description": "a description"}, ) self.ds["bar"] = xr.DataArray( @@ -67,7 +66,6 @@ def make_ds(self): coords={"lon": lons, "lat": lats, "time": times}, dims=("time", "lon", "lat"), 
name="bar", - encoding=None, attrs={"units": "bar units", "description": "a description"}, ) self.ds["baz"] = xr.DataArray( @@ -75,7 +73,6 @@ def make_ds(self): coords={"lon": lons, "lat": lats}, dims=("lon", "lat"), name="baz", - encoding=None, attrs={"units": "baz units", "description": "a description"}, ) @@ -270,7 +267,6 @@ def make_ds(self, nfiles=10): coords={"lon": lons, "lat": lats, "time": times}, dims=("time", "lon", "lat"), name="foo", - encoding=None, attrs={"units": "foo units", "description": "a description"}, ) ds["bar"] = xr.DataArray( @@ -278,7 +274,6 @@ def make_ds(self, nfiles=10): coords={"lon": lons, "lat": lats, "time": times}, dims=("time", "lon", "lat"), name="bar", - encoding=None, attrs={"units": "bar units", "description": "a description"}, ) ds["baz"] = xr.DataArray( @@ -286,7 +281,6 @@ def make_ds(self, nfiles=10): coords={"lon": lons, "lat": lats}, dims=("lon", "lat"), name="baz", - encoding=None, attrs={"units": "baz units", "description": "a description"}, ) diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py index b218c0be870..617e9313fd1 100644 --- a/asv_bench/benchmarks/repr.py +++ b/asv_bench/benchmarks/repr.py @@ -4,7 +4,7 @@ class ReprMultiIndex: - def setup(self, key): + def setup(self): index = pd.MultiIndex.from_product( [range(10000), range(10000)], names=("level_0", "level_1") ) From c9b9eec73a033e275b6012e7d391dd42591ccf52 Mon Sep 17 00:00:00 2001 From: Yunus Sevinchan Date: Thu, 18 Feb 2021 16:04:14 +0100 Subject: [PATCH 15/46] Use explicit type check in `as_compatible_data` instead of blanket access to `values` attribute (#2905) * In as_compatible_data, check explicitly for nested self-described types This change was prompted by the fact that `getattr(data, 'values', data)` affected any kind of data with a `values` attribute, which is not the desired behaviour at that point. This also extends tests to assert that custom objects with such an attribute are not attempted to be converted * Add whats-new entry * Remove trailing whitespace * In as_compatible_data, check explicitly for nested self-described types This change was prompted by the fact that `getattr(data, 'values', data)` affected any kind of data with a `values` attribute, which is not the desired behaviour at that point. This also extends tests to assert that custom objects with such an attribute are not attempted to be converted * whats-new * Fix test. * Update @blsqr github URL in whats-new * actually check that values is not extracted Co-authored-by: dcherian Co-authored-by: Keewis --- doc/whats-new.rst | 4 ++++ xarray/core/variable.py | 3 ++- xarray/tests/test_variable.py | 9 +++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 277c32b1016..1e0873c628d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -90,6 +90,10 @@ New Features Bug fixes ~~~~~~~~~ +- Use specific type checks in + :py:func:`~xarray.core.variable.as_compatible_data` instead of blanket + access to ``values`` attribute (:issue:`2097`) + By `Yunus Sevinchan `_. - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` do not trigger computations anymore if :py:meth:`Dataset.weighted` or :py:meth:`DataArray.weighted` are applied (:issue:`4625`, :pull:`4668`). By `Julius Busecke `_. - :py:func:`merge` with ``combine_attrs='override'`` makes a copy of the attrs (:issue:`4627`). 
- By default, when possible, xarray will now always use values of type ``int64`` when encoding diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 45553eb9b1e..7ca90d6b3c7 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -218,7 +218,8 @@ def as_compatible_data(data, fastpath=False): data = np.timedelta64(getattr(data, "value", data), "ns") # we don't want nested self-described arrays - data = getattr(data, "values", data) + if isinstance(data, (pd.Series, pd.Index, pd.DataFrame)): + data = data.values if isinstance(data, np.ma.MaskedArray): mask = np.ma.getmaskarray(data) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index e1ae3e1f258..0d4c8662d21 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -2300,6 +2300,11 @@ def __init__(self, array): class CustomIndexable(CustomArray, indexing.ExplicitlyIndexed): pass + # Type with data stored in values attribute + class CustomWithValuesAttr: + def __init__(self, array): + self.values = array + array = CustomArray(np.arange(3)) orig = Variable(dims=("x"), data=array, attrs={"foo": "bar"}) assert isinstance(orig._data, np.ndarray) # should not be CustomArray @@ -2308,6 +2313,10 @@ class CustomIndexable(CustomArray, indexing.ExplicitlyIndexed): orig = Variable(dims=("x"), data=array, attrs={"foo": "bar"}) assert isinstance(orig._data, CustomIndexable) + array = CustomWithValuesAttr(np.arange(3)) + orig = Variable(dims=(), data=array) + assert isinstance(orig._data.item(), CustomWithValuesAttr) + def test_raise_no_warning_for_nan_in_binary_ops(): with pytest.warns(None) as record: From ae0a71b757ec82ed734f070f3c2d0e61b076ca6e Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Thu, 18 Feb 2021 20:23:20 +0100 Subject: [PATCH 16/46] [skip-ci] doc: fix pynio warning (#4923) --- doc/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/io.rst b/doc/io.rst index b97f1f5a699..c2022cc9325 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -1133,7 +1133,7 @@ We recommend installing PyNIO via conda:: conda install -c conda-forge pynio - .. note:: +.. warning:: PyNIO is no longer actively maintained and conflicts with netcdf4 > 1.5.3. The PyNIO backend may be moved outside of xarray in the future. From d61efb687a9b2989ad22c70334b8cc2e32251d16 Mon Sep 17 00:00:00 2001 From: Blair Bonnett Date: Fri, 19 Feb 2021 09:12:01 +0100 Subject: [PATCH 17/46] Fix behaviour of min_count in reducing functions (#4911) * Add more tests for reducing functions with min_count * Make sure Dask-backed arrays are not computed. * Check some specific examples give the correct output. * Run membership tests on xarray.core.dtypes.NAT_TYPES * Fix behaviour of min_count in reducing functions. * Fix mask checks in xarray.core.nanops._maybe_null_out to run lazily for Dask-backed arrays. * Change xarray.core.dtypes.NAT_TYPES to a set (it is only used for membership checks). * Add dtypes to NAT_TYPES rather than instances. Previously np.float64 was returning true from `dtype in NAT_TYPES` which resulted in min_count being ignored when reducing over all axes. * Add whatsnew entry. * Improvements from review. * use duck_array_ops.where instead of np.where * add docstring and whatsnew messages about sum/prod on integer arrays with skipna=True and min_count != None now returning a float array. 
Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 10 +++++ xarray/core/dtypes.py | 2 +- xarray/core/nanops.py | 19 +++++---- xarray/core/ops.py | 9 ++-- xarray/tests/test_dtypes.py | 6 +++ xarray/tests/test_duck_array_ops.py | 65 ++++++++++++++++++++++++++--- 6 files changed, 94 insertions(+), 17 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1e0873c628d..f7d84b69d28 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -48,6 +48,10 @@ Breaking changes :ref:`weather-climate` (:pull:`2844`, :issue:`3689`) - remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull:`4725`). By `Aureliana Barghini `_. +- As a result of :pull:`4911` the output from calling :py:meth:`DataArray.sum` + or :py:meth:`DataArray.prod` on an integer array with ``skipna=True`` and a + non-None value for ``min_count`` will now be a float array rather than an + integer array. Deprecations ~~~~~~~~~~~~ @@ -129,6 +133,12 @@ Bug fixes By `Leif Denby `_. - Fix time encoding bug associated with using cftime versions greater than 1.4.0 with xarray (:issue:`4870`, :pull:`4871`). By `Spencer Clark `_. +- Stop :py:meth:`DataArray.sum` and :py:meth:`DataArray.prod` computing lazy + arrays when called with a ``min_count`` parameter (:issue:`4898`, :pull:`4911`). + By `Blair Bonnett `_. +- Fix bug preventing the ``min_count`` parameter to :py:meth:`DataArray.sum` and + :py:meth:`DataArray.prod` working correctly when calculating over all axes of + a float64 array (:issue:`4898`, :pull:`4911`). By `Blair Bonnett `_. - Fix decoding of vlen strings using h5py versions greater than 3.0.0 with h5netcdf backend (:issue:`4570`, :pull:`4893`). By `Kai Mühlbauer `_. diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 167f00fa932..898e7e650b3 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -78,7 +78,7 @@ def maybe_promote(dtype): return np.dtype(dtype), fill_value -NAT_TYPES = (np.datetime64("NaT"), np.timedelta64("NaT")) +NAT_TYPES = {np.datetime64("NaT").dtype, np.timedelta64("NaT").dtype} def get_fill_value(dtype): diff --git a/xarray/core/nanops.py b/xarray/core/nanops.py index 5eb88bcd096..1cfd66103a2 100644 --- a/xarray/core/nanops.py +++ b/xarray/core/nanops.py @@ -3,7 +3,14 @@ import numpy as np from . 
import dtypes, nputils, utils -from .duck_array_ops import _dask_or_eager_func, count, fillna, isnull, where_method +from .duck_array_ops import ( + _dask_or_eager_func, + count, + fillna, + isnull, + where, + where_method, +) from .pycompat import dask_array_type try: @@ -28,18 +35,14 @@ def _maybe_null_out(result, axis, mask, min_count=1): """ xarray version of pandas.core.nanops._maybe_null_out """ - if axis is not None and getattr(result, "ndim", False): null_mask = (np.take(mask.shape, axis).prod() - mask.sum(axis) - min_count) < 0 - if null_mask.any(): - dtype, fill_value = dtypes.maybe_promote(result.dtype) - result = result.astype(dtype) - result[null_mask] = fill_value + dtype, fill_value = dtypes.maybe_promote(result.dtype) + result = where(null_mask, fill_value, result.astype(dtype)) elif getattr(result, "dtype", None) not in dtypes.NAT_TYPES: null_mask = mask.size - mask.sum() - if null_mask < min_count: - result = np.nan + result = where(null_mask < min_count, np.nan, result) return result diff --git a/xarray/core/ops.py b/xarray/core/ops.py index d56b0d59df0..1c899115a5b 100644 --- a/xarray/core/ops.py +++ b/xarray/core/ops.py @@ -114,9 +114,12 @@ _MINCOUNT_DOCSTRING = """ min_count : int, default: None - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result will - be NA. New in version 0.10.8: Added with the default being None.""" + The required number of valid values to perform the operation. If + fewer than min_count non-NA values are present the result will be + NA. Only used if skipna is set to True or defaults to True for the + array's dtype. New in version 0.10.8: Added with the default being + None. Changed in version 0.17.0: if specified on an integer array + and skipna=True, the result will be a float array.""" _COARSEN_REDUCE_DOCSTRING_TEMPLATE = """\ Coarsen this object by applying `{name}` along its dimensions. diff --git a/xarray/tests/test_dtypes.py b/xarray/tests/test_dtypes.py index 5ad1a6355e6..53ed2c87133 100644 --- a/xarray/tests/test_dtypes.py +++ b/xarray/tests/test_dtypes.py @@ -90,3 +90,9 @@ def test_maybe_promote(kind, expected): actual = dtypes.maybe_promote(np.dtype(kind)) assert actual[0] == expected[0] assert str(actual[1]) == expected[1] + + +def test_nat_types_membership(): + assert np.datetime64("NaT").dtype in dtypes.NAT_TYPES + assert np.timedelta64("NaT").dtype in dtypes.NAT_TYPES + assert np.float64 not in dtypes.NAT_TYPES diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 1342950f3e5..90e742dee62 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -34,6 +34,7 @@ assert_array_equal, has_dask, has_scipy, + raise_if_dask_computes, raises_regex, requires_cftime, requires_dask, @@ -587,7 +588,10 @@ def test_min_count(dim_num, dtype, dask, func, aggdim, contains_nan, skipna): da = construct_dataarray(dim_num, dtype, contains_nan=contains_nan, dask=dask) min_count = 3 - actual = getattr(da, func)(dim=aggdim, skipna=skipna, min_count=min_count) + # If using Dask, the function call should be lazy. 
+ with raise_if_dask_computes(): + actual = getattr(da, func)(dim=aggdim, skipna=skipna, min_count=min_count) + expected = series_reduce(da, func, skipna=skipna, dim=aggdim, min_count=min_count) assert_allclose(actual, expected) assert_dask_array(actual, dask) @@ -603,7 +607,13 @@ def test_min_count_nd(dtype, dask, func): min_count = 3 dim_num = 3 da = construct_dataarray(dim_num, dtype, contains_nan=True, dask=dask) - actual = getattr(da, func)(dim=["x", "y", "z"], skipna=True, min_count=min_count) + + # If using Dask, the function call should be lazy. + with raise_if_dask_computes(): + actual = getattr(da, func)( + dim=["x", "y", "z"], skipna=True, min_count=min_count + ) + # Supplying all dims is equivalent to supplying `...` or `None` expected = getattr(da, func)(dim=..., skipna=True, min_count=min_count) @@ -611,6 +621,48 @@ def test_min_count_nd(dtype, dask, func): assert_dask_array(actual, dask) +@pytest.mark.parametrize("dask", [False, True]) +@pytest.mark.parametrize("func", ["sum", "prod"]) +@pytest.mark.parametrize("dim", [None, "a", "b"]) +def test_min_count_specific(dask, func, dim): + if dask and not has_dask: + pytest.skip("requires dask") + + # Simple array with four non-NaN values. + da = DataArray(np.ones((6, 6), dtype=np.float64) * np.nan, dims=("a", "b")) + da[0][0] = 2 + da[0][3] = 2 + da[3][0] = 2 + da[3][3] = 2 + if dask: + da = da.chunk({"a": 3, "b": 3}) + + # Expected result if we set min_count to the number of non-NaNs in a + # row/column/the entire array. + if dim: + min_count = 2 + expected = DataArray( + [4.0, np.nan, np.nan] * 2, dims=("a" if dim == "b" else "b",) + ) + else: + min_count = 4 + expected = DataArray(8.0 if func == "sum" else 16.0) + + # Check for that min_count. + with raise_if_dask_computes(): + actual = getattr(da, func)(dim, skipna=True, min_count=min_count) + assert_dask_array(actual, dask) + assert_allclose(actual, expected) + + # With min_count being one higher, should get all NaN. + min_count += 1 + expected *= np.nan + with raise_if_dask_computes(): + actual = getattr(da, func)(dim, skipna=True, min_count=min_count) + assert_dask_array(actual, dask) + assert_allclose(actual, expected) + + @pytest.mark.parametrize("func", ["sum", "prod"]) def test_min_count_dataset(func): da = construct_dataarray(2, dtype=float, contains_nan=True, dask=False) @@ -655,9 +707,12 @@ def test_docs(): have a sentinel missing value (int) or skipna=True has not been implemented (object, datetime64 or timedelta64). min_count : int, default: None - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result will - be NA. New in version 0.10.8: Added with the default being None. + The required number of valid values to perform the operation. If + fewer than min_count non-NA values are present the result will be + NA. Only used if skipna is set to True or defaults to True for the + array's dtype. New in version 0.10.8: Added with the default being + None. Changed in version 0.17.0: if specified on an integer array + and skipna=True, the result will be a float array. keep_attrs : bool, optional If True, the attributes (`attrs`) will be copied from the original object to the new one. 
If False (default), the new object will be From 070d8158aab31c6a53efb36f9ea4a57dc6671e43 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 19 Feb 2021 07:21:44 -0700 Subject: [PATCH 18/46] Dataset.plot.quiver (#4407) Co-authored-by: Mathias Hauser --- doc/api.rst | 1 + doc/plotting.rst | 24 +++++++++++ doc/whats-new.rst | 8 ++-- xarray/plot/dataset_plot.py | 86 ++++++++++++++++++++++++++++++++++--- xarray/plot/facetgrid.py | 73 +++++++++++++++++++++++-------- xarray/plot/utils.py | 9 ++++ xarray/tests/test_plot.py | 73 ++++++++++++++++++++++++++++++- 7 files changed, 245 insertions(+), 29 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 9cb02441d37..9add7a96109 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -241,6 +241,7 @@ Plotting :template: autosummary/accessor_method.rst Dataset.plot.scatter + Dataset.plot.quiver DataArray ========= diff --git a/doc/plotting.rst b/doc/plotting.rst index 3699f794ae8..2ada3e25431 100644 --- a/doc/plotting.rst +++ b/doc/plotting.rst @@ -715,6 +715,9 @@ Consider this dataset ds +Scatter +~~~~~~~ + Suppose we want to scatter ``A`` against ``B`` .. ipython:: python @@ -762,6 +765,27 @@ Faceting is also possible For more advanced scatter plots, we recommend converting the relevant data variables to a pandas DataFrame and using the extensive plotting capabilities of ``seaborn``. +Quiver +~~~~~~ + +Visualizing vector fields is supported with quiver plots: + +.. ipython:: python + :okwarning: + + @savefig ds_simple_quiver.png + ds.isel(w=1, z=1).plot.quiver(x="x", y="y", u="A", v="B") + + +where ``u`` and ``v`` denote the x and y direction components of the arrow vectors. Again, faceting is also possible: + +.. ipython:: python + :okwarning: + + @savefig ds_facet_quiver.png + ds.plot.quiver(x="x", y="y", u="A", v="B", col="w", row="z", scale=4) + +``scale`` is required for faceted quiver plots. The scale determines the number of data units per arrow length unit, i.e. a smaller scale parameter makes the arrow longer. .. _plot-maps: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f7d84b69d28..7478ea706be 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -76,13 +76,11 @@ New Features contain missing values; 8x faster in our benchmark, and 2x faster than pandas. (:pull:`4746`); By `Maximilian Roos `_. - -- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables. - By `Deepak Cherian `_ +- Add :py:meth:`Dataset.plot.quiver` for quiver plots with :py:class:`Dataset` variables. + By `Deepak Cherian `_. - add ``"drop_conflicts"`` to the strategies supported by the ``combine_attrs`` kwarg (:issue:`4749`, :pull:`4827`). By `Justus Magin `_. - By `Deepak Cherian `_. - :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims in the form of kwargs as well as a dict, like most similar methods. By `Maximilian Roos `_. @@ -173,6 +171,8 @@ Internal Changes all resources. (:pull:`#4809`), By `Alessandro Amici `_. - Ensure warnings cannot be turned into exceptions in :py:func:`testing.assert_equal` and the other ``assert_*`` functions (:pull:`4864`). By `Mathias Hauser `_. +- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables. + By `Deepak Cherian `_ .. 
_whats-new.0.16.2: diff --git a/xarray/plot/dataset_plot.py b/xarray/plot/dataset_plot.py index 6d942e1b0fa..59d3ca98f23 100644 --- a/xarray/plot/dataset_plot.py +++ b/xarray/plot/dataset_plot.py @@ -7,6 +7,7 @@ from .facetgrid import _easy_facetgrid from .utils import ( _add_colorbar, + _get_nice_quiver_magnitude, _is_numeric, _process_cmap_cbar_kwargs, get_axis, @@ -17,7 +18,7 @@ _MARKERSIZE_RANGE = np.array([18.0, 72.0]) -def _infer_meta_data(ds, x, y, hue, hue_style, add_guide): +def _infer_meta_data(ds, x, y, hue, hue_style, add_guide, funcname): dvars = set(ds.variables.keys()) error_msg = " must be one of ({:s})".format(", ".join(dvars)) @@ -48,11 +49,24 @@ def _infer_meta_data(ds, x, y, hue, hue_style, add_guide): add_colorbar = False add_legend = False else: - if add_guide is True: + if add_guide is True and funcname != "quiver": raise ValueError("Cannot set add_guide when hue is None.") add_legend = False add_colorbar = False + if (add_guide or add_guide is None) and funcname == "quiver": + add_quiverkey = True + if hue: + add_colorbar = True + if not hue_style: + hue_style = "continuous" + elif hue_style != "continuous": + raise ValueError( + "hue_style must be 'continuous' or None for .plot.quiver" + ) + else: + add_quiverkey = False + if hue_style is not None and hue_style not in ["discrete", "continuous"]: raise ValueError("hue_style must be either None, 'discrete' or 'continuous'.") @@ -66,6 +80,7 @@ def _infer_meta_data(ds, x, y, hue, hue_style, add_guide): return { "add_colorbar": add_colorbar, "add_legend": add_legend, + "add_quiverkey": add_quiverkey, "hue_label": hue_label, "hue_style": hue_style, "xlabel": label_from_attrs(ds[x]), @@ -170,6 +185,8 @@ def _dsplot(plotfunc): ds : Dataset x, y : str Variable names for x, y axis. + u, v : str, optional + Variable names for quiver plots hue: str, optional Variable by which to color scattered points hue_style: str, optional @@ -250,6 +267,8 @@ def newplotfunc( ds, x=None, y=None, + u=None, + v=None, hue=None, hue_style=None, col=None, @@ -282,7 +301,9 @@ def newplotfunc( if _is_facetgrid: # facetgrid call meta_data = kwargs.pop("meta_data") else: - meta_data = _infer_meta_data(ds, x, y, hue, hue_style, add_guide) + meta_data = _infer_meta_data( + ds, x, y, hue, hue_style, add_guide, funcname=plotfunc.__name__ + ) hue_style = meta_data["hue_style"] @@ -317,13 +338,18 @@ def newplotfunc( else: cmap_params_subset = {} + if (u is not None or v is not None) and plotfunc.__name__ != "quiver": + raise ValueError("u, v are only allowed for quiver plots.") + primitive = plotfunc( ds=ds, x=x, y=y, + ax=ax, + u=u, + v=v, hue=hue, hue_style=hue_style, - ax=ax, cmap_params=cmap_params_subset, **kwargs, ) @@ -344,6 +370,25 @@ def newplotfunc( cbar_kwargs["label"] = meta_data.get("hue_label", None) _add_colorbar(primitive, ax, cbar_ax, cbar_kwargs, cmap_params) + if meta_data["add_quiverkey"]: + magnitude = _get_nice_quiver_magnitude(ds[u], ds[v]) + units = ds[u].attrs.get("units", "") + ax.quiverkey( + primitive, + X=0.85, + Y=0.9, + U=magnitude, + label=f"{magnitude}\n{units}", + labelpos="E", + coordinates="figure", + ) + + if plotfunc.__name__ == "quiver": + title = ds[u]._title_for_slice() + else: + title = ds[x]._title_for_slice() + ax.set_title(title) + return primitive @functools.wraps(newplotfunc) @@ -351,6 +396,8 @@ def plotmethod( _PlotMethods_obj, x=None, y=None, + u=None, + v=None, hue=None, hue_style=None, col=None, @@ -398,7 +445,7 @@ def plotmethod( @_dsplot -def scatter(ds, x, y, ax, **kwargs): +def scatter(ds, x, y, ax, 
u, v, **kwargs): """ Scatter Dataset data variables against each other. """ @@ -450,3 +497,32 @@ def scatter(ds, x, y, ax, **kwargs): ) return primitive + + +@_dsplot +def quiver(ds, x, y, ax, u, v, **kwargs): + """ Quiver plot with Dataset variables.""" + import matplotlib as mpl + + if x is None or y is None or u is None or v is None: + raise ValueError("Must specify x, y, u, v for quiver plots.") + + x, y, u, v = broadcast(ds[x], ds[y], ds[u], ds[v]) + + args = [x.values, y.values, u.values, v.values] + hue = kwargs.pop("hue") + cmap_params = kwargs.pop("cmap_params") + + if hue: + args.append(ds[hue].values) + + # TODO: Fix this by always returning a norm with vmin, vmax in cmap_params + if not cmap_params["norm"]: + cmap_params["norm"] = mpl.colors.Normalize( + cmap_params.pop("vmin"), cmap_params.pop("vmax") + ) + + kwargs.pop("hue_style") + kwargs.setdefault("pivot", "middle") + hdl = ax.quiver(*args, **kwargs, **cmap_params) + return hdl diff --git a/xarray/plot/facetgrid.py b/xarray/plot/facetgrid.py index bfa400d7ba4..2d3c0595026 100644 --- a/xarray/plot/facetgrid.py +++ b/xarray/plot/facetgrid.py @@ -6,6 +6,7 @@ from ..core.formatting import format_item from .utils import ( + _get_nice_quiver_magnitude, _infer_xy_labels, _process_cmap_cbar_kwargs, import_matplotlib_pyplot, @@ -195,7 +196,11 @@ def __init__( self.axes = axes self.row_names = row_names self.col_names = col_names + + # guides self.figlegend = None + self.quiverkey = None + self.cbar = None # Next the private variables self._single_group = single_group @@ -327,14 +332,15 @@ def map_dataset( from .dataset_plot import _infer_meta_data, _parse_size kwargs["add_guide"] = False - kwargs["_is_facetgrid"] = True if kwargs.get("markersize", None): kwargs["size_mapping"] = _parse_size( self.data[kwargs["markersize"]], kwargs.pop("size_norm", None) ) - meta_data = _infer_meta_data(self.data, x, y, hue, hue_style, add_guide) + meta_data = _infer_meta_data( + self.data, x, y, hue, hue_style, add_guide, funcname=func.__name__ + ) kwargs["meta_data"] = meta_data if hue and meta_data["hue_style"] == "continuous": @@ -344,6 +350,12 @@ def map_dataset( kwargs["meta_data"]["cmap_params"] = cmap_params kwargs["meta_data"]["cbar_kwargs"] = cbar_kwargs + kwargs["_is_facetgrid"] = True + + if func.__name__ == "quiver" and "scale" not in kwargs: + raise ValueError("Please provide scale.") + # TODO: come up with an algorithm for reasonable scale choice + for d, ax in zip(self.name_dicts.flat, self.axes.flat): # None is the sentinel value if d is not None: @@ -365,6 +377,9 @@ def map_dataset( elif meta_data["add_colorbar"]: self.add_colorbar(label=self._hue_label, **cbar_kwargs) + if meta_data["add_quiverkey"]: + self.add_quiverkey(kwargs["u"], kwargs["v"]) + return self def _finalize_grid(self, *axlabels): @@ -380,30 +395,22 @@ def _finalize_grid(self, *axlabels): self._finalized = True - def add_legend(self, **kwargs): - figlegend = self.fig.legend( - handles=self._mappables[-1], - labels=list(self._hue_var.values), - title=self._hue_label, - loc="center right", - **kwargs, - ) - - self.figlegend = figlegend + def _adjust_fig_for_guide(self, guide): # Draw the plot to set the bounding boxes correctly - self.fig.draw(self.fig.canvas.get_renderer()) + renderer = self.fig.canvas.get_renderer() + self.fig.draw(renderer) # Calculate and set the new width of the figure so the legend fits - legend_width = figlegend.get_window_extent().width / self.fig.dpi + guide_width = guide.get_window_extent(renderer).width / self.fig.dpi figure_width = 
self.fig.get_figwidth() - self.fig.set_figwidth(figure_width + legend_width) + self.fig.set_figwidth(figure_width + guide_width) # Draw the plot again to get the new transformations - self.fig.draw(self.fig.canvas.get_renderer()) + self.fig.draw(renderer) # Now calculate how much space we need on the right side - legend_width = figlegend.get_window_extent().width / self.fig.dpi - space_needed = legend_width / (figure_width + legend_width) + 0.02 + guide_width = guide.get_window_extent(renderer).width / self.fig.dpi + space_needed = guide_width / (figure_width + guide_width) + 0.02 # margin = .01 # _space_needed = margin + space_needed right = 1 - space_needed @@ -411,6 +418,16 @@ def add_legend(self, **kwargs): # Place the subplot axes to give space for the legend self.fig.subplots_adjust(right=right) + def add_legend(self, **kwargs): + self.figlegend = self.fig.legend( + handles=self._mappables[-1], + labels=list(self._hue_var.values), + title=self._hue_label, + loc="center right", + **kwargs, + ) + self._adjust_fig_for_guide(self.figlegend) + def add_colorbar(self, **kwargs): """Draw a colorbar""" kwargs = kwargs.copy() @@ -426,6 +443,26 @@ def add_colorbar(self, **kwargs): ) return self + def add_quiverkey(self, u, v, **kwargs): + kwargs = kwargs.copy() + + magnitude = _get_nice_quiver_magnitude(self.data[u], self.data[v]) + units = self.data[u].attrs.get("units", "") + self.quiverkey = self.axes.flat[-1].quiverkey( + self._mappables[-1], + X=0.8, + Y=0.9, + U=magnitude, + label=f"{magnitude}\n{units}", + labelpos="E", + coordinates="figure", + ) + + # TODO: does not work because self.quiverkey.get_window_extent(renderer) = 0 + # https://github.com/matplotlib/matplotlib/issues/18530 + # self._adjust_fig_for_guide(self.quiverkey.text) + return self + def set_axis_labels(self, x_var=None, y_var=None): """Set axis labels on the left column and bottom row of the grid.""" if x_var is not None: diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index ffe796987c5..5510cf7f219 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -841,3 +841,12 @@ def _process_cmap_cbar_kwargs( } return cmap_params, cbar_kwargs + + +def _get_nice_quiver_magnitude(u, v): + import matplotlib as mpl + + ticker = mpl.ticker.MaxNLocator(3) + mean = np.mean(np.hypot(u.values, v.values)) + magnitude = ticker.tick_values(0, mean)[-2] + return magnitude diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 47b15446f1d..705b2d5e2e7 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -1475,7 +1475,7 @@ def test_facetgrid_cbar_kwargs(self): ) # catch contour case - if hasattr(g, "cbar"): + if g.cbar is not None: assert get_colorbar_label(g.cbar) == "test_label" def test_facetgrid_no_cbar_ax(self): @@ -2152,6 +2152,66 @@ def test_wrong_num_of_dimensions(self): self.darray.plot.line(row="row", hue="hue") +@requires_matplotlib +class TestDatasetQuiverPlots(PlotTestCase): + @pytest.fixture(autouse=True) + def setUp(self): + das = [ + DataArray( + np.random.randn(3, 3, 4, 4), + dims=["x", "y", "row", "col"], + coords=[range(k) for k in [3, 3, 4, 4]], + ) + for _ in [1, 2] + ] + ds = Dataset({"u": das[0], "v": das[1]}) + ds.x.attrs["units"] = "xunits" + ds.y.attrs["units"] = "yunits" + ds.col.attrs["units"] = "colunits" + ds.row.attrs["units"] = "rowunits" + ds.u.attrs["units"] = "uunits" + ds.v.attrs["units"] = "vunits" + ds["mag"] = np.hypot(ds.u, ds.v) + self.ds = ds + + def test_quiver(self): + with figure_context(): + hdl = self.ds.isel(row=0, 
col=0).plot.quiver(x="x", y="y", u="u", v="v") + assert isinstance(hdl, mpl.quiver.Quiver) + with raises_regex(ValueError, "specify x, y, u, v"): + self.ds.isel(row=0, col=0).plot.quiver(x="x", y="y", u="u") + + with raises_regex(ValueError, "hue_style"): + self.ds.isel(row=0, col=0).plot.quiver( + x="x", y="y", u="u", v="v", hue="mag", hue_style="discrete" + ) + + def test_facetgrid(self): + with figure_context(): + fg = self.ds.plot.quiver( + x="x", y="y", u="u", v="v", row="row", col="col", scale=1, hue="mag" + ) + for handle in fg._mappables: + assert isinstance(handle, mpl.quiver.Quiver) + assert "uunits" in fg.quiverkey.text.get_text() + + with figure_context(): + fg = self.ds.plot.quiver( + x="x", + y="y", + u="u", + v="v", + row="row", + col="col", + scale=1, + hue="mag", + add_guide=False, + ) + assert fg.quiverkey is None + with raises_regex(ValueError, "Please provide scale"): + self.ds.plot.quiver(x="x", y="y", u="u", v="v", row="row", col="col") + + @requires_matplotlib class TestDatasetScatterPlots(PlotTestCase): @pytest.fixture(autouse=True) @@ -2194,7 +2254,13 @@ def test_accessor(self): def test_add_guide(self, add_guide, hue_style, legend, colorbar): meta_data = _infer_meta_data( - self.ds, x="A", y="B", hue="hue", hue_style=hue_style, add_guide=add_guide + self.ds, + x="A", + y="B", + hue="hue", + hue_style=hue_style, + add_guide=add_guide, + funcname="scatter", ) assert meta_data["add_legend"] is legend assert meta_data["add_colorbar"] is colorbar @@ -2273,6 +2339,9 @@ def test_facetgrid_hue_style(self): def test_scatter(self, x, y, hue, markersize): self.ds.plot.scatter(x, y, hue=hue, markersize=markersize) + with raises_regex(ValueError, "u, v"): + self.ds.plot.scatter(x, y, u="col", v="row") + def test_non_numeric_legend(self): ds2 = self.ds.copy() ds2["hue"] = ["a", "b", "c", "d"] From 9a4313b4b75c181eade5a61f1a2f053b9d1bb293 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 19 Feb 2021 12:44:04 -0700 Subject: [PATCH 19/46] Better rolling reductions (#4915) --- asv_bench/benchmarks/rolling.py | 41 ++++++++++++ doc/whats-new.rst | 2 + xarray/core/dtypes.py | 24 +++++-- xarray/core/rolling.py | 114 ++++++++++++++++++++++++++------ xarray/tests/test_dataarray.py | 10 +++ 5 files changed, 166 insertions(+), 25 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index d5426af4aa1..93c3c6aed4e 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -67,3 +67,44 @@ def setup(self, *args, **kwargs): super().setup(**kwargs) self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50}) self.da_long = self.da_long.chunk({"x": 10000}) + + +class RollingMemory: + def setup(self, *args, **kwargs): + self.ds = xr.Dataset( + { + "var1": (("x", "y"), randn_xy), + "var2": (("x", "t"), randn_xt), + "var3": (("t",), randn_t), + }, + coords={ + "x": np.arange(nx), + "y": np.linspace(0, 1, ny), + "t": pd.date_range("1970-01-01", periods=nt, freq="D"), + "x_coords": ("x", np.linspace(1.1, 2.1, nx)), + }, + ) + + +class DataArrayRollingMemory(RollingMemory): + @parameterized("func", ["sum", "max", "mean"]) + def peakmem_ndrolling_reduce(self, func): + roll = self.ds.var1.rolling(x=10, y=4) + getattr(roll, func)() + + @parameterized("func", ["sum", "max", "mean"]) + def peakmem_1drolling_reduce(self, func): + roll = self.ds.var3.rolling(t=100) + getattr(roll, func)() + + +class DatasetRollingMemory(RollingMemory): + @parameterized("func", ["sum", "max", "mean"]) + def peakmem_ndrolling_reduce(self, func): + roll = 
self.ds.rolling(x=10, y=4) + getattr(roll, func)() + + @parameterized("func", ["sum", "max", "mean"]) + def peakmem_1drolling_reduce(self, func): + roll = self.ds.rolling(t=100) + getattr(roll, func)() diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 7478ea706be..22902963d9c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -68,6 +68,8 @@ New Features - Xarray now leverages updates as of cftime version 1.4.1, which enable exact I/O roundtripping of ``cftime.datetime`` objects (:pull:`4758`). By `Spencer Clark `_. +- Most rolling operations use significantly less memory. (:issue:`4325`). + By `Deepak Cherian `_. - :py:meth:`~xarray.cftime_range` and :py:meth:`DataArray.resample` now support millisecond (``"L"`` or ``"ms"``) and microsecond (``"U"`` or ``"us"``) frequencies for ``cftime.datetime`` coordinates (:issue:`4097`, :pull:`4758`). diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 898e7e650b3..51499c3a687 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -96,40 +96,56 @@ def get_fill_value(dtype): return fill_value -def get_pos_infinity(dtype): +def get_pos_infinity(dtype, max_for_int=False): """Return an appropriate positive infinity for this dtype. Parameters ---------- dtype : np.dtype + max_for_int : bool + Return np.iinfo(dtype).max instead of np.inf Returns ------- fill_value : positive infinity value corresponding to this dtype. """ - if issubclass(dtype.type, (np.floating, np.integer)): + if issubclass(dtype.type, np.floating): return np.inf + if issubclass(dtype.type, np.integer): + if max_for_int: + return np.iinfo(dtype).max + else: + return np.inf + if issubclass(dtype.type, np.complexfloating): return np.inf + 1j * np.inf return INF -def get_neg_infinity(dtype): +def get_neg_infinity(dtype, min_for_int=False): """Return an appropriate positive infinity for this dtype. Parameters ---------- dtype : np.dtype + min_for_int : bool + Return np.iinfo(dtype).min instead of -np.inf Returns ------- fill_value : positive infinity value corresponding to this dtype. """ - if issubclass(dtype.type, (np.floating, np.integer)): + if issubclass(dtype.type, np.floating): return -np.inf + if issubclass(dtype.type, np.integer): + if min_for_int: + return np.iinfo(dtype).min + else: + return -np.inf + if issubclass(dtype.type, np.complexfloating): return -np.inf - 1j * np.inf diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index f25d798d9f1..6087fd4c806 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -111,8 +111,14 @@ def __repr__(self): def __len__(self): return self.obj.sizes[self.dim] - def _reduce_method(name: str) -> Callable: # type: ignore - array_agg_func = getattr(duck_array_ops, name) + def _reduce_method(name: str, fillna, rolling_agg_func: Callable = None) -> Callable: # type: ignore + """Constructs reduction methods built on a numpy reduction function (e.g. sum), + a bottleneck reduction function (e.g. 
move_sum), or a Rolling reduction (_mean).""" + if rolling_agg_func: + array_agg_func = None + else: + array_agg_func = getattr(duck_array_ops, name) + bottleneck_move_func = getattr(bottleneck, "move_" + name, None) def method(self, keep_attrs=None, **kwargs): @@ -120,23 +126,36 @@ def method(self, keep_attrs=None, **kwargs): keep_attrs = self._get_keep_attrs(keep_attrs) return self._numpy_or_bottleneck_reduce( - array_agg_func, bottleneck_move_func, keep_attrs=keep_attrs, **kwargs + array_agg_func, + bottleneck_move_func, + rolling_agg_func, + keep_attrs=keep_attrs, + fillna=fillna, + **kwargs, ) method.__name__ = name method.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(name=name) return method - argmax = _reduce_method("argmax") - argmin = _reduce_method("argmin") - max = _reduce_method("max") - min = _reduce_method("min") - mean = _reduce_method("mean") - prod = _reduce_method("prod") - sum = _reduce_method("sum") - std = _reduce_method("std") - var = _reduce_method("var") - median = _reduce_method("median") + def _mean(self, keep_attrs, **kwargs): + result = self.sum(keep_attrs=False, **kwargs) / self.count(keep_attrs=False) + if keep_attrs: + result.attrs = self.obj.attrs + return result + + _mean.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(name="mean") + + argmax = _reduce_method("argmax", dtypes.NINF) + argmin = _reduce_method("argmin", dtypes.INF) + max = _reduce_method("max", dtypes.NINF) + min = _reduce_method("min", dtypes.INF) + prod = _reduce_method("prod", 1) + sum = _reduce_method("sum", 0) + mean = _reduce_method("mean", None, _mean) + std = _reduce_method("std", None) + var = _reduce_method("var", None) + median = _reduce_method("median", None) def count(self, keep_attrs=None): keep_attrs = self._get_keep_attrs(keep_attrs) @@ -301,6 +320,24 @@ def construct( """ + return self._construct( + self.obj, + window_dim=window_dim, + stride=stride, + fill_value=fill_value, + keep_attrs=keep_attrs, + **window_dim_kwargs, + ) + + def _construct( + self, + obj, + window_dim=None, + stride=1, + fill_value=dtypes.NA, + keep_attrs=None, + **window_dim_kwargs, + ): from .dataarray import DataArray keep_attrs = self._get_keep_attrs(keep_attrs) @@ -317,18 +354,18 @@ def construct( ) stride = self._mapping_to_list(stride, default=1) - window = self.obj.variable.rolling_window( + window = obj.variable.rolling_window( self.dim, self.window, window_dim, self.center, fill_value=fill_value ) - attrs = self.obj.attrs if keep_attrs else {} + attrs = obj.attrs if keep_attrs else {} result = DataArray( window, - dims=self.obj.dims + tuple(window_dim), - coords=self.obj.coords, + dims=obj.dims + tuple(window_dim), + coords=obj.coords, attrs=attrs, - name=self.obj.name, + name=obj.name, ) return result.isel( **{d: slice(None, None, s) for d, s in zip(self.dim, stride)} @@ -393,7 +430,18 @@ def reduce(self, func, keep_attrs=None, **kwargs): d: utils.get_temp_dimname(self.obj.dims, f"_rolling_dim_{d}") for d in self.dim } - windows = self.construct(rolling_dim, keep_attrs=keep_attrs) + + # save memory with reductions GH4325 + fillna = kwargs.pop("fillna", dtypes.NA) + if fillna is not dtypes.NA: + obj = self.obj.fillna(fillna) + else: + obj = self.obj + + windows = self._construct( + obj, rolling_dim, keep_attrs=keep_attrs, fill_value=fillna + ) + result = windows.reduce( func, dim=list(rolling_dim.values()), keep_attrs=keep_attrs, **kwargs ) @@ -470,7 +518,13 @@ def _bottleneck_reduce(self, func, keep_attrs, **kwargs): return DataArray(values, self.obj.coords, attrs=attrs, 
name=self.obj.name) def _numpy_or_bottleneck_reduce( - self, array_agg_func, bottleneck_move_func, keep_attrs, **kwargs + self, + array_agg_func, + bottleneck_move_func, + rolling_agg_func, + keep_attrs, + fillna, + **kwargs, ): if "dim" in kwargs: warnings.warn( @@ -494,6 +548,18 @@ def _numpy_or_bottleneck_reduce( bottleneck_move_func, keep_attrs=keep_attrs, **kwargs ) else: + if rolling_agg_func: + return rolling_agg_func( + self, keep_attrs=self._get_keep_attrs(keep_attrs) + ) + if fillna is not None: + if fillna is dtypes.INF: + fillna = dtypes.get_pos_infinity(self.obj.dtype, max_for_int=True) + elif fillna is dtypes.NINF: + fillna = dtypes.get_neg_infinity(self.obj.dtype, min_for_int=True) + kwargs.setdefault("skipna", False) + kwargs.setdefault("fillna", fillna) + return self.reduce(array_agg_func, keep_attrs=keep_attrs, **kwargs) @@ -600,13 +666,19 @@ def _counts(self, keep_attrs): ) def _numpy_or_bottleneck_reduce( - self, array_agg_func, bottleneck_move_func, keep_attrs, **kwargs + self, + array_agg_func, + bottleneck_move_func, + rolling_agg_func, + keep_attrs, + **kwargs, ): return self._dataset_implementation( functools.partial( DataArrayRolling._numpy_or_bottleneck_reduce, array_agg_func=array_agg_func, bottleneck_move_func=bottleneck_move_func, + rolling_agg_func=rolling_agg_func, ), keep_attrs=keep_attrs, **kwargs, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 8d599c7a715..e5cf0bfbc48 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6623,6 +6623,16 @@ def test_ndrolling_reduce(da, center, min_periods, name): assert_allclose(actual, expected) assert actual.dims == expected.dims + if name in ["mean"]: + # test our reimplementation of nanmean using np.nanmean + expected = getattr(rolling_obj.construct({"time": "tw", "x": "xw"}), name)( + ["tw", "xw"] + ) + count = rolling_obj.count() + if min_periods is None: + min_periods = 1 + assert_allclose(actual, expected.where(count >= min_periods)) + @pytest.mark.parametrize("center", (True, False, (True, False))) @pytest.mark.parametrize("fill_value", (np.nan, 0.0)) From 7e4ccfb699f63daac062b192c94febd8e8c35922 Mon Sep 17 00:00:00 2001 From: Chun Ho Chow <15370962+chunhochow@users.noreply.github.com> Date: Fri, 19 Feb 2021 15:13:57 -0800 Subject: [PATCH 20/46] variable.py as_compatible_data: disallow DataArrays (#4493) * variable.py as_compatible_data: allow DataArrays too * variable.py: as_compatible_data: DataArray raise TypeError tell user to use da.data * variable.py: as_compatible_data: debug syntax * variable.py: update no .data error msg * move err msg to as_variable * whats-new changelog * stylistic debug * whats-new.rst move changelog to new unreleased version * debug formatting whats-new.rst * add .data to tests using tuples * fix test_dask.py test * fix test_dataset * raise DeprecationWarning instead of TypeError * add .data to the example in plotting * state when the warning will be turned into an error * small fix [skip-ci] * check that as_variable warns correctly * warn instead of raising the warning * fix the test * manually extract the data * use a single statement to modify obj * remove the extraction of the data altogether Co-authored-by: chunhochow Co-authored-by: Keewis --- doc/plotting.rst | 2 +- doc/whats-new.rst | 3 +++ xarray/core/variable.py | 10 ++++++++++ xarray/tests/test_dask.py | 2 +- xarray/tests/test_dataset.py | 4 ++-- xarray/tests/test_interp.py | 2 +- xarray/tests/test_variable.py | 5 ++++- 7 files changed, 22 insertions(+), 6 
deletions(-) diff --git a/doc/plotting.rst b/doc/plotting.rst index 2ada3e25431..f5f1168df23 100644 --- a/doc/plotting.rst +++ b/doc/plotting.rst @@ -227,7 +227,7 @@ from the time and assign it as a non-dimension coordinate: :okwarning: decimal_day = (air1d.time - air1d.time[0]) / pd.Timedelta("1d") - air1d_multi = air1d.assign_coords(decimal_day=("time", decimal_day)) + air1d_multi = air1d.assign_coords(decimal_day=("time", decimal_day.data)) air1d_multi To use ``'decimal_day'`` as x coordinate it must be explicitly specified: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 22902963d9c..3e3865b244d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -128,6 +128,9 @@ Bug fixes - Expand user directory paths (e.g. ``~/``) in :py:func:`open_mfdataset` and :py:meth:`Dataset.to_zarr` (:issue:`4783`, :pull:`4795`). By `Julien Seguinot `_. +- Raise DeprecationWarning when trying to typecast a tuple containing a :py:class:`DataArray`. + User now prompted to first call `.data` on it (:issue:`4483`). + By `Chun Ho Chow `_. - Add :py:meth:`Dataset.drop_isel` and :py:meth:`DataArray.drop_isel` (:issue:`4658`, :pull:`4819`). By `Daniel Mesejo `_. - Ensure that :py:meth:`Dataset.interp` raises ``ValueError`` when interpolating outside coordinate range and ``bounds_error=True`` (:issue:`4854`, :pull:`4855`). By `Leif Denby `_. diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 7ca90d6b3c7..9b70f721689 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -120,6 +120,16 @@ def as_variable(obj, name=None) -> "Union[Variable, IndexVariable]": if isinstance(obj, Variable): obj = obj.copy(deep=False) elif isinstance(obj, tuple): + if isinstance(obj[1], DataArray): + # TODO: change into TypeError + warnings.warn( + ( + "Using a DataArray object to construct a variable is" + " ambiguous, please extract the data using the .data property." + " This will raise a TypeError in 0.19.0." 
+ ), + DeprecationWarning, + ) try: obj = Variable(*obj) except (TypeError, ValueError) as error: diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 19a61c60577..8220c8b83dc 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1233,7 +1233,7 @@ def test_map_blocks_to_array(map_ds): lambda x: x.drop_vars("x"), lambda x: x.expand_dims(k=[1, 2, 3]), lambda x: x.expand_dims(k=3), - lambda x: x.assign_coords(new_coord=("y", x.y * 2)), + lambda x: x.assign_coords(new_coord=("y", x.y.data * 2)), lambda x: x.astype(np.int32), lambda x: x.x, ], diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index db47faa8d2b..c4161093837 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4959,13 +4959,13 @@ def test_reduce_keepdims(self): # Coordinates involved in the reduction should be removed actual = ds.mean(keepdims=True) expected = Dataset( - {"a": (["x", "y"], np.mean(ds.a, keepdims=True))}, coords={"c": ds.c} + {"a": (["x", "y"], np.mean(ds.a, keepdims=True).data)}, coords={"c": ds.c} ) assert_identical(expected, actual) actual = ds.mean("x", keepdims=True) expected = Dataset( - {"a": (["x", "y"], np.mean(ds.a, axis=0, keepdims=True))}, + {"a": (["x", "y"], np.mean(ds.a, axis=0, keepdims=True).data)}, coords={"y": ds.y, "c": ds.c}, ) assert_identical(expected, actual) diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index cdfc46bbedf..9212f870009 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -190,7 +190,7 @@ def func(obj, dim, new_x): "w": xdest["w"], "z2": xdest["z2"], "y": da["y"], - "x": (("z", "w"), xdest), + "x": (("z", "w"), xdest.data), "x2": (("z", "w"), func(da["x2"], "x", xdest)), }, ) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 0d4c8662d21..f9ef8f57ef9 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -8,7 +8,7 @@ import pytest import pytz -from xarray import Coordinate, Dataset, IndexVariable, Variable, set_options +from xarray import Coordinate, DataArray, Dataset, IndexVariable, Variable, set_options from xarray.core import dtypes, duck_array_ops, indexing from xarray.core.common import full_like, ones_like, zeros_like from xarray.core.indexing import ( @@ -1081,6 +1081,9 @@ def test_as_variable(self): td = np.array([timedelta(days=x) for x in range(10)]) assert as_variable(td, "time").dtype.kind == "m" + with pytest.warns(DeprecationWarning): + as_variable(("x", DataArray([]))) + def test_repr(self): v = Variable(["time", "x"], [[1, 2, 3], [4, 5, 6]], {"foo": "bar"}) expected = dedent( From c4ad6f1caa40c8b2119e2f68786ff471cf7d89ed Mon Sep 17 00:00:00 2001 From: keewis Date: Sat, 20 Feb 2021 01:07:25 +0100 Subject: [PATCH 21/46] unpin sphinx (#4931) --- ci/requirements/doc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml index cdb763e9748..47b3cb9eb70 100644 --- a/ci/requirements/doc.yml +++ b/ci/requirements/doc.yml @@ -24,7 +24,7 @@ dependencies: - seaborn - setuptools - sparse - - sphinx=3.3 + - sphinx>=3.3 - sphinx_rtd_theme>=0.4 - sphinx-autosummary-accessors - zarr>=2.4 From eb7e112d45a9edebd8e5fb4f873e3e6adb18824a Mon Sep 17 00:00:00 2001 From: ghislainp Date: Sat, 20 Feb 2021 01:08:42 +0100 Subject: [PATCH 22/46] Fix DataArray.to_dataframe when the array has MultiIndex (#4442) Co-authored-by: Keewis --- doc/whats-new.rst | 3 +++ xarray/core/coordinates.py | 46 ++++++++++++++++++++++++++++++++-- 
xarray/tests/test_dataarray.py | 27 ++++++++++++++++++++ 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 3e3865b244d..05066f883bf 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -144,6 +144,9 @@ Bug fixes a float64 array (:issue:`4898`, :pull:`4911`). By `Blair Bonnett `_. - Fix decoding of vlen strings using h5py versions greater than 3.0.0 with h5netcdf backend (:issue:`4570`, :pull:`4893`). By `Kai Mühlbauer `_. +- Allow converting :py:class:`Dataset` or :py:class:`DataArray` objects with a ``MultiIndex`` + and at least one other dimension to a ``pandas`` object (:issue:`3008`, :pull:`4442`). + By `ghislainp `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 37c462f79f4..f9445bed619 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -13,6 +13,7 @@ cast, ) +import numpy as np import pandas as pd from . import formatting, indexing @@ -107,8 +108,49 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: return self._data.get_index(dim) # type: ignore else: indexes = [self._data.get_index(k) for k in ordered_dims] # type: ignore - names = list(ordered_dims) - return pd.MultiIndex.from_product(indexes, names=names) + + # compute the sizes of the repeat and tile for the cartesian product + # (taken from pandas.core.reshape.util) + index_lengths = np.fromiter( + (len(index) for index in indexes), dtype=np.intp + ) + cumprod_lengths = np.cumproduct(index_lengths) + + if cumprod_lengths[-1] != 0: + # sizes of the repeats + repeat_counts = cumprod_lengths[-1] / cumprod_lengths + else: + # if any factor is empty, the cartesian product is empty + repeat_counts = np.zeros_like(cumprod_lengths) + + # sizes of the tiles + tile_counts = np.roll(cumprod_lengths, 1) + tile_counts[0] = 1 + + # loop over the indexes + # for each MultiIndex or Index compute the cartesian product of the codes + + code_list = [] + level_list = [] + names = [] + + for i, index in enumerate(indexes): + if isinstance(index, pd.MultiIndex): + codes, levels = index.codes, index.levels + else: + code, level = pd.factorize(index) + codes = [code] + levels = [level] + + # compute the cartesian product + code_list += [ + np.tile(np.repeat(code, repeat_counts[i]), tile_counts[i]) + for code in codes + ] + level_list += levels + names += index.names + + return pd.MultiIndex(level_list, code_list, names=names) def update(self, other: Mapping[Hashable, Any]) -> None: other_vars = getattr(other, "variables", other) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index e5cf0bfbc48..68e31cd123a 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3635,6 +3635,33 @@ def test_to_dataframe(self): with raises_regex(ValueError, "unnamed"): arr.to_dataframe() + def test_to_dataframe_multiindex(self): + # regression test for #3008 + arr_np = np.random.randn(4, 3) + + mindex = pd.MultiIndex.from_product([[1, 2], list("ab")], names=["A", "B"]) + + arr = DataArray(arr_np, [("MI", mindex), ("C", [5, 6, 7])], name="foo") + + actual = arr.to_dataframe() + assert_array_equal(actual["foo"].values, arr_np.flatten()) + assert_array_equal(actual.index.names, list("ABC")) + assert_array_equal(actual.index.levels[0], [1, 2]) + assert_array_equal(actual.index.levels[1], ["a", "b"]) + assert_array_equal(actual.index.levels[2], [5, 6, 7]) + + def test_to_dataframe_0length(self): + # regression test for #3008 + arr_np = 
np.random.randn(4, 0) + + mindex = pd.MultiIndex.from_product([[1, 2], list("ab")], names=["A", "B"]) + + arr = DataArray(arr_np, [("MI", mindex), ("C", [])], name="foo") + + actual = arr.to_dataframe() + assert len(actual) == 0 + assert_array_equal(actual.index.names, list("ABC")) + def test_to_pandas_name_matches_coordinate(self): # coordinate with same name as array arr = DataArray([1, 2, 3], dims="x", name="x") From 5287c7b2546fc8848f539bb5ee66bb8d91d8496f Mon Sep 17 00:00:00 2001 From: keewis Date: Sun, 21 Feb 2021 21:27:05 +0100 Subject: [PATCH 23/46] add pyproject.toml (#4897) * add a initial pyproject.toml file * reformat the toml file * use setuptools_scm_git_archive to support git archives * add entries for whats-new.rst * remove setup_requires * require setuptools >= 42 since that's required for using pyproject.toml * add a fallback version --- .git_archival.txt | 1 + .gitattributes | 2 ++ doc/whats-new.rst | 5 +++++ pyproject.toml | 11 +++++++++++ setup.cfg | 4 ---- 5 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 .git_archival.txt create mode 100644 pyproject.toml diff --git a/.git_archival.txt b/.git_archival.txt new file mode 100644 index 00000000000..95cb3eea4e3 --- /dev/null +++ b/.git_archival.txt @@ -0,0 +1 @@ +ref-names: $Format:%D$ diff --git a/.gitattributes b/.gitattributes index a52f4ca283a..7a79ddd6b0b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,4 @@ # reduce the number of merge conflicts doc/whats-new.rst merge=union +# allow installing from git archives +.git_archival.txt export-subst diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 05066f883bf..8efb985a175 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -34,6 +34,9 @@ Breaking changes (:issue:`4688`, :pull:`4720`) By `Justus Magin `_. +- use ``pyproject.toml`` instead of the ``setup_requires`` option for + ``setuptools`` (:pull:`4897`). + By `Justus Magin `_. - As a result of :pull:`4684` the default units encoding for datetime-like values (``np.datetime64[ns]`` or ``cftime.datetime``) will now always be set such that ``int64`` values can be used. In the past, no units @@ -86,6 +89,8 @@ New Features - :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims in the form of kwargs as well as a dict, like most similar methods. By `Maximilian Roos `_. +- Allow installing from git archives (:pull:`4897`). + By `Justus Magin `_. 
- :py:func:`open_dataset` and :py:func:`open_mfdataset` now accept ``fsspec`` URLs (including globs for the latter) for ``engine="zarr"``, and so allow reading from diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000000..f1f1a2ac8a6 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,11 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel", + "setuptools_scm[toml]>=3.4", + "setuptools_scm_git_archive", +] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] +fallback_version = "999" diff --git a/setup.cfg b/setup.cfg index 231865d7788..5037cb9c584 100644 --- a/setup.cfg +++ b/setup.cfg @@ -77,10 +77,6 @@ install_requires = numpy >= 1.15 pandas >= 0.25 setuptools >= 40.4 # For pkg_resources -setup_requires = - setuptools >= 40.4 - setuptools_scm - [options.extras_require] io = From ea631f9ba86070431b07c6ffa7c5366db2cc2cfd Mon Sep 17 00:00:00 2001 From: keewis Date: Mon, 22 Feb 2021 06:02:24 +0100 Subject: [PATCH 24/46] update the minimum version policy (#4907) * update the minimum version policy * adapt the minimum versions check script * improve the error message * update the length of the support windows for python and numpy * implement the new policy * Refine wording * add a entry to whats-new.rst * properly format the minimum versions table [skip-ci] * rewrite the error message for too new packages * reformat the policy [skip-ci] * remove the policy override for dask and distributed Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- .github/workflows/ci-additional.yaml | 2 +- ci/min_deps_check.py | 38 ++++++++++++++-------------- doc/installing.rst | 10 ++++---- doc/whats-new.rst | 6 ++++- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 92c7226f81d..32ab03de850 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -184,6 +184,6 @@ jobs: - name: minimum versions policy run: | - mamba install -y pyyaml conda + mamba install -y pyyaml conda python-dateutil python ci/min_deps_check.py ci/requirements/py37-bare-minimum.yml python ci/min_deps_check.py ci/requirements/py37-min-all-deps.yml diff --git a/ci/min_deps_check.py b/ci/min_deps_check.py index 3ffab645e8e..26d20c05745 100755 --- a/ci/min_deps_check.py +++ b/ci/min_deps_check.py @@ -4,11 +4,12 @@ """ import itertools import sys -from datetime import datetime, timedelta +from datetime import datetime from typing import Dict, Iterator, Optional, Tuple import conda.api import yaml +from dateutil.relativedelta import relativedelta CHANNELS = ["conda-forge", "defaults"] IGNORE_DEPS = { @@ -25,14 +26,9 @@ "pytest-xdist", } -POLICY_MONTHS = {"python": 42, "numpy": 24, "setuptools": 42} +POLICY_MONTHS = {"python": 24, "numpy": 18, "setuptools": 42} POLICY_MONTHS_DEFAULT = 12 POLICY_OVERRIDE = { - # dask < 2.9 has trouble with nan-reductions - # TODO remove this special case and the matching note in installing.rst - # after January 2021. - "dask": (2, 9), - "distributed": (2, 9), # setuptools-scm doesn't work with setuptools < 36.7 (Nov 2017). # The conda metadata is malformed for setuptools < 38.4 (Jan 2018) # (it's missing a timestamp which prevents this tool from working). 
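The next hunk replaces the rough ``timedelta(days=policy_months * 30)`` cutoff with ``dateutil.relativedelta``. A minimal sketch of the difference (the reference date is arbitrary, chosen only for illustration):

>>> from datetime import datetime, timedelta
>>> from dateutil.relativedelta import relativedelta
>>> now = datetime(2021, 2, 24)
>>> now - timedelta(days=24 * 30)  # 24 "months" approximated as 720 days drifts into March
datetime.datetime(2019, 3, 7, 0, 0)
>>> now - relativedelta(months=24)  # exact calendar months, as used in the hunk below
datetime.datetime(2019, 2, 24, 0, 0)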
@@ -148,28 +144,32 @@ def process_pkg( return pkg, fmt_version(req_major, req_minor, req_patch), "-", "-", "-", "(!)" policy_months = POLICY_MONTHS.get(pkg, POLICY_MONTHS_DEFAULT) - policy_published = datetime.now() - timedelta(days=policy_months * 30) - - policy_major = req_major - policy_minor = req_minor - policy_published_actual = req_published - for (major, minor), published in reversed(sorted(versions.items())): - if published < policy_published: - break - policy_major = major - policy_minor = minor - policy_published_actual = published + policy_published = datetime.now() - relativedelta(months=policy_months) + + filtered_versions = [ + version + for version, published in versions.items() + if published < policy_published + ] + policy_major, policy_minor = max(filtered_versions, default=(req_major, req_minor)) try: policy_major, policy_minor = POLICY_OVERRIDE[pkg] except KeyError: pass + policy_published_actual = versions[policy_major, policy_minor] if (req_major, req_minor) < (policy_major, policy_minor): status = "<" elif (req_major, req_minor) > (policy_major, policy_minor): status = "> (!)" - error("Package is too new: " + pkg) + delta = relativedelta(datetime.now(), policy_published_actual).normalized() + n_months = delta.years * 12 + delta.months + error( + f"Package is too new: {pkg}={req_major}.{req_minor} was " + f"published on {versions[req_major, req_minor]:%Y-%m-%d} " + f"which was {n_months} months ago (policy is {policy_months} months)" + ) else: status = "=" diff --git a/doc/installing.rst b/doc/installing.rst index 99b8b621aed..396f24b9151 100644 --- a/doc/installing.rst +++ b/doc/installing.rst @@ -98,10 +98,10 @@ Minimum dependency versions xarray adopts a rolling policy regarding the minimum supported version of its dependencies: -- **Python:** 42 months +- **Python:** 24 months (`NEP-29 `_) - **setuptools:** 42 months (but no older than 40.4) -- **numpy:** 24 months +- **numpy:** 18 months (`NEP-29 `_) - **dask and dask.distributed:** 12 months (but no older than 2.9) - **sparse, pint** and other libraries that rely on @@ -111,9 +111,9 @@ dependencies: numpy >=1.17. - **all other libraries:** 12 months -The above should be interpreted as *the minor version (X.Y) initially published no more -than N months ago*. Patch versions (x.y.Z) are not pinned, and only the latest available -at the moment of publishing the xarray release is guaranteed to work. +This means the latest minor (X.Y) version from N months prior. Patch versions (x.y.Z) +are not pinned, and only the latest available at the moment of publishing the xarray +release is guaranteed to work. You can see the actual minimum tested versions: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8efb985a175..0b2b2834626 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,7 +24,11 @@ Breaking changes ~~~~~~~~~~~~~~~~ - xarray no longer supports python 3.6 + The minimum version policy was changed to also apply to projects with irregular + releases. + The minimum versions of some other dependencies were changed: + ============ ====== ==== Package Old New ============ ====== ==== @@ -32,7 +36,7 @@ Breaking changes setuptools 38.4 40.4 ============ ====== ==== - (:issue:`4688`, :pull:`4720`) + (:issue:`4688`, :pull:`4720`, :pull:`4907`) By `Justus Magin `_. - use ``pyproject.toml`` instead of the ``setup_requires`` option for ``setuptools`` (:pull:`4897`). 
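The heart of the reworked check above is selecting the newest minor release published before the policy cutoff. A self-contained sketch of that selection, with made-up publication dates (the real script reads them from the conda channel metadata):

>>> from datetime import datetime
>>> from dateutil.relativedelta import relativedelta
>>> versions = {  # (major, minor) -> publication date; values are illustrative only
...     (1, 15): datetime(2018, 7, 23),
...     (1, 17): datetime(2019, 7, 26),
...     (1, 20): datetime(2021, 1, 30),
... }
>>> cutoff = datetime(2021, 2, 24) - relativedelta(months=18)
>>> max((v for v, published in versions.items() if published < cutoff), default=None)
(1, 17)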
From 88c5fd2638cd731fa90014a5b0f376ab190441d7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 21 Feb 2021 21:22:04 -0800 Subject: [PATCH 25/46] pre-commit: autoupdate hook versions (#4936) Co-authored-by: keewis --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8bd67c81964..e90164575b4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,7 +30,7 @@ repos: # - id: velin # args: ["--write", "--compact"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.800 + rev: v0.812 hooks: - id: mypy exclude: "properties|asv_bench" From 200c2b2df28bd477dbec863ac0837b901535c955 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 22 Feb 2021 17:33:50 +0100 Subject: [PATCH 26/46] CI: run mypy in full env (#4929) * CI: run mypy in full env * mypy: show_error_codes * [skip-ci] add skip ci option & version comment * Apply suggestions from code review Co-authored-by: keewis * Update .github/workflows/ci-additional.yaml * Update .github/workflows/ci-additional.yaml Co-authored-by: keewis * update mypy version * Update .pre-commit-config.yaml * [skip-ci] install mypy from file Co-authored-by: keewis --- .github/workflows/ci-additional.yaml | 44 ++++++++++++++++++++++++++++ .pre-commit-config.yaml | 1 + ci/requirements/mypy_only | 3 ++ 3 files changed, 48 insertions(+) create mode 100644 ci/requirements/mypy_only diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 32ab03de850..c5f6a06e349 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -157,6 +157,50 @@ jobs: run: | python -m pytest --doctest-modules xarray --ignore xarray/tests + typing: + name: Type checking (mypy) + runs-on: "ubuntu-latest" + needs: detect-ci-trigger + if: false && needs.detect-ci-trigger.outputs.triggered == 'false' + defaults: + run: + shell: bash -l {0} + + steps: + - name: Cancel previous runs + uses: styfle/cancel-workflow-action@0.6.0 + with: + access_token: ${{ github.token }} + - uses: actions/checkout@v2 + with: + fetch-depth: 0 # Fetch all history for all branches and tags. + - uses: conda-incubator/setup-miniconda@v2 + with: + channels: conda-forge + channel-priority: strict + mamba-version: "*" + activate-environment: xarray-tests + auto-update-conda: false + python-version: "3.8" + + - name: Install conda dependencies + run: | + mamba env update -f ci/requirements/environment.yml + - name: Install mypy + run: | + mamba install --file ci/requirements/mypy_only + - name: Install xarray + run: | + python -m pip install --no-deps -e . 
+ - name: Version info + run: | + conda info -a + conda list + python xarray/util/print_versions.py + - name: Run mypy + run: | + python -m mypy xarray + min-version-policy: name: Minimum Version Policy runs-on: "ubuntu-latest" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e90164575b4..7e967f57e55 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,6 +30,7 @@ repos: # - id: velin # args: ["--write", "--compact"] - repo: https://github.com/pre-commit/mirrors-mypy + # version must correspond to the one in .github/workflows/ci-additional.yaml rev: v0.812 hooks: - id: mypy diff --git a/ci/requirements/mypy_only b/ci/requirements/mypy_only new file mode 100644 index 00000000000..57fe071ee54 --- /dev/null +++ b/ci/requirements/mypy_only @@ -0,0 +1,3 @@ +# used for the "Type checking (mypy)" CI run +# version must correspond to the one in .pre-commit-config.yaml +mypy=0.812 From f554d0a487d8ee286c96002a09f00379c80bd7f4 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Tue, 23 Feb 2021 11:01:27 -0500 Subject: [PATCH 27/46] Add DataArrayCoarsen.reduce and DatasetCoarsen.reduce methods (#4939) --- doc/api-hidden.rst | 2 ++ doc/whats-new.rst | 4 +++ xarray/core/rolling.py | 62 ++++++++++++++++++++++++++++++++-- xarray/tests/test_dataarray.py | 16 +++++++++ xarray/tests/test_dataset.py | 21 ++++++++++++ 5 files changed, 103 insertions(+), 2 deletions(-) diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index e5492ec73a4..14d79039a3a 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -47,6 +47,7 @@ core.rolling.DatasetCoarsen.median core.rolling.DatasetCoarsen.min core.rolling.DatasetCoarsen.prod + core.rolling.DatasetCoarsen.reduce core.rolling.DatasetCoarsen.std core.rolling.DatasetCoarsen.sum core.rolling.DatasetCoarsen.var @@ -190,6 +191,7 @@ core.rolling.DataArrayCoarsen.median core.rolling.DataArrayCoarsen.min core.rolling.DataArrayCoarsen.prod + core.rolling.DataArrayCoarsen.reduce core.rolling.DataArrayCoarsen.std core.rolling.DataArrayCoarsen.sum core.rolling.DataArrayCoarsen.var diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0b2b2834626..4883548a6a9 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -100,6 +100,10 @@ New Features (including globs for the latter) for ``engine="zarr"``, and so allow reading from many remote and other file systems (:pull:`4461`) By `Martin Durant `_ +- :py:class:`DataArrayCoarsen` and :py:class:`DatasetCoarsen` now implement a + ``reduce`` method, enabling coarsening operations with custom reduction + functions (:issue:`3741`, :pull:`4939`). By `Spencer Clark + `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 6087fd4c806..dbdd9595069 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -836,7 +836,9 @@ class DataArrayCoarsen(Coarsen): _reduce_extra_args_docstring = """""" @classmethod - def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool): + def _reduce_method( + cls, func: Callable, include_skipna: bool = False, numeric_only: bool = False + ): """ Return a wrapped function for injecting reduction methods. see ops.inject_reduce_methods @@ -871,6 +873,38 @@ def wrapped_func(self, **kwargs): return wrapped_func + def reduce(self, func: Callable, **kwargs): + """Reduce the items in this group by applying `func` along some + dimension(s). 
+ + Parameters + ---------- + func : callable + Function which can be called in the form `func(x, axis, **kwargs)` + to return the result of collapsing an np.ndarray over the coarsening + dimensions. It must be possible to provide the `axis` argument + with a tuple of integers. + **kwargs : dict + Additional keyword arguments passed on to `func`. + + Returns + ------- + reduced : DataArray + Array with summarized data. + + Examples + -------- + >>> da = xr.DataArray(np.arange(8).reshape(2, 4), dims=("a", "b")) + >>> coarsen = da.coarsen(b=2) + >>> coarsen.reduce(np.sum) + + array([[ 1, 5], + [ 9, 13]]) + Dimensions without coordinates: a, b + """ + wrapped_func = self._reduce_method(func) + return wrapped_func(self, **kwargs) + class DatasetCoarsen(Coarsen): __slots__ = () @@ -878,7 +912,9 @@ class DatasetCoarsen(Coarsen): _reduce_extra_args_docstring = """""" @classmethod - def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool): + def _reduce_method( + cls, func: Callable, include_skipna: bool = False, numeric_only: bool = False + ): """ Return a wrapped function for injecting reduction methods. see ops.inject_reduce_methods @@ -917,6 +953,28 @@ def wrapped_func(self, **kwargs): return wrapped_func + def reduce(self, func: Callable, **kwargs): + """Reduce the items in this group by applying `func` along some + dimension(s). + + Parameters + ---------- + func : callable + Function which can be called in the form `func(x, axis, **kwargs)` + to return the result of collapsing an np.ndarray over the coarsening + dimensions. It must be possible to provide the `axis` argument with + a tuple of integers. + **kwargs : dict + Additional keyword arguments passed on to `func`. + + Returns + ------- + reduced : Dataset + Arrays with summarized data. 
+ """ + wrapped_func = self._reduce_method(func) + return wrapped_func(self, **kwargs) + inject_reduce_methods(DataArrayCoarsen) inject_reduce_methods(DatasetCoarsen) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 68e31cd123a..d1d36dd93b3 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6382,6 +6382,22 @@ def test_coarsen_keep_attrs(): xr.testing.assert_identical(da, da2) +@pytest.mark.parametrize("da", (1, 2), indirect=True) +@pytest.mark.parametrize("window", (1, 2, 3, 4)) +@pytest.mark.parametrize("name", ("sum", "mean", "std", "max")) +def test_coarsen_reduce(da, window, name): + if da.isnull().sum() > 1 and window == 1: + pytest.skip("These parameters lead to all-NaN slices") + + # Use boundary="trim" to accomodate all window sizes used in tests + coarsen_obj = da.coarsen(time=window, boundary="trim") + + # add nan prefix to numpy methods to get similar # behavior as bottleneck + actual = coarsen_obj.reduce(getattr(np, f"nan{name}")) + expected = getattr(coarsen_obj, name)() + assert_allclose(actual, expected) + + @pytest.mark.parametrize("da", (1, 2), indirect=True) def test_rolling_iter(da): rolling_obj = da.rolling(time=7) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index c4161093837..2118bc8b780 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -6055,6 +6055,27 @@ def test_coarsen_keep_attrs(): xr.testing.assert_identical(ds, ds2) +@pytest.mark.slow +@pytest.mark.parametrize("ds", (1, 2), indirect=True) +@pytest.mark.parametrize("window", (1, 2, 3, 4)) +@pytest.mark.parametrize("name", ("sum", "mean", "std", "var", "min", "max", "median")) +def test_coarsen_reduce(ds, window, name): + # Use boundary="trim" to accomodate all window sizes used in tests + coarsen_obj = ds.coarsen(time=window, boundary="trim") + + # add nan prefix to numpy methods to get similar behavior as bottleneck + actual = coarsen_obj.reduce(getattr(np, f"nan{name}")) + expected = getattr(coarsen_obj, name)() + assert_allclose(actual, expected) + + # make sure the order of data_var are not changed. 
+ assert list(ds.data_vars.keys()) == list(actual.data_vars.keys()) + + # Make sure the dimension order is restored + for key, src_var in ds.data_vars.items(): + assert src_var.dims == actual[key].dims + + @pytest.mark.parametrize( "funcname, argument", [ From fd001f1e45035f4e1027c52d1104c992759fad40 Mon Sep 17 00:00:00 2001 From: keewis Date: Tue, 23 Feb 2021 18:28:32 +0100 Subject: [PATCH 28/46] autoupdate mypy (#4943) * add a script which syncs the mypy versions * call the sync script in the workflow * make sure the CI uses the bot as committer * update the installed dependencies * parse all hook versions at once * update all requirements, not just mypy * include the original requirements text in the debug output * use a re.sub instead of str.replace --- .../workflows/ci-pre-commit-autoupdate.yaml | 7 +- .github/workflows/sync_linter_versions.py | 76 +++++++++++++++++++ 2 files changed, 81 insertions(+), 2 deletions(-) create mode 100755 .github/workflows/sync_linter_versions.py diff --git a/.github/workflows/ci-pre-commit-autoupdate.yaml b/.github/workflows/ci-pre-commit-autoupdate.yaml index 784fd05bcb4..70904200cf7 100644 --- a/.github/workflows/ci-pre-commit-autoupdate.yaml +++ b/.github/workflows/ci-pre-commit-autoupdate.yaml @@ -25,8 +25,8 @@ jobs: uses: actions/setup-python@v2 - name: upgrade pip run: python -m pip install --upgrade pip - - name: install pre-commit - run: python -m pip install --upgrade pre-commit + - name: install dependencies + run: python -m pip install --upgrade pre-commit pyyaml packaging - name: version info run: python -m pip list - name: autoupdate @@ -35,7 +35,10 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} EXECUTE_COMMANDS: | python -m pre_commit autoupdate + python .github/workflows/sync_linter_versions.py .pre-commit-config.yaml ci/requirements/mypy_only COMMIT_MESSAGE: 'pre-commit: autoupdate hook versions' + COMMIT_NAME: 'github-actions[bot]' + COMMIT_EMAIL: 'github-actions[bot]@users.noreply.github.com' PR_TITLE: 'pre-commit: autoupdate hook versions' PR_BRANCH_PREFIX: 'pre-commit/' PR_BRANCH_NAME: 'autoupdate-${PR_ID}' diff --git a/.github/workflows/sync_linter_versions.py b/.github/workflows/sync_linter_versions.py new file mode 100755 index 00000000000..cb0b1355c71 --- /dev/null +++ b/.github/workflows/sync_linter_versions.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +import argparse +import itertools +import pathlib +import re + +import yaml +from packaging import version +from packaging.requirements import Requirement + +operator_re = re.compile("=+") + + +def extract_versions(config): + repos = config.get("repos") + if repos is None: + raise ValueError("invalid pre-commit configuration") + + extracted_versions = ( + ((hook["id"], version.parse(repo["rev"])) for hook in repo["hooks"]) + for repo in repos + ) + return dict(itertools.chain.from_iterable(extracted_versions)) + + +def update_requirement(line, new_versions): + # convert to pep-508 compatible + preprocessed = operator_re.sub("==", line) + requirement = Requirement(preprocessed) + + specifier, *_ = requirement.specifier + old_version = specifier.version + new_version = new_versions.get(requirement.name, old_version) + + new_line = f"{requirement.name}={new_version}" + + return new_line + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--dry", action="store_true") + parser.add_argument( + metavar="pre-commit-config", dest="pre_commit_config", type=pathlib.Path + ) + parser.add_argument("requirements", type=pathlib.Path) + args = 
parser.parse_args() + + with args.pre_commit_config.open() as f: + config = yaml.safe_load(f) + + versions = extract_versions(config) + mypy_version = versions["mypy"] + + requirements_text = args.requirements.read_text() + requirements = requirements_text.split("\n") + new_requirements = [ + update_requirement(line, versions) + if line and not line.startswith("# ") + else line + for line in requirements + ] + new_requirements_text = "\n".join(new_requirements) + + if args.dry: + separator = "\n" + "—" * 80 + "\n" + print( + "contents of the old requirements file:", + requirements_text, + "contents of the new requirements file:", + new_requirements_text, + sep=separator, + end=separator, + ) + else: + args.requirements.write_text(new_requirements_text) From 348eb481976673ea772bb8424dd2c3c33c0356c2 Mon Sep 17 00:00:00 2001 From: Jens Hedegaard Nielsen Date: Tue, 23 Feb 2021 20:20:55 +0100 Subject: [PATCH 29/46] Use definition of DTypeLike from Numpy if found (#4941) * Use definition of DTypeLike from Numpy And fall back to original definition with older versions of numpy * fix flake8 --- xarray/core/npcompat.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index 1018332df29..40576d1fc37 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -75,10 +75,12 @@ def moveaxis(a, source, destination): return result -# Type annotations stubs. See also / to be replaced by: -# https://github.com/numpy/numpy/issues/7370 -# https://github.com/numpy/numpy-stubs/ -DTypeLike = Union[np.dtype, str] +# Type annotations stubs. +try: + from numpy.typing import DTypeLike +except ImportError: + # fall back for numpy < 1.20 + DTypeLike = Union[np.dtype, str] # from dask/array/utils.py From 0f65307d6bd611767863edc50a2a755b9bb819ff Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Tue, 23 Feb 2021 21:52:47 +0100 Subject: [PATCH 30/46] typing for numpy 1.20 (#4878) * typing for numpy 1.20 * [skip-ci] add whats-new.rst * update formatting * -> np.dtype * fix bug, use Mapping, check for dict-like * enable typing CI * fixes * remove some unnecessary ignores again --- .github/workflows/ci-additional.yaml | 2 +- doc/whats-new.rst | 1 + xarray/core/accessor_dt.py | 6 +++-- xarray/core/common.py | 37 +++++++++++++++++++++++++--- xarray/core/dataset.py | 6 ++--- xarray/core/formatting.py | 5 ++-- xarray/core/indexing.py | 16 ++++++------ xarray/core/npcompat.py | 4 +-- xarray/core/nputils.py | 2 +- xarray/tests/test_cftime_offsets.py | 2 +- xarray/tests/test_dataarray.py | 3 +++ xarray/tests/test_variable.py | 3 +++ 12 files changed, 62 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index c5f6a06e349..4bf85458211 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -161,7 +161,7 @@ jobs: name: Type checking (mypy) runs-on: "ubuntu-latest" needs: detect-ci-trigger - if: false && needs.detect-ci-trigger.outputs.triggered == 'false' + if: needs.detect-ci-trigger.outputs.triggered == 'false' defaults: run: shell: bash -l {0} diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4883548a6a9..4f703f01007 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -190,6 +190,7 @@ Internal Changes in ipython (:issue:`4741`, :pull:`4742`). By `Richard Kleijn `_. - Added the ``set_close`` method to ``Dataset`` and ``DataArray`` for beckends to specify how to voluntary release all resources. 
(:pull:`#4809`), By `Alessandro Amici `_. +- Update type hints to work with numpy v1.20 (:pull:`4878`). By `Mathias Hauser `_. - Ensure warnings cannot be turned into exceptions in :py:func:`testing.assert_equal` and the other ``assert_*`` functions (:pull:`4864`). By `Mathias Hauser `_. - Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables. diff --git a/xarray/core/accessor_dt.py b/xarray/core/accessor_dt.py index ec67534c651..561d5d30a79 100644 --- a/xarray/core/accessor_dt.py +++ b/xarray/core/accessor_dt.py @@ -9,6 +9,7 @@ is_np_datetime_like, is_np_timedelta_like, ) +from .npcompat import DTypeLike from .pycompat import is_duck_dask_array @@ -178,8 +179,9 @@ class Properties: def __init__(self, obj): self._obj = obj - def _tslib_field_accessor( # type: ignore - name: str, docstring: str = None, dtype: np.dtype = None + @staticmethod + def _tslib_field_accessor( + name: str, docstring: str = None, dtype: DTypeLike = None ): def f(self, dtype=dtype): if dtype is None: diff --git a/xarray/core/common.py b/xarray/core/common.py index 88155234020..db91ec85317 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -16,6 +16,7 @@ Tuple, TypeVar, Union, + overload, ) import numpy as np @@ -35,6 +36,8 @@ if TYPE_CHECKING: from .dataarray import DataArray + from .dataset import Dataset + from .variable import Variable from .weighted import Weighted T_DataWithCoords = TypeVar("T_DataWithCoords", bound="DataWithCoords") @@ -1501,7 +1504,26 @@ def __getitem__(self, value): raise NotImplementedError() -def full_like(other, fill_value, dtype: DTypeLike = None): +@overload +def full_like( + other: "Dataset", + fill_value, + dtype: Union[DTypeLike, Mapping[Hashable, DTypeLike]] = None, +) -> "Dataset": + ... + + +@overload +def full_like(other: "DataArray", fill_value, dtype: DTypeLike = None) -> "DataArray": + ... + + +@overload +def full_like(other: "Variable", fill_value, dtype: DTypeLike = None) -> "Variable": + ... + + +def full_like(other, fill_value, dtype=None): """Return a new object with the same shape and type as a given object. Parameters @@ -1618,15 +1640,22 @@ def full_like(other, fill_value, dtype: DTypeLike = None): f"fill_value must be scalar or, for datasets, a dict-like. Received {fill_value} instead." 
) + if not isinstance(other, Dataset) and isinstance(dtype, Mapping): + raise ValueError( + "'dtype' cannot be dict-like when passing a DataArray or Variable" + ) + if isinstance(other, Dataset): if not isinstance(fill_value, dict): fill_value = {k: fill_value for k in other.data_vars.keys()} - if not isinstance(dtype, dict): - dtype = {k: dtype for k in other.data_vars.keys()} + if not isinstance(dtype, Mapping): + dtype_ = {k: dtype for k in other.data_vars.keys()} + else: + dtype_ = dtype data_vars = { - k: _full_like_variable(v, fill_value.get(k, dtypes.NA), dtype.get(k, None)) + k: _full_like_variable(v, fill_value.get(k, dtypes.NA), dtype_.get(k, None)) for k, v in other.data_vars.items() } return Dataset(data_vars, coords=other.coords, attrs=other.attrs) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 066a2f690b0..bdf29eda197 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4311,7 +4311,7 @@ def dropna( subset = iter(self.data_vars) count = np.zeros(self.dims[dim], dtype=np.int64) - size = 0 + size = np.int_(0) # for type checking for k in subset: array = self._variables[k] @@ -6370,7 +6370,7 @@ def polyfit( lhs = np.vander(x, order) if rcond is None: - rcond = x.shape[0] * np.core.finfo(x.dtype).eps + rcond = x.shape[0] * np.core.finfo(x.dtype).eps # type: ignore # Weights: if w is not None: @@ -6414,7 +6414,7 @@ def polyfit( # deficient ranks nor does it output the "full" info (issue dask/dask#6516) skipna_da = True elif skipna is None: - skipna_da = np.any(da.isnull()) + skipna_da = bool(np.any(da.isnull())) dims_to_stack = [dimname for dimname in da.dims if dimname != dim] stacked_coords: Dict[Hashable, DataArray] = {} diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 0c1be1cc175..2ce6b497290 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -189,9 +189,8 @@ def format_array_flat(array, max_width: int): (max_possibly_relevant < array.size) or (cum_len > max_width).any() ): padding = " ... 
" - count = min( - array.size, max(np.argmax(cum_len + len(padding) - 1 > max_width), 2) - ) + max_len = max(np.argmax(cum_len + len(padding) - 1 > max_width), 2) # type: ignore + count = min(array.size, max_len) else: count = array.size padding = "" if (count <= 1) else " " diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index dff6d75d5b7..1cac5e89906 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -4,7 +4,7 @@ from collections import defaultdict from contextlib import suppress from datetime import timedelta -from typing import Any, Callable, Iterable, Sequence, Tuple, Union +from typing import Any, Callable, Iterable, List, Sequence, Tuple, Union import numpy as np import pandas as pd @@ -1010,7 +1010,7 @@ def _decompose_outer_indexer( return indexer, BasicIndexer(()) assert isinstance(indexer, (OuterIndexer, BasicIndexer)) - backend_indexer = [] + backend_indexer: List[Any] = [] np_indexer = [] # make indexer positive pos_indexer = [] @@ -1397,17 +1397,17 @@ def __init__(self, array: Any, dtype: DTypeLike = None): self.array = utils.safe_cast_to_index(array) if dtype is None: if isinstance(array, pd.PeriodIndex): - dtype = np.dtype("O") + dtype_ = np.dtype("O") elif hasattr(array, "categories"): # category isn't a real numpy dtype - dtype = array.categories.dtype + dtype_ = array.categories.dtype elif not utils.is_valid_numpy_dtype(array.dtype): - dtype = np.dtype("O") + dtype_ = np.dtype("O") else: - dtype = array.dtype + dtype_ = array.dtype else: - dtype = np.dtype(dtype) - self._dtype = dtype + dtype_ = np.dtype(dtype) + self._dtype = dtype_ @property def dtype(self) -> np.dtype: diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index 40576d1fc37..25c103374b8 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -75,12 +75,12 @@ def moveaxis(a, source, destination): return result -# Type annotations stubs. 
+# Type annotations stubs try: from numpy.typing import DTypeLike except ImportError: # fall back for numpy < 1.20 - DTypeLike = Union[np.dtype, str] + DTypeLike = Union[np.dtype, str] # type: ignore # from dask/array/utils.py diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 7e382903046..926f7691ed7 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from numpy.core.multiarray import normalize_axis_index +from numpy.core.multiarray import normalize_axis_index # type: ignore try: import bottleneck as bn diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index b1ecf059f2f..16f6d6827e3 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -479,7 +479,7 @@ def test_minus_offset(a, b): @pytest.mark.parametrize( ("a", "b"), - list(zip(np.roll(_EQ_TESTS_A, 1), _EQ_TESTS_B)) + list(zip(np.roll(_EQ_TESTS_A, 1), _EQ_TESTS_B)) # type: ignore + [(YearEnd(month=1), YearEnd(month=2))], ids=_id_func, ) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index d1d36dd93b3..b28a53023ed 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4200,6 +4200,9 @@ def test_full_like(self): assert expect.dtype == bool assert_identical(expect, actual) + with pytest.raises(ValueError, match="'dtype' cannot be dict-like"): + full_like(da, fill_value=True, dtype={"x": bool}) + def test_dot(self): x = np.linspace(-3, 3, 6) y = np.linspace(-3, 3, 5) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index f9ef8f57ef9..7cb62b4d85f 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -2248,6 +2248,9 @@ def test_full_like(self): with raises_regex(ValueError, "must be scalar"): full_like(orig, [1.0, 2.0]) + with pytest.raises(ValueError, match="'dtype' cannot be dict-like"): + full_like(orig, True, dtype={"x": bool}) + @requires_dask def test_full_like_dask(self): orig = Variable( From df052e7431540fb435ac8742aabc32754a00a7f5 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Tue, 23 Feb 2021 23:37:07 +0100 Subject: [PATCH 31/46] Upstream CI: limit runtime (#4946) * Upstream CI: limit runtime * [test-upstream] run upstream * Update .github/workflows/upstream-dev-ci.yaml * [test-upstream] run upstream * [test-upstream] limit to 60 s * [test-upstream] update parse_logs.py * [test-upstream] run upstream --- .github/workflows/parse_logs.py | 2 +- .github/workflows/upstream-dev-ci.yaml | 2 +- ci/install-upstream-wheels.sh | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/parse_logs.py b/.github/workflows/parse_logs.py index 4d3bea54e50..545beaa4167 100644 --- a/.github/workflows/parse_logs.py +++ b/.github/workflows/parse_logs.py @@ -18,7 +18,7 @@ def extract_short_test_summary_info(lines): ) up_to_section_content = itertools.islice(up_to_start_of_section, 1, None) section_content = itertools.takewhile( - lambda l: l.startswith("FAILED"), up_to_section_content + lambda l: l.startswith("FAILED") or l.startswith("ERROR"), up_to_section_content ) content = "\n".join(section_content) diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index bba7c04a9c2..e55be4da329 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -77,7 +77,7 @@ jobs: id: status run: | set -euo pipefail - python -m pytest -rf | tee output-${{ matrix.python-version }}-log || 
( + python -m pytest --timeout=60 -rf | tee output-${{ matrix.python-version }}-log || ( echo '::set-output name=ARTIFACTS_AVAILABLE::true' && false ) - name: Upload artifacts diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index fe3e706f6a6..8458a8df352 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -16,6 +16,8 @@ conda uninstall -y --force \ pint \ bottleneck \ sparse +# to limit the runtime of Upstream CI +python -m pip install pytest-timeout python -m pip install \ -i https://pypi.anaconda.org/scipy-wheels-nightly/simple \ --no-deps \ From 63f2e5da5391a8a471a2747335a5d9a3e1a43b3b Mon Sep 17 00:00:00 2001 From: keewis Date: Wed, 24 Feb 2021 21:30:35 +0100 Subject: [PATCH 32/46] bump the dependencies (#4942) * update all dependencies * document the bumped dependencies [skip-ci] * don't list the indirect dependencies [skip-ci] * minor fix * remove the comment about conflicts with h5py=2.10 [skip-ci] * update whats-new.rst [skip-ci] --- ci/requirements/py37-bare-minimum.yml | 4 ++-- ci/requirements/py37-min-all-deps.yml | 34 +++++++++++++-------------- doc/whats-new.rst | 19 +++++++++++---- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/ci/requirements/py37-bare-minimum.yml b/ci/requirements/py37-bare-minimum.yml index fbeb87032b7..408cf76fdd6 100644 --- a/ci/requirements/py37-bare-minimum.yml +++ b/ci/requirements/py37-bare-minimum.yml @@ -10,6 +10,6 @@ dependencies: - pytest-cov - pytest-env - pytest-xdist - - numpy=1.15 - - pandas=0.25 + - numpy=1.17 + - pandas=1.0 - setuptools=40.4 diff --git a/ci/requirements/py37-min-all-deps.yml b/ci/requirements/py37-min-all-deps.yml index 166836243b4..c2fd2e18a8a 100644 --- a/ci/requirements/py37-min-all-deps.yml +++ b/ci/requirements/py37-min-all-deps.yml @@ -8,42 +8,42 @@ dependencies: # When upgrading python, numpy, or pandas, must also change # doc/installing.rst and setup.py. - python=3.7 - - boto3=1.9 - - bottleneck=1.2 + - boto3=1.12 + - bottleneck=1.3 - cartopy=0.17 - cdms2=3.1 - cfgrib=0.9 - cftime=1.0 - coveralls - - dask=2.9 - - distributed=2.9 - - h5netcdf=0.7 - - h5py=2.9 # Policy allows for 2.10, but it's a conflict-fest + - dask=2.11 + - distributed=2.11 + - h5netcdf=0.8 + - h5py=2.10 - hdf5=1.10 - hypothesis - - iris=2.2 - - lxml=4.4 # Optional dep of pydap + - iris=2.4 + - lxml=4.5 # Optional dep of pydap - matplotlib-base=3.1 - nc-time-axis=1.2 - - netcdf4=1.4 - - numba=0.46 - - numpy=1.15 - - pandas=0.25 + - netcdf4=1.5 + - numba=0.48 + - numpy=1.17 + - pandas=1.0 # - pint # See py37-min-nep18.yml - pip - - pseudonetcdf=3.0 + - pseudonetcdf=3.1 - pydap=3.2 - pynio=1.5 - pytest - pytest-cov - pytest-env - pytest-xdist - - rasterio=1.0 - - scipy=1.3 - - seaborn=0.9 + - rasterio=1.1 + - scipy=1.4 + - seaborn=0.10 - setuptools=40.4 # - sparse # See py37-min-nep18.yml - toolz=0.10 - - zarr=2.3 + - zarr=2.4 - pip: - numbagg==0.1 diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4f703f01007..5051a64af79 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,18 +25,29 @@ Breaking changes - xarray no longer supports python 3.6 The minimum version policy was changed to also apply to projects with irregular - releases. - - The minimum versions of some other dependencies were changed: + releases. 
As a result, the minimum versions of some dependencies have changed: ============ ====== ==== Package Old New ============ ====== ==== Python 3.6 3.7 setuptools 38.4 40.4 + numpy 1.15 1.17 + pandas 0.25 1.0 + dask 2.9 2.11 + distributed 2.9 2.11 + bottleneck 1.2 1.3 + h5netcdf 0.7 0.8 + iris 2.2 2.4 + netcdf4 1.4 1.5 + pseudonetcdf 3.0 3.1 + rasterio 1.0 1.1 + scipy 1.3 1.4 + seaborn 0.9 0.10 + zarr 2.3 2.4 ============ ====== ==== - (:issue:`4688`, :pull:`4720`, :pull:`4907`) + (:issue:`4688`, :pull:`4720`, :pull:`4907`, :pull:`4942`) By `Justus Magin `_. - use ``pyproject.toml`` instead of the ``setup_requires`` option for ``setuptools`` (:pull:`4897`). From 351b0aac671a910803dcf618a18bb33cc8f8f2ab Mon Sep 17 00:00:00 2001 From: keewis Date: Wed, 24 Feb 2021 22:17:25 +0100 Subject: [PATCH 33/46] document update as inplace (#4932) * explicitly state that update works inplace * point to assign * update whats-new.rst [skip-ci] * rewrite the docstring [skip-ci] * deprecate the return value of Dataset.update * add the issue and pull request numbers [skip-ci] * add a ETA for the removal of the return value [skip-ci] --- doc/whats-new.rst | 5 +++++ xarray/core/dataset.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 5051a64af79..a181c2b3320 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -79,6 +79,9 @@ Deprecations For now using ``dim`` issues a ``FutureWarning``. It will be removed in version 0.19.0 (:pull:`3993`). By `Tom Nicholas `_. +- the return value of :py:meth:`Dataset.update` is being deprecated to make it work more + like :py:meth:`dict.update`. It will be removed in version 0.19.0 (:pull:`4932`). + By `Justus Magin `_. New Features @@ -181,6 +184,8 @@ Documentation - add concat examples and improve combining documentation (:issue:`4620`, :pull:`4645`). By `Ray Bell `_ and `Justus Magin `_. +- explicitly mention that :py:meth:`Dataset.update` updates inplace (:issue:`2951`, :pull:`4932`). + By `Justus Magin `_. Internal Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index bdf29eda197..9faf74dd4bc 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3865,6 +3865,8 @@ def unstack( def update(self, other: "CoercibleMapping") -> "Dataset": """Update this dataset's variables with those from another dataset. + Just like :py:meth:`dict.update` this is a in-place operation. + Parameters ---------- other : Dataset or mapping @@ -3879,13 +3881,20 @@ def update(self, other: "CoercibleMapping") -> "Dataset": Returns ------- updated : Dataset - Updated dataset. + Updated dataset. Note that since the update is in-place this is the input + dataset. + + It is deprecated since version 0.17 and scheduled to be removed in 0.19. Raises ------ ValueError If any dimensions would have inconsistent sizes in the updated dataset. 
+ + See Also + -------- + Dataset.assign """ merge_result = dataset_update_method(self, other) return self._replace(inplace=True, **merge_result._asdict()) From eda5f1dc2c58da37b04370e04a5b89ba2485e84a Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Wed, 24 Feb 2021 16:54:45 -0800 Subject: [PATCH 34/46] Add 0.17.0 release notes (#4953) * Add 0.17.0 release notes * _ * Apply suggestions from code review * reflow release message * fix some whats-new.rst entries * fix the deprecations section * fix more entries * minor fix * more fixes Co-authored-by: keewis Co-authored-by: Keewis --- doc/whats-new.rst | 169 ++++++++++++++++++++++++++++------------------ 1 file changed, 105 insertions(+), 64 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a181c2b3320..ef7f5b43fdd 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -15,10 +15,24 @@ What's New np.random.seed(123456) -.. _whats-new.0.16.3: +.. _whats-new.0.17.0: -v0.17.0 (unreleased) --------------------- +v0.17.0 (24 Feb 2021) +--------------------- + +This release brings a few important performance improvements, a wide range of +usability upgrades, lots of bug fixes, and some new features. These include +better ``cftime`` support, a new quiver plot, better ``unstack`` performance, +more efficient memory use in rolling operations, and some python packaging +improvements. We also have a few documentation improvements (and more planned!). + +Many thanks to the 36 contributors to this release: Alessandro Amici, Anderson +Banihirwe, Aureliana Barghini, Ayrton Bourn, Benjamin Bean, Blair Bonnett, Chun +Ho Chow, DWesl, Daniel Mesejo-León, Deepak Cherian, Eric Keenan, Illviljan, Jens +Hedegaard Nielsen, Jody Klymak, Julien Seguinot, Julius Busecke, Kai Mühlbauer, +Leif Denby, Martin Durant, Mathias Hauser, Maximilian Roos, Michael Mann, Ray +Bell, RichardScottOZ, Spencer Clark, Tim Gates, Tom Nicholas, Yunus Sevinchan, +alexamici, aurghs, crusaderky, dcherian, ghislainp, keewis, rhkleijn Breaking changes ~~~~~~~~~~~~~~~~ @@ -48,24 +62,17 @@ Breaking changes ============ ====== ==== (:issue:`4688`, :pull:`4720`, :pull:`4907`, :pull:`4942`) - By `Justus Magin `_. -- use ``pyproject.toml`` instead of the ``setup_requires`` option for - ``setuptools`` (:pull:`4897`). - By `Justus Magin `_. - As a result of :pull:`4684` the default units encoding for datetime-like values (``np.datetime64[ns]`` or ``cftime.datetime``) will now always be set such that ``int64`` values can be used. In the past, no units finer than "seconds" were chosen, which would sometimes mean that ``float64`` values were required, which would lead to inaccurate I/O round-trips. - Variables referred to in attributes like ``bounds`` and ``grid_mapping`` - are can be set as coordinate variables. These attributes - are moved to :py:attr:`DataArray.encoding` from - :py:attr:`DataArray.attrs`. This behaviour is controlled by the - ``decode_coords`` kwarg to :py:func:`open_dataset` and + can be set as coordinate variables. These attributes are moved to + :py:attr:`DataArray.encoding` from :py:attr:`DataArray.attrs`. This behaviour + is controlled by the ``decode_coords`` kwarg to :py:func:`open_dataset` and :py:func:`open_mfdataset`. The full list of decoded attributes is in :ref:`weather-climate` (:pull:`2844`, :issue:`3689`) -- remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull:`4725`). - By `Aureliana Barghini `_. 
- As a result of :pull:`4911` the output from calling :py:meth:`DataArray.sum` or :py:meth:`DataArray.prod` on an integer array with ``skipna=True`` and a non-None value for ``min_count`` will now be a float array rather than an @@ -79,45 +86,48 @@ Deprecations For now using ``dim`` issues a ``FutureWarning``. It will be removed in version 0.19.0 (:pull:`3993`). By `Tom Nicholas `_. +- Deprecated ``autoclose`` kwargs from :py:func:`open_dataset` are removed (:pull:`4725`). + By `Aureliana Barghini `_. - the return value of :py:meth:`Dataset.update` is being deprecated to make it work more like :py:meth:`dict.update`. It will be removed in version 0.19.0 (:pull:`4932`). By `Justus Magin `_. - New Features ~~~~~~~~~~~~ -- Xarray now leverages updates as of cftime version 1.4.1, which enable exact I/O - roundtripping of ``cftime.datetime`` objects (:pull:`4758`). - By `Spencer Clark `_. -- Most rolling operations use significantly less memory. (:issue:`4325`). - By `Deepak Cherian `_. - :py:meth:`~xarray.cftime_range` and :py:meth:`DataArray.resample` now support millisecond (``"L"`` or ``"ms"``) and microsecond (``"U"`` or ``"us"``) frequencies for ``cftime.datetime`` coordinates (:issue:`4097`, :pull:`4758`). By `Spencer Clark `_. - Significantly higher ``unstack`` performance on numpy-backed arrays which - contain missing values; 8x faster in our benchmark, and 2x faster than pandas. - (:pull:`4746`); + contain missing values; 8x faster than previous versions in our benchmark, and + now 2x faster than pandas (:pull:`4746`). By `Maximilian Roos `_. - Add :py:meth:`Dataset.plot.quiver` for quiver plots with :py:class:`Dataset` variables. By `Deepak Cherian `_. -- add ``"drop_conflicts"`` to the strategies supported by the ``combine_attrs`` kwarg +- Add ``"drop_conflicts"`` to the strategies supported by the ``combine_attrs`` kwarg (:issue:`4749`, :pull:`4827`). By `Justus Magin `_. -- :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims - in the form of kwargs as well as a dict, like most similar methods. - By `Maximilian Roos `_. - Allow installing from git archives (:pull:`4897`). By `Justus Magin `_. - -- :py:func:`open_dataset` and :py:func:`open_mfdataset` now accept ``fsspec`` URLs - (including globs for the latter) for ``engine="zarr"``, and so allow reading from - many remote and other file systems (:pull:`4461`) - By `Martin Durant `_ - :py:class:`DataArrayCoarsen` and :py:class:`DatasetCoarsen` now implement a ``reduce`` method, enabling coarsening operations with custom reduction functions (:issue:`3741`, :pull:`4939`). By `Spencer Clark `_. +- Most rolling operations use significantly less memory. (:issue:`4325`). + By `Deepak Cherian `_. +- Add :py:meth:`Dataset.drop_isel` and :py:meth:`DataArray.drop_isel` + (:issue:`4658`, :pull:`4819`). + By `Daniel Mesejo `_. +- Xarray now leverages updates as of cftime version 1.4.1, which enable exact I/O + roundtripping of ``cftime.datetime`` objects (:pull:`4758`). + By `Spencer Clark `_. +- :py:func:`open_dataset` and :py:func:`open_mfdataset` now accept ``fsspec`` URLs + (including globs for the latter) for ``engine="zarr"``, and so allow reading from + many remote and other file systems (:pull:`4461`) + By `Martin Durant `_ +- :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims + in the form of kwargs as well as a dict, like most similar methods. + By `Maximilian Roos `_. 
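As a quick orientation for the feature list above, an illustrative sketch of two of the new APIs (the array and coordinate values here are made up):

>>> import numpy as np
>>> import xarray as xr
>>> da = xr.DataArray(
...     np.arange(6).reshape(2, 3),
...     dims=("x", "y"),
...     coords={"z": ("x", [10, 20])},
... )
>>> swapped = da.swap_dims(x="z")  # kwargs form, equivalent to da.swap_dims({"x": "z"})
>>> coarse = da.coarsen(y=3).reduce(np.sum)  # coarsen with a custom reduction function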
Bug fixes ~~~~~~~~~ @@ -125,29 +135,38 @@ Bug fixes :py:func:`~xarray.core.variable.as_compatible_data` instead of blanket access to ``values`` attribute (:issue:`2097`) By `Yunus Sevinchan `_. -- :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` do not trigger computations anymore if :py:meth:`Dataset.weighted` or :py:meth:`DataArray.weighted` are applied (:issue:`4625`, :pull:`4668`). By `Julius Busecke `_. -- :py:func:`merge` with ``combine_attrs='override'`` makes a copy of the attrs (:issue:`4627`). -- By default, when possible, xarray will now always use values of type ``int64`` when encoding - and decoding ``numpy.datetime64[ns]`` datetimes. This ensures that maximum - precision and accuracy are maintained in the round-tripping process - (:issue:`4045`, :pull:`4684`). It also enables encoding and decoding standard calendar - dates with time units of nanoseconds (:pull:`4400`). By `Spencer Clark - `_ and `Mark Harfouche `_. +- :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` do not trigger + computations anymore if :py:meth:`Dataset.weighted` or + :py:meth:`DataArray.weighted` are applied (:issue:`4625`, :pull:`4668`). By + `Julius Busecke `_. +- :py:func:`merge` with ``combine_attrs='override'`` makes a copy of the attrs + (:issue:`4627`). +- By default, when possible, xarray will now always use values of + type ``int64`` when encoding and decoding ``numpy.datetime64[ns]`` datetimes. This + ensures that maximum precision and accuracy are maintained in the round-tripping + process (:issue:`4045`, :pull:`4684`). It also enables encoding and decoding standard + calendar dates with time units of nanoseconds (:pull:`4400`). + By `Spencer Clark `_ and `Mark Harfouche + `_. - :py:meth:`DataArray.astype`, :py:meth:`Dataset.astype` and :py:meth:`Variable.astype` support the ``order`` and ``subok`` parameters again. This fixes a regression introduced in version 0.16.1 (:issue:`4644`, :pull:`4683`). By `Richard Kleijn `_ . - Remove dictionary unpacking when using ``.loc`` to avoid collision with ``.sel`` parameters (:pull:`4695`). - By `Anderson Banihirwe `_ + By `Anderson Banihirwe `_. - Fix the legend created by :py:meth:`Dataset.plot.scatter` (:issue:`4641`, :pull:`4723`). By `Justus Magin `_. -- Fix a crash in orthogonal indexing on geographic coordinates with ``engine='cfgrib'`` (:issue:`4733` :pull:`4737`). - By `Alessandro Amici `_ +- Fix a crash in orthogonal indexing on geographic coordinates with ``engine='cfgrib'`` + (:issue:`4733` :pull:`4737`). + By `Alessandro Amici `_. - Coordinates with dtype ``str`` or ``bytes`` now retain their dtype on many operations, e.g. ``reindex``, ``align``, ``concat``, ``assign``, previously they were cast to an object dtype - (:issue:`2658` and :issue:`4543`) by `Mathias Hauser `_. -- Limit number of data rows when printing large datasets. (:issue:`4736`, :pull:`4750`). By `Jimmy Westling `_. -- Add ``missing_dims`` parameter to transpose (:issue:`4647`, :pull:`4767`). By `Daniel Mesejo `_. + (:issue:`2658` and :issue:`4543`). + By `Mathias Hauser `_. +- Limit number of data rows when printing large datasets. (:issue:`4736`, :pull:`4750`). + By `Jimmy Westling `_. +- Add ``missing_dims`` parameter to transpose (:issue:`4647`, :pull:`4767`). + By `Daniel Mesejo `_. - Resolve intervals before appending other metadata to labels when plotting (:issue:`4322`, :pull:`4794`). By `Justus Magin `_. 
- Fix regression when decoding a variable with a ``scale_factor`` and ``add_offset`` given @@ -158,8 +177,9 @@ Bug fixes - Raise DeprecationWarning when trying to typecast a tuple containing a :py:class:`DataArray`. User now prompted to first call `.data` on it (:issue:`4483`). By `Chun Ho Chow `_. -- Add :py:meth:`Dataset.drop_isel` and :py:meth:`DataArray.drop_isel` (:issue:`4658`, :pull:`4819`). By `Daniel Mesejo `_. -- Ensure that :py:meth:`Dataset.interp` raises ``ValueError`` when interpolating outside coordinate range and ``bounds_error=True`` (:issue:`4854`, :pull:`4855`). +- Ensure that :py:meth:`Dataset.interp` raises ``ValueError`` when interpolating + outside coordinate range and ``bounds_error=True`` (:issue:`4854`, + :pull:`4855`). By `Leif Denby `_. - Fix time encoding bug associated with using cftime versions greater than 1.4.0 with xarray (:issue:`4870`, :pull:`4871`). By `Spencer Clark `_. @@ -177,15 +197,17 @@ Bug fixes Documentation ~~~~~~~~~~~~~ -- add information about requirements for accessor classes (:issue:`2788`, :pull:`4657`). +- Add information about requirements for accessor classes (:issue:`2788`, :pull:`4657`). By `Justus Magin `_. -- start a list of external I/O integrating with ``xarray`` (:issue:`683`, :pull:`4566`). +- Start a list of external I/O integrating with ``xarray`` (:issue:`683`, :pull:`4566`). By `Justus Magin `_. -- add concat examples and improve combining documentation (:issue:`4620`, :pull:`4645`). +- Add concat examples and improve combining documentation (:issue:`4620`, :pull:`4645`). By `Ray Bell `_ and `Justus Magin `_. - explicitly mention that :py:meth:`Dataset.update` updates inplace (:issue:`2951`, :pull:`4932`). By `Justus Magin `_. +- Added docs on vectorized indexing (:pull:`4711`). + By `Eric Keenan `_. Internal Changes ~~~~~~~~~~~~~~~~ @@ -197,30 +219,49 @@ Internal Changes - Run the tests in parallel using pytest-xdist (:pull:`4694`). By `Justus Magin `_ and `Mathias Hauser `_. - +- Use ``pyproject.toml`` instead of the ``setup_requires`` option for + ``setuptools`` (:pull:`4897`). + By `Justus Magin `_. - Replace all usages of ``assert x.identical(y)`` with ``assert_identical(x, y)`` - for clearer error messages. - (:pull:`4752`); + for clearer error messages (:pull:`4752`). By `Maximilian Roos `_. -- Speed up attribute style access (e.g. ``ds.somevar`` instead of ``ds["somevar"]``) and tab completion - in ipython (:issue:`4741`, :pull:`4742`). By `Richard Kleijn `_. -- Added the ``set_close`` method to ``Dataset`` and ``DataArray`` for beckends to specify how to voluntary release - all resources. (:pull:`#4809`), By `Alessandro Amici `_. -- Update type hints to work with numpy v1.20 (:pull:`4878`). By `Mathias Hauser `_. +- Speed up attribute style access (e.g. ``ds.somevar`` instead of ``ds["somevar"]``) and + tab completion in IPython (:issue:`4741`, :pull:`4742`). + By `Richard Kleijn `_. +- Added the ``set_close`` method to ``Dataset`` and ``DataArray`` for backends + to specify how to voluntary release all resources. (:pull:`#4809`) + By `Alessandro Amici `_. +- Update type hints to work with numpy v1.20 (:pull:`4878`). + By `Mathias Hauser `_. - Ensure warnings cannot be turned into exceptions in :py:func:`testing.assert_equal` and - the other ``assert_*`` functions (:pull:`4864`). By `Mathias Hauser `_. -- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables. - By `Deepak Cherian `_ + the other ``assert_*`` functions (:pull:`4864`). 
+ By `Mathias Hauser `_. +- Performance improvement when constructing DataArrays. Significantly speeds up + repr for Datasets with large number of variables. + By `Deepak Cherian `_. .. _whats-new.0.16.2: v0.16.2 (30 Nov 2020) --------------------- -This release brings the ability to write to limited regions of ``zarr`` files, open zarr files with :py:func:`open_dataset` and :py:func:`open_mfdataset`, increased support for propagating ``attrs`` using the ``keep_attrs`` flag, as well as numerous bugfixes and documentation improvements. - -Many thanks to the 31 contributors who contributed to this release: -Aaron Spring, Akio Taniguchi, Aleksandar Jelenak, alexamici, Alexandre Poux, Anderson Banihirwe, Andrew Pauling, Ashwin Vishnu, aurghs, Brian Ward, Caleb, crusaderky, Dan Nowacki, darikg, David Brochart, David Huard, Deepak Cherian, Dion Häfner, Gerardo Rivera, Gerrit Holl, Illviljan, inakleinbottle, Jacob Tomlinson, James A. Bednar, jenssss, Joe Hamman, johnomotani, Joris Van den Bossche, Julia Kent, Julius Busecke, Kai Mühlbauer, keewis, Keisuke Fujii, Kyle Cranmer, Luke Volpatti, Mathias Hauser, Maximilian Roos, Michaël Defferrard, Michal Baumgartner, Nick R. Papior, Pascal Bourgault, Peter Hausamann, PGijsbers, Ray Bell, Romain Martinez, rpgoldman, Russell Manser, Sahid Velji, Samnan Rahee, Sander, Spencer Clark, Stephan Hoyer, Thomas Zilio, Tobias Kölling, Tom Augspurger, Wei Ji, Yash Saboo, Zeb Nicholls, +This release brings the ability to write to limited regions of ``zarr`` files, +open zarr files with :py:func:`open_dataset` and :py:func:`open_mfdataset`, +increased support for propagating ``attrs`` using the ``keep_attrs`` flag, as +well as numerous bugfixes and documentation improvements. + +Many thanks to the 31 contributors who contributed to this release: Aaron +Spring, Akio Taniguchi, Aleksandar Jelenak, alexamici, Alexandre Poux, Anderson +Banihirwe, Andrew Pauling, Ashwin Vishnu, aurghs, Brian Ward, Caleb, crusaderky, +Dan Nowacki, darikg, David Brochart, David Huard, Deepak Cherian, Dion Häfner, +Gerardo Rivera, Gerrit Holl, Illviljan, inakleinbottle, Jacob Tomlinson, James +A. Bednar, jenssss, Joe Hamman, johnomotani, Joris Van den Bossche, Julia Kent, +Julius Busecke, Kai Mühlbauer, keewis, Keisuke Fujii, Kyle Cranmer, Luke +Volpatti, Mathias Hauser, Maximilian Roos, Michaël Defferrard, Michal +Baumgartner, Nick R. 
Papior, Pascal Bourgault, Peter Hausamann, PGijsbers, Ray +Bell, Romain Martinez, rpgoldman, Russell Manser, Sahid Velji, Samnan Rahee, +Sander, Spencer Clark, Stephan Hoyer, Thomas Zilio, Tobias Kölling, Tom +Augspurger, Wei Ji, Yash Saboo, Zeb Nicholls, Deprecations ~~~~~~~~~~~~ From 835a53e62b1e5018faa323c649149e8294dd6af7 Mon Sep 17 00:00:00 2001 From: Ray Bell Date: Thu, 25 Feb 2021 03:46:41 -0500 Subject: [PATCH 35/46] DOC: rm np import (#4949) * DOC: add xr import * DOC: remove imports --- xarray/core/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index db91ec85317..321bd632811 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1188,7 +1188,6 @@ def where(self, cond, other=dtypes.NA, drop: bool = False): Examples -------- - >>> import numpy as np >>> a = xr.DataArray(np.arange(25).reshape(5, 5), dims=("x", "y")) >>> a From f74c446332feba47615fa02e75cda961c974c2c0 Mon Sep 17 00:00:00 2001 From: Ray Bell Date: Fri, 26 Feb 2021 01:55:48 -0500 Subject: [PATCH 36/46] DOC: add example for reindex (#4956) * DOC: add example for reindex * rm white space * use arange --- xarray/core/dataarray.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 34354da61e2..e6209b0604b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1498,6 +1498,26 @@ def reindex( Another dataset array, with this array's data but replaced coordinates. + Examples + -------- + Reverse latitude: + + >>> da = xr.DataArray( + ... np.arange(4), + ... coords=[np.array([90, 89, 88, 87])], + ... dims="lat", + ... ) + >>> da + + array([0, 1, 2, 3]) + Coordinates: + * lat (lat) int64 90 89 88 87 + >>> da.reindex(lat=da.lat[::-1]) + + array([3, 2, 1, 0]) + Coordinates: + * lat (lat) int64 87 88 89 90 + See Also -------- DataArray.reindex_like From 318816a8304cf637417e889e563eafe63fde7c12 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Fri, 26 Feb 2021 11:11:42 -0800 Subject: [PATCH 37/46] Refinements to how-to-release (#4964) --- HOW_TO_RELEASE.md | 55 ++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/HOW_TO_RELEASE.md b/HOW_TO_RELEASE.md index 5352d427909..e30b05c40e2 100644 --- a/HOW_TO_RELEASE.md +++ b/HOW_TO_RELEASE.md @@ -21,9 +21,9 @@ upstream https://github.com/pydata/xarray (push) 2. Confirm there are no commits on stable that are not yet merged ([ref](https://github.com/pydata/xarray/pull/4440)): ```sh - git merge upstream stable + git merge upstream/stable ``` - 2. Add a list of contributors with: + 3. Add a list of contributors with: ```sh git log "$(git tag --sort="v:refname" | sed -n 'x;$p').." --format=%aN | sort -u | perl -pe 's/\n/$1, /' ``` @@ -35,9 +35,9 @@ upstream https://github.com/pydata/xarray (push) ```sh git log v{0.X.Y-1}.. --format=%aN | sort -u | wc -l ``` - 3. Write a release summary: ~50 words describing the high level features. This + 4. Write a release summary: ~50 words describing the high level features. This will be used in the release emails, tweets, GitHub release notes, etc. - 4. Look over whats-new.rst and the docs. Make sure "What's New" is complete + 5. Look over whats-new.rst and the docs. Make sure "What's New" is complete (check the date!) and add the release summary at the top. Things to watch out for: - Important new features should be highlighted towards the top. 
@@ -46,46 +46,48 @@ upstream https://github.com/pydata/xarray (push) due to a bad merge. Check for these before a release by using git diff, e.g., `git diff v{0.X.Y-1} whats-new.rst` where {0.X.Y-1} is the previous release. - 5. If possible, open a PR with the release summary and whatsnew changes. - 6. After merging, again ensure your master branch is synced to upstream: + 6. Open a PR with the release summary and whatsnew changes; in particular the + release headline should get feedback from the team on what's important to include. + 7. After merging, again ensure your master branch is synced to upstream: ```sh git pull upstream master ``` - 7. If you have any doubts, run the full test suite one final time! + 8. If you have any doubts, run the full test suite one final time! ```sh pytest ``` - 8. Check that the ReadTheDocs build is passing. - 9. On the master branch, commit the release in git: - ```sh - git commit -am 'Release v{0.X.Y}' - ``` + 9. Check that the ReadTheDocs build is passing. 10. Tag the release: ```sh git tag -a v{0.X.Y} -m 'v{0.X.Y}' ``` -11. Build source and binary wheels for PyPI: +11. Ensure the dependencies for building are installed: ```sh - git clean -xdf # this deletes all uncommitted changes! + pip install setuptools-scm twine wheel + ``` +12. Build source and binary wheels for PyPI: + ```sh + git clean -xdf # This removes any untracked files! + git restore -SW . # This removes any tracked changes! python setup.py bdist_wheel sdist ``` -12. Use twine to check the package build: +13. Use twine to check the package build: ```sh twine check dist/xarray-{0.X.Y}* ``` -13. Use twine to register and upload the release on PyPI. Be careful, you can't +14. Use twine to register and upload the release on PyPI. Be careful, you can't take this back! ```sh twine upload dist/xarray-{0.X.Y}* ``` You will need to be listed as a package owner at for this to work. -14. Push your changes to master: +15. Push your changes to master: ```sh git push upstream master git push upstream --tags ``` -15. Update the stable branch (used by ReadTheDocs) and switch back to master: +16. Update the stable branch (used by ReadTheDocs) and switch back to master: ```sh git switch stable git rebase master @@ -95,18 +97,22 @@ upstream https://github.com/pydata/xarray (push) It's OK to force push to `stable` if necessary. (We also update the stable branch with `git cherry-pick` for documentation only fixes that apply the current released version.) -16. Add a section for the next release {0.X.Y+1} to doc/whats-new.rst: +17. Add a section for the next release {0.X.Y+1} to doc/whats-new.rst: ```rst .. _whats-new.{0.X.Y+1}: v{0.X.Y+1} (unreleased) --------------------- + New Features + ~~~~~~~~~~~~ + + Breaking changes ~~~~~~~~~~~~~~~~ - New Features + Deprecations ~~~~~~~~~~~~ @@ -120,20 +126,21 @@ upstream https://github.com/pydata/xarray (push) Internal Changes ~~~~~~~~~~~~~~~~ + ``` -17. Commit your changes and push to master again: +18. Commit your changes and push to master again: ```sh git commit -am 'New whatsnew section' git push upstream master ``` You're done pushing to master! -18. Issue the release on GitHub. Click on "Draft a new release" at +19. Issue the release on GitHub. Click on "Draft a new release" at . Type in the version number and paste the release summary in the notes. -19. Update the docs. Login to +20. Update the docs. Login to and switch your new release tag (at the bottom) from "Inactive" to "Active". It should now build automatically. -20. 
Issue the release announcement to mailing lists & Twitter. For bug fix releases, I +21. Issue the release announcement to mailing lists & Twitter. For bug fix releases, I usually only email xarray@googlegroups.com. For major/feature releases, I will email a broader list (no more than once every 3-6 months): - pydata@googlegroups.com From 48378c4b11c5c2672ff91396d4284743165b4fbe Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sat, 27 Feb 2021 04:00:25 -0800 Subject: [PATCH 38/46] Whatsnew for 0.17.1 (#4963) --- doc/whats-new.rst | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ef7f5b43fdd..eed4e16eb62 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,6 +14,34 @@ What's New np.random.seed(123456) +.. _whats-new.0.17.1: + +v0.17.1 (unreleased) +--------------------- + + +New Features +~~~~~~~~~~~~ + + +Breaking changes +~~~~~~~~~~~~~~~~ + + +Deprecations +~~~~~~~~~~~~ + + +Bug fixes +~~~~~~~~~ + + +Documentation +~~~~~~~~~~~~~ + + +Internal Changes +~~~~~~~~~~~~~~~~ .. _whats-new.0.17.0: From 66acafa7f1f1477cfd6c5b7c3458859763433092 Mon Sep 17 00:00:00 2001 From: keewis Date: Thu, 4 Mar 2021 01:45:43 +0100 Subject: [PATCH 39/46] raise on passing axis to Dataset.reduce methods (#4940) * don't allow passing 'axis' to Dataset.reduce methods * check that Dataset.reduce raises for axis kwargs * update whats-new.rst [skip-ci] * remove the broken axis kwarg to Dataset argmin / argmax * remove tests which depended on axis being passed through by **kwargs * don't try to test calling numpy reduce functions on dataset objects * Update doc/whats-new.rst Co-authored-by: Mathias Hauser Co-authored-by: Mathias Hauser --- doc/whats-new.rst | 3 ++- xarray/core/dataset.py | 45 ++++++++++++++++-------------------- xarray/tests/test_dataset.py | 9 +++----- xarray/tests/test_units.py | 29 ----------------------- 4 files changed, 25 insertions(+), 61 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index eed4e16eb62..7c3d57f4fe8 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -34,7 +34,8 @@ Deprecations Bug fixes ~~~~~~~~~ - +- Don't allow passing ``axis`` to :py:meth:`Dataset.reduce` methods (:issue:`3510`, :pull:`4940`). + By `Justus Magin `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9faf74dd4bc..cbc30dddda9 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4665,6 +4665,12 @@ def reduce( Dataset with this object's DataArrays replaced with new DataArrays of summarized data and the indicated dimension(s) removed. """ + if "axis" in kwargs: + raise ValueError( + "passing 'axis' to Dataset reduce methods is ambiguous." + " Please use 'dim' instead." + ) + if dim is None or dim is ...: dims = set(self.dims) elif isinstance(dim, str) or not isinstance(dim, Iterable): @@ -6854,7 +6860,7 @@ def idxmax( ) ) - def argmin(self, dim=None, axis=None, **kwargs): + def argmin(self, dim=None, **kwargs): """Indices of the minima of the member variables. If there are multiple minima, the indices of the first one found will be @@ -6868,9 +6874,6 @@ def argmin(self, dim=None, axis=None, **kwargs): this is deprecated, in future will be an error, since DataArray.argmin will return a dict with indices for all dimensions, which does not make sense for a Dataset. - axis : int, optional - Axis over which to apply `argmin`. Only one of the 'dim' and 'axis' arguments - can be supplied. 
keep_attrs : bool, optional If True, the attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be @@ -6888,28 +6891,25 @@ def argmin(self, dim=None, axis=None, **kwargs): See Also -------- DataArray.argmin - """ - if dim is None and axis is None: + if dim is None: warnings.warn( - "Once the behaviour of DataArray.argmin() and Variable.argmin() with " - "neither dim nor axis argument changes to return a dict of indices of " - "each dimension, for consistency it will be an error to call " - "Dataset.argmin() with no argument, since we don't return a dict of " - "Datasets.", + "Once the behaviour of DataArray.argmin() and Variable.argmin() without " + "dim changes to return a dict of indices of each dimension, for " + "consistency it will be an error to call Dataset.argmin() with no argument," + "since we don't return a dict of Datasets.", DeprecationWarning, stacklevel=2, ) if ( dim is None - or axis is not None or (not isinstance(dim, Sequence) and dim is not ...) or isinstance(dim, str) ): # Return int index if single dimension is passed, and is not part of a # sequence argmin_func = getattr(duck_array_ops, "argmin") - return self.reduce(argmin_func, dim=dim, axis=axis, **kwargs) + return self.reduce(argmin_func, dim=dim, **kwargs) else: raise ValueError( "When dim is a sequence or ..., DataArray.argmin() returns a dict. " @@ -6917,7 +6917,7 @@ def argmin(self, dim=None, axis=None, **kwargs): "Dataset.argmin() with a sequence or ... for dim" ) - def argmax(self, dim=None, axis=None, **kwargs): + def argmax(self, dim=None, **kwargs): """Indices of the maxima of the member variables. If there are multiple maxima, the indices of the first one found will be @@ -6931,9 +6931,6 @@ def argmax(self, dim=None, axis=None, **kwargs): this is deprecated, in future will be an error, since DataArray.argmax will return a dict with indices for all dimensions, which does not make sense for a Dataset. - axis : int, optional - Axis over which to apply `argmax`. Only one of the 'dim' and 'axis' arguments - can be supplied. keep_attrs : bool, optional If True, the attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be @@ -6953,26 +6950,24 @@ def argmax(self, dim=None, axis=None, **kwargs): DataArray.argmax """ - if dim is None and axis is None: + if dim is None: warnings.warn( - "Once the behaviour of DataArray.argmax() and Variable.argmax() with " - "neither dim nor axis argument changes to return a dict of indices of " - "each dimension, for consistency it will be an error to call " - "Dataset.argmax() with no argument, since we don't return a dict of " - "Datasets.", + "Once the behaviour of DataArray.argmin() and Variable.argmin() without " + "dim changes to return a dict of indices of each dimension, for " + "consistency it will be an error to call Dataset.argmin() with no argument," + "since we don't return a dict of Datasets.", DeprecationWarning, stacklevel=2, ) if ( dim is None - or axis is not None or (not isinstance(dim, Sequence) and dim is not ...) or isinstance(dim, str) ): # Return int index if single dimension is passed, and is not part of a # sequence argmax_func = getattr(duck_array_ops, "argmax") - return self.reduce(argmax_func, dim=dim, axis=axis, **kwargs) + return self.reduce(argmax_func, dim=dim, **kwargs) else: raise ValueError( "When dim is a sequence or ..., DataArray.argmin() returns a dict. 
" diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 2118bc8b780..8afd6d68ab4 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -4746,6 +4746,9 @@ def test_reduce(self): assert_equal(data.mean(dim=[]), data) + with pytest.raises(ValueError): + data.mean(axis=0) + def test_reduce_coords(self): # regression test for GH1470 data = xr.Dataset({"a": ("x", [1, 2, 3])}, coords={"b": 4}) @@ -4926,9 +4929,6 @@ def mean_only_one_axis(x, axis): with raises_regex(TypeError, "missing 1 required positional argument: 'axis'"): ds.reduce(mean_only_one_axis) - with raises_regex(TypeError, "non-integer axis"): - ds.reduce(mean_only_one_axis, axis=["x", "y"]) - def test_reduce_no_axis(self): def total_sum(x): return np.sum(x.flatten()) @@ -4938,9 +4938,6 @@ def total_sum(x): actual = ds.reduce(total_sum) assert_identical(expected, actual) - with raises_regex(TypeError, "unexpected keyword argument 'axis'"): - ds.reduce(total_sum, axis=0) - with raises_regex(TypeError, "unexpected keyword argument 'axis'"): ds.reduce(total_sum, dim="x") diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index 76dd830de23..8b7835e5da6 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -3972,35 +3972,6 @@ def test_repr(self, func, variant, dtype): @pytest.mark.parametrize( "func", ( - function("all"), - function("any"), - pytest.param( - function("argmax"), - marks=pytest.mark.skip( - reason="calling np.argmax as a function on xarray objects is not " - "supported" - ), - ), - pytest.param( - function("argmin"), - marks=pytest.mark.skip( - reason="calling np.argmin as a function on xarray objects is not " - "supported" - ), - ), - function("max"), - function("min"), - function("mean"), - pytest.param( - function("median"), - marks=pytest.mark.xfail(reason="median does not work with dataset yet"), - ), - function("sum"), - function("prod"), - function("std"), - function("var"), - function("cumsum"), - function("cumprod"), method("all"), method("any"), method("argmax", dim="x"), From 37522e991a32ee3c0ad1a5ff8afe8e3eb1885550 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Fri, 5 Mar 2021 09:24:14 +0000 Subject: [PATCH 40/46] Support for dask.graph_manipulation (#4965) * Support dask.graph_manipulation * fix * What's New * [test-upstream] --- doc/whats-new.rst | 4 +- xarray/core/dataarray.py | 8 +-- xarray/core/dataset.py | 107 +++++++++++++++++++++----------------- xarray/core/variable.py | 17 ++---- xarray/tests/test_dask.py | 35 +++++++++++++ 5 files changed, 106 insertions(+), 65 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 7c3d57f4fe8..9e59fdc5b35 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,7 +22,9 @@ v0.17.1 (unreleased) New Features ~~~~~~~~~~~~ - +- Support for `dask.graph_manipulation + `_ (requires dask >=2021.3) + By `Guido Imperiale `_ Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e6209b0604b..dd871eb21bc 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -839,15 +839,15 @@ def __dask_scheduler__(self): def __dask_postcompute__(self): func, args = self._to_temp_dataset().__dask_postcompute__() - return self._dask_finalize, (func, args, self.name) + return self._dask_finalize, (self.name, func) + args def __dask_postpersist__(self): func, args = self._to_temp_dataset().__dask_postpersist__() - return self._dask_finalize, (func, args, self.name) + return self._dask_finalize, (self.name, func) + args 
@staticmethod - def _dask_finalize(results, func, args, name): - ds = func(results, *args) + def _dask_finalize(results, name, func, *args, **kwargs): + ds = func(results, *args, **kwargs) variable = ds._variables.pop(_THIS_ARRAY) coords = ds._variables return DataArray(variable, coords, name=name, fastpath=True) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index cbc30dddda9..a4001c747bb 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -863,15 +863,25 @@ def __dask_scheduler__(self): return da.Array.__dask_scheduler__ def __dask_postcompute__(self): + return self._dask_postcompute, () + + def __dask_postpersist__(self): + return self._dask_postpersist, () + + def _dask_postcompute(self, results: "Iterable[Variable]") -> "Dataset": import dask - info = [ - (k, None) + v.__dask_postcompute__() - if dask.is_dask_collection(v) - else (k, v, None, None) - for k, v in self._variables.items() - ] - construct_direct_args = ( + variables = {} + results_iter = iter(results) + + for k, v in self._variables.items(): + if dask.is_dask_collection(v): + rebuild, args = v.__dask_postcompute__() + v = rebuild(next(results_iter), *args) + variables[k] = v + + return Dataset._construct_direct( + variables, self._coord_names, self._dims, self._attrs, @@ -879,18 +889,50 @@ def __dask_postcompute__(self): self._encoding, self._close, ) - return self._dask_postcompute, (info, construct_direct_args) - def __dask_postpersist__(self): - import dask + def _dask_postpersist( + self, dsk: Mapping, *, rename: Mapping[str, str] = None + ) -> "Dataset": + from dask import is_dask_collection + from dask.highlevelgraph import HighLevelGraph + from dask.optimization import cull - info = [ - (k, None, v.__dask_keys__()) + v.__dask_postpersist__() - if dask.is_dask_collection(v) - else (k, v, None, None, None) - for k, v in self._variables.items() - ] - construct_direct_args = ( + variables = {} + + for k, v in self._variables.items(): + if not is_dask_collection(v): + variables[k] = v + continue + + if isinstance(dsk, HighLevelGraph): + # dask >= 2021.3 + # __dask_postpersist__() was called by dask.highlevelgraph. + # Don't use dsk.cull(), as we need to prevent partial layers: + # https://github.com/dask/dask/issues/7137 + layers = v.__dask_layers__() + if rename: + layers = [rename.get(k, k) for k in layers] + dsk2 = dsk.cull_layers(layers) + elif rename: # pragma: nocover + # At the moment of writing, this is only for forward compatibility. + # replace_name_in_key requires dask >= 2021.3. 
+ from dask.base import flatten, replace_name_in_key + + keys = [ + replace_name_in_key(k, rename) for k in flatten(v.__dask_keys__()) + ] + dsk2, _ = cull(dsk, keys) + else: + # __dask_postpersist__() was called by dask.optimize or dask.persist + dsk2, _ = cull(dsk, v.__dask_keys__()) + + rebuild, args = v.__dask_postpersist__() + # rename was added in dask 2021.3 + kwargs = {"rename": rename} if rename else {} + variables[k] = rebuild(dsk2, *args, **kwargs) + + return Dataset._construct_direct( + variables, self._coord_names, self._dims, self._attrs, @@ -898,37 +940,6 @@ def __dask_postpersist__(self): self._encoding, self._close, ) - return self._dask_postpersist, (info, construct_direct_args) - - @staticmethod - def _dask_postcompute(results, info, construct_direct_args): - variables = {} - results_iter = iter(results) - for k, v, rebuild, rebuild_args in info: - if v is None: - variables[k] = rebuild(next(results_iter), *rebuild_args) - else: - variables[k] = v - - final = Dataset._construct_direct(variables, *construct_direct_args) - return final - - @staticmethod - def _dask_postpersist(dsk, info, construct_direct_args): - from dask.optimization import cull - - variables = {} - # postpersist is called in both dask.optimize and dask.persist - # When persisting, we want to filter out unrelated keys for - # each Variable's task graph. - for k, v, dask_keys, rebuild, rebuild_args in info: - if v is None: - dsk2, _ = cull(dsk, dask_keys) - variables[k] = rebuild(dsk2, *rebuild_args) - else: - variables[k] = v - - return Dataset._construct_direct(variables, *construct_direct_args) def compute(self, **kwargs) -> "Dataset": """Manually trigger loading and/or computation of this dataset's data diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 9b70f721689..c59cbf1f3e4 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -531,22 +531,15 @@ def __dask_scheduler__(self): def __dask_postcompute__(self): array_func, array_args = self._data.__dask_postcompute__() - return ( - self._dask_finalize, - (array_func, array_args, self._dims, self._attrs, self._encoding), - ) + return self._dask_finalize, (array_func,) + array_args def __dask_postpersist__(self): array_func, array_args = self._data.__dask_postpersist__() - return ( - self._dask_finalize, - (array_func, array_args, self._dims, self._attrs, self._encoding), - ) + return self._dask_finalize, (array_func,) + array_args - @staticmethod - def _dask_finalize(results, array_func, array_args, dims, attrs, encoding): - data = array_func(results, *array_args) - return Variable(dims, data, attrs=attrs, encoding=encoding) + def _dask_finalize(self, results, array_func, *args, **kwargs): + data = array_func(results, *args, **kwargs) + return Variable(self._dims, data, attrs=self._attrs, encoding=self._encoding) @property def values(self): diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 8220c8b83dc..908a959db45 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1599,3 +1599,38 @@ def test_optimize(): arr = xr.DataArray(a).chunk(5) (arr2,) = dask.optimize(arr) arr2.compute() + + +# The graph_manipulation module is in dask since 2021.2 but it became usable with +# xarray only since 2021.3 +@pytest.mark.skipif(LooseVersion(dask.__version__) <= "2021.02.0", reason="new module") +def test_graph_manipulation(): + """dask.graph_manipulation passes an optional parameter, "rename", to the rebuilder + function returned by __dask_postperist__; also, the dsk passed to the rebuilder 
is + a HighLevelGraph whereas with dask.persist() and dask.optimize() it's a plain dict. + """ + import dask.graph_manipulation as gm + + v = Variable(["x"], [1, 2]).chunk(-1).chunk(1) * 2 + da = DataArray(v) + ds = Dataset({"d1": v[0], "d2": v[1], "d3": ("x", [3, 4])}) + + v2, da2, ds2 = gm.clone(v, da, ds) + + assert_equal(v2, v) + assert_equal(da2, da) + assert_equal(ds2, ds) + + for a, b in ((v, v2), (da, da2), (ds, ds2)): + assert a.__dask_layers__() != b.__dask_layers__() + assert len(a.__dask_layers__()) == len(b.__dask_layers__()) + assert a.__dask_graph__().keys() != b.__dask_graph__().keys() + assert len(a.__dask_graph__()) == len(b.__dask_graph__()) + assert a.__dask_graph__().layers.keys() != b.__dask_graph__().layers.keys() + assert len(a.__dask_graph__().layers) == len(b.__dask_graph__().layers) + + # Above we performed a slice operation; adding the two slices back together creates + # a diamond-shaped dependency graph, which in turn will trigger a collision in layer + # names if we were to use HighLevelGraph.cull() instead of + # HighLevelGraph.cull_layers() in Dataset.__dask_postpersist__(). + assert_equal(ds2.d1 + ds2.d2, ds.d1 + ds.d2) From 229829f18cca4519a75a414bcc301e559462bc9d Mon Sep 17 00:00:00 2001 From: keewis Date: Sat, 6 Mar 2021 23:41:19 +0100 Subject: [PATCH 41/46] add a combine_attrs parameter to Dataset.merge (#4895) * add a combine_attrs kwarg to Dataset.merge * document the new drop_conflicts value * test that combine_attrs is passed through * fix the documented default of combine_attrs * update whats-new.rst * minor fix [skip-ci] * minor fix [skip-ci] * remove a empty line [skip-ci] * fix bad merge [skip-ci] * fix bad merge [skip-ci] * remove the blank line after rst lists [skip-ci] --- doc/whats-new.rst | 2 ++ xarray/core/dataset.py | 15 ++++++++++++++- xarray/core/merge.py | 8 +++++++- xarray/tests/test_merge.py | 31 +++++++++++++++++++++++++++++++ 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9e59fdc5b35..9c16fb74a7b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,8 @@ v0.17.1 (unreleased) New Features ~~~~~~~~~~~~ +- Allow passing ``combine_attrs`` to :py:meth:`Dataset.merge` (:pull:`4895`). + By `Justus Magin `_. - Support for `dask.graph_manipulation `_ (requires dask >=2021.3) By `Guido Imperiale `_ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index a4001c747bb..db45157e7c1 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3917,6 +3917,7 @@ def merge( compat: str = "no_conflicts", join: str = "outer", fill_value: Any = dtypes.NA, + combine_attrs: str = "override", ) -> "Dataset": """Merge the arrays of two datasets into a single dataset. @@ -3945,7 +3946,6 @@ def merge( - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. - join : {"outer", "inner", "left", "right", "exact"}, optional Method for joining ``self`` and ``other`` along shared dimensions: @@ -3957,6 +3957,18 @@ def merge( fill_value : scalar or dict-like, optional Value to use for newly missing values. If a dict-like, maps variable names (including coordinates) to fill values. + combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \ + "override"}, default: "override" + String indicating how to combine attrs of the objects being merged: + + - "drop": empty attrs on returned Dataset. + - "identical": all attrs must be the same on every object. 
+ - "no_conflicts": attrs from all objects are combined, any that have + the same name must also have the same value. + - "drop_conflicts": attrs from all objects are combined, any that have + the same name but different values are dropped. + - "override": skip comparing and copy attrs from the first dataset to + the result. Returns ------- @@ -3976,6 +3988,7 @@ def merge( compat=compat, join=join, fill_value=fill_value, + combine_attrs=combine_attrs, ) return self._replace(**merge_result._asdict()) diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 14beeff3db5..ec95563bda9 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -893,6 +893,7 @@ def dataset_merge_method( compat: str, join: str, fill_value: Any, + combine_attrs: str, ) -> _MergeResult: """Guts of the Dataset.merge method.""" # we are locked into supporting overwrite_vars for the Dataset.merge @@ -922,7 +923,12 @@ def dataset_merge_method( priority_arg = 2 return merge_core( - objs, compat, join, priority_arg=priority_arg, fill_value=fill_value + objs, + compat, + join, + priority_arg=priority_arg, + fill_value=fill_value, + combine_attrs=combine_attrs, ) diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index 27e2b10dcbc..5b84eccca14 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -418,3 +418,34 @@ def test_merge_dataarray(self): da = xr.DataArray(data=1, name="b") assert_identical(ds.merge(da), xr.merge([ds, da])) + + @pytest.mark.parametrize( + ["combine_attrs", "attrs1", "attrs2", "expected_attrs", "expect_error"], + # don't need to test thoroughly + ( + ("drop", {"a": 0, "b": 1, "c": 2}, {"a": 1, "b": 2, "c": 3}, {}, False), + ( + "drop_conflicts", + {"a": 0, "b": 1, "c": 2}, + {"b": 2, "c": 2, "d": 3}, + {"a": 0, "c": 2, "d": 3}, + False, + ), + ("override", {"a": 0, "b": 1}, {"a": 1, "b": 2}, {"a": 0, "b": 1}, False), + ("no_conflicts", {"a": 0, "b": 1}, {"a": 0, "b": 2}, None, True), + ("identical", {"a": 0, "b": 1}, {"a": 0, "b": 2}, None, True), + ), + ) + def test_merge_combine_attrs( + self, combine_attrs, attrs1, attrs2, expected_attrs, expect_error + ): + ds1 = xr.Dataset(attrs=attrs1) + ds2 = xr.Dataset(attrs=attrs2) + + if expect_error: + with pytest.raises(xr.MergeError): + ds1.merge(ds2, combine_attrs=combine_attrs) + else: + actual = ds1.merge(ds2, combine_attrs=combine_attrs) + expected = xr.Dataset(attrs=expected_attrs) + assert_identical(actual, expected) From dfe50b89c53f72bb1fb54fd7ea0c0ebd9a712f8d Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sat, 6 Mar 2021 15:02:01 -0800 Subject: [PATCH 42/46] Adjust tests to use updated pandas syntax for offsets (#4537) * Adjust tests to use updated pandas syntax for offsets * Revert irrelevant change * Retain calcs for `actual` * Update xarray/tests/test_dataarray.py Co-authored-by: keewis * Apply suggestions from code review * fix imports Co-authored-by: keewis Co-authored-by: Keewis --- xarray/tests/test_dataarray.py | 9 +++++++-- xarray/tests/test_dataset.py | 13 ++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index b28a53023ed..259e91083a7 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd import pytest +from pandas.tseries.frequencies import to_offset import xarray as xr from xarray import ( @@ -2990,9 +2991,13 @@ def test_resample(self): actual = 
array.resample(time="24H").reduce(np.mean) assert_identical(expected, actual) + # Our use of `loffset` may change if we align our API with pandas' changes. + # ref https://github.com/pydata/xarray/pull/4537 actual = array.resample(time="24H", loffset="-12H").mean() - expected = DataArray(array.to_series().resample("24H", loffset="-12H").mean()) - assert_identical(expected, actual) + expected_ = array.to_series().resample("24H").mean() + expected_.index += to_offset("-12H") + expected = DataArray.from_series(expected_) + assert_identical(actual, expected) with raises_regex(ValueError, "index must be monotonic"): array[[2, 0, 1]].resample(time="1D") diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 8afd6d68ab4..13cd03acf96 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -9,6 +9,7 @@ import pandas as pd import pytest from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.tseries.frequencies import to_offset import xarray as xr from xarray import ( @@ -3899,11 +3900,13 @@ def test_resample_loffset(self): ) ds.attrs["dsmeta"] = "dsdata" - actual = ds.resample(time="24H", loffset="-12H").mean("time").time - expected = xr.DataArray( - ds.bar.to_series().resample("24H", loffset="-12H").mean() - ).time - assert_identical(expected, actual) + # Our use of `loffset` may change if we align our API with pandas' changes. + # ref https://github.com/pydata/xarray/pull/4537 + actual = ds.resample(time="24H", loffset="-12H").mean().bar + expected_ = ds.bar.to_series().resample("24H").mean() + expected_.index += to_offset("-12H") + expected = DataArray.from_series(expected_) + assert_identical(actual, expected) def test_resample_by_mean_discarding_attrs(self): times = pd.date_range("2000-01-01", freq="6H", periods=10) From 67903ff08ec9ea1b5c259df634dc65444ae97eb6 Mon Sep 17 00:00:00 2001 From: Xianxiang Li Date: Sun, 7 Mar 2021 09:38:19 +0800 Subject: [PATCH 43/46] Update options.py (#5000) Add docstring for set_option('display_max_rows') --- xarray/core/options.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/core/options.py b/xarray/core/options.py index d421b4c4f17..129698903c4 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -85,6 +85,7 @@ class set_options: - ``display_width``: maximum display width for ``repr`` on xarray objects. Default: ``80``. + - ``display_max_rows``: maximum display rows. Default: ``12``. - ``arithmetic_join``: DataArray/Dataset alignment in binary operations. Default: ``'inner'``. - ``file_cache_maxsize``: maximum number of open files to hold in xarray's From b610a3c4317474b4b999c23cf66d1dc55c9b3cd6 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Sun, 7 Mar 2021 08:22:02 -0500 Subject: [PATCH 44/46] Adapt exception handling in CFTimeIndex.__sub__ and __rsub__ (#5006) The latest version of pandas raises an OutOfBoundsTimedelta error instead of an Overflow error. 
--- xarray/coding/cftimeindex.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 948bff1056a..aafd620c7bf 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -59,6 +59,12 @@ REPR_ELLIPSIS_SHOW_ITEMS_FRONT_END = 10 +if LooseVersion(pd.__version__) > LooseVersion("1.2.3"): + OUT_OF_BOUNDS_TIMEDELTA_ERROR = pd.errors.OutOfBoundsTimedelta +else: + OUT_OF_BOUNDS_TIMEDELTA_ERROR = OverflowError + + def named(name, pattern): return "(?P<" + name + ">" + pattern + ")" @@ -562,7 +568,7 @@ def __sub__(self, other): elif _contains_cftime_datetimes(np.array(other)): try: return pd.TimedeltaIndex(np.array(self) - np.array(other)) - except OverflowError: + except OUT_OF_BOUNDS_TIMEDELTA_ERROR: raise ValueError( "The time difference exceeds the range of values " "that can be expressed at the nanosecond resolution." @@ -573,7 +579,7 @@ def __sub__(self, other): def __rsub__(self, other): try: return pd.TimedeltaIndex(other - np.array(self)) - except OverflowError: + except OUT_OF_BOUNDS_TIMEDELTA_ERROR: raise ValueError( "The time difference exceeds the range of values " "that can be expressed at the nanosecond resolution." From 54d581791467325c0d90c0c53b014a37670b8a0a Mon Sep 17 00:00:00 2001 From: keewis Date: Sun, 7 Mar 2021 14:31:37 +0100 Subject: [PATCH 45/46] fix matplotlib errors for single level discrete colormaps (#4256) * duplicate the level if a single level was passed * don't handle scalars it seems these will never be passed at that level * [test-upstream] [skip-ci] --- xarray/plot/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index 5510cf7f219..a83bc28e273 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -60,6 +60,9 @@ def _build_discrete_cmap(cmap, levels, extend, filled): """ import matplotlib as mpl + if len(levels) == 1: + levels = [levels[0], levels[0]] + if not filled: # non-filled contour plots extend = "max" From 7905c514a12fcbcaaeb634cab94733c7cbdd6ff2 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 8 Mar 2021 01:20:38 +0100 Subject: [PATCH 46/46] pin netCDF4=1.5.3 in min-all-deps (#4982) --- ci/requirements/py37-min-all-deps.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/requirements/py37-min-all-deps.yml b/ci/requirements/py37-min-all-deps.yml index c2fd2e18a8a..7d04f431935 100644 --- a/ci/requirements/py37-min-all-deps.yml +++ b/ci/requirements/py37-min-all-deps.yml @@ -25,7 +25,9 @@ dependencies: - lxml=4.5 # Optional dep of pydap - matplotlib-base=3.1 - nc-time-axis=1.2 - - netcdf4=1.5 +# netcdf follows a 1.major.minor[.patch] convention (see https://github.com/Unidata/netcdf4-python/issues/1090) +# bumping the netCDF4 version is currently blocked by #4491 + - netcdf4=1.5.3 - numba=0.48 - numpy=1.17 - pandas=1.0